# Unsupervised Learning - Creating a WordCloud.

Word2Vec, we will apply it to the articles headlines to create a wordcloud.

The PyTagCloud library has been used (https://github.com/atizo/PyTagCloud) and updated a few lines of code to Python 3.

In [36]:
reset -fs

In [37]:
import os
import webbrowser
from pytagcloud import create_tag_image, make_tags
from pytagcloud.lang.counter import get_tag_counts
import matplotlib.pyplot as plt
%matplotlib inline
import string
import gensim
import re
import numpy as np
import pandas as pd
import seaborn as sns
import nltk
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.cluster import KMeans
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import label_binarize, MultiLabelBinarizer, binarize
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, roc_curve, mean_squared_error, r2_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition.online_lda import LatentDirichletAllocation
sns.set_style('white')

In [38]:
def clean_up_article(article):
    """
    Function to remove HTML characters </br> and replace n\\'t with not, it\\s with it is, and \\'s with s.
    """
    article = re.sub('</br>','', article)
    article = re.sub("(n\\'t)","not", article)
    article = re.sub("(it\\'s)","it is", article)
    article = re.sub("(\\'s)","s", article)
    article = re.sub("(\\t)"," ", article)
    return article

In [39]:
def process_dataframe_text(article):
    """
    Function that takes text from a dataframe and 1st removes punctuation, 2nd removes stopwords and converts
    into lower case, and finally using PorterStemmer, stems the words, returning a list of the processed text in lowercase.
    """
    stemmer = nltk.stem.PorterStemmer()
    
    # Removing punctuation by checking every character in the text passed to the function.
    remove_punctuation = [char for char in article if char not in string.punctuation]

    # Once punctuation has been removed, we join them again to form a string.
    remove_punctuation = ''.join(remove_punctuation)
    
    # Removing stopwords and converting to lower case.
    remove_stop = [word.lower() for word in remove_punctuation.split() if word.lower() not in nltk.corpus.stopwords.words('english')]
    
    # Stemming words using PorterStemmer.
    return [stemmer.stem(word) for word in remove_stop]
    
    # NOTE:
    # Below would be the return function without stemming.
    #return [word.lower() for word in remove_punctuation.split() if word.lower() not in nltk.corpus.stopwords.words('english')]

In [40]:
def to_string(text):
    """
    Function that will convert the items on a list to a string.
    """
    return ' '.join(text)

### Loading dataset into a pandas dataframe.

In [41]:
economic_df = pd.read_csv('Full-Economic-News-DFE-839861.csv', encoding='utf-8')

In [42]:
new_column_names = ['unit_id', 'golden', 'unit_state', 'trusted_judgments', 'last_judgment_at','positivity', 'positivity_confidence', 'relevance', 'relevance_confidence', 'article_id', 'article_date', 'article_headline', 'positivity_gold', 'relevance_gold', 'article_text']

#### Text pre-processing.

In [43]:
economic_df.columns = new_column_names

In [44]:
economic_df['article_headline'] = economic_df['article_headline'].apply(clean_up_article)

In [45]:
economic_df['article_text'] = economic_df['article_text'].apply(clean_up_article)

In [46]:
economic_df['headline_normalized'] = economic_df['article_headline'].apply(process_dataframe_text)

In [47]:
economic_df['article_normalized'] = economic_df['article_text'].apply(process_dataframe_text)

In [48]:
economic_df['headline_string'] = economic_df['headline_normalized'].apply(to_string)

In [49]:
economic_df['article_string'] = economic_df['article_normalized'].apply(to_string)

>https://rare-technologies.com/word2vec-tutorial/

In [73]:
model = gensim.models.Word2Vec(economic_df['article_normalized'])

In [74]:
model.save('mymodel')
new_model = gensim.models.Word2Vec.load('mymodel')

In [75]:
new_model.train(economic_df['article_headline'])

905715

In [76]:
model.similarity('greenspan', 'stock')

-0.06011642513662676

In [107]:
model.similarity('greenspan', 'fed')

0.71049436234238539

In [77]:
model.similarity('nasdaq', 'greenspan')

-0.11178458422568185

In [78]:
vocab = list(model.vocab.keys())

In [79]:
vocab[:6]

['passeng', 'swift', '1316', 'buffett', 'cbot', 'corpth']

In [80]:
'greenspan' in model.vocab

True

In [81]:
model['greenspan']

array([-0.7585234 , -0.52212447,  1.16526139,  0.58837956, -2.1601944 ,
        0.08093447, -0.10149591, -0.25714204,  0.16621384,  0.17069639,
       -1.03715515, -0.24143887,  0.87928903, -0.06602565, -0.79553014,
        1.19008636, -0.37376732,  0.06707131,  0.59703028, -2.4613874 ,
        0.70434433,  1.01207721, -0.28649503, -0.35165769, -0.37745035,
        0.39194298, -0.04731251,  0.32446748,  0.03981505,  0.96417546,
       -0.76356763, -0.81166303,  1.02501583, -0.24210557,  0.86496794,
       -0.29363218,  0.88314688, -0.54662448,  0.67107481,  0.06698391,
        1.71587193,  0.66748732, -0.3219853 ,  0.87728035, -1.56003535,
        0.24265859,  1.10236251,  1.45430446,  0.00788433, -0.23296517,
        0.77996922,  1.17424107, -0.50324214, -1.27326918,  2.77410579,
        0.12999743,  0.92814118,  0.69201434,  1.44236314,  1.85400093,
       -0.03820363,  0.5722605 , -1.50694728, -0.1275685 ,  2.30743432,
       -0.14839871, -0.79961169,  0.55645603, -0.6351648 ,  1.27

In [82]:
# Set "k" (num_clusters) to be 1/5th of the vocabulary size, or an
# average of 5 words per cluster
word_vectors = model.syn0
num_clusters = int(word_vectors.shape[0] / 5)

# Initalize a k-means object and use it to extract centroids
kmeans_clustering = KMeans( n_clusters = num_clusters )
idx = kmeans_clustering.fit_predict( word_vectors )

In [83]:
word_centroid_map = dict(zip( model.index2word, idx ))

In [95]:
all_words

[['expansionth',
  'yearsthat',
  'longexpect',
  'wrath',
  'midpoint',
  'minuscul',
  'lick',
  'overheat',
  'thinner',
  'thornton',
  'ent',
  '“great',
  '1997the',
  'october’',
  'percentit',
  'montha',
  '1982the',
  '2013the',
  'quarterpercentagepoint',
  'turnabout',
  'skirmish',
  'yearsom',
  'd2',
  'tempo',
  'singlehandedli',
  'customarili',
  'moneyfund',
  'beforeth',
  'preval',
  'bewar',
  'crawl',
  'shrivel',
  '“more',
  'measureth',
  'weekit',
  'lockedin',
  'economiststh',
  'jobth',
  '16month',
  'monday”',
  'dateth',
  'solvent',
  'disregard',
  'underfund',
  'eaten',
  'in1',
  'conscious',
  'unseen',
  'fedfund',
  'mid1970',
  'ate',
  'horribl',
  'endth',
  'ee',
  'incept',
  'americans’',
  'breakout',
  '271',
  'incometh',
  'foray',
  'halfpercentagepoint',
  'laborforc',
  'lockin',
  'ratesit',
  'unadjust',
  'cometh',
  'shorterterm',
  'yearand',
  'disinfl',
  'economists’',
  'salestax',
  'citywid',
  'foresaw',
  'nearest',
  '

In [100]:
# For the first 10 clusters
all_words = []
word = []
for cluster in range(0,10):
    #
    # Print the cluster number  
    print("\nCluster %d" % cluster)
    #
    # Find all of the words for that cluster number, and print them out
    words = []
    
    text = ''
    for i in range(0,len(word_centroid_map.values())):
        if( list(word_centroid_map.values())[i] == cluster ):
            words.append(list(word_centroid_map.keys())[i])
            word.append(list(word_centroid_map.keys())[i])
            all_words.append(words)
            
            text = text + ' '.join(word)
    print(words)
    


Cluster 0
['expansionth', 'yearsthat', 'longexpect', 'wrath', 'midpoint', 'minuscul', 'lick', 'overheat', 'thinner', 'thornton', 'ent', '“great', '1997the', 'october’', 'percentit', 'montha', '1982the', '2013the', 'quarterpercentagepoint', 'turnabout', 'skirmish', 'yearsom', 'd2', 'tempo', 'singlehandedli', 'customarili', 'moneyfund', 'beforeth', 'preval', 'bewar', 'crawl', 'shrivel', '“more', 'measureth', 'weekit', 'lockedin', 'economiststh', 'jobth', '16month', 'monday”', 'dateth', 'solvent', 'disregard', 'underfund', 'eaten', 'in1', 'conscious', 'unseen', 'fedfund', 'mid1970', 'ate', 'horribl', 'endth', 'ee', 'incept', 'americans’', 'breakout', '271', 'incometh', 'foray', 'halfpercentagepoint', 'laborforc', 'lockin', 'ratesit', 'unadjust', 'cometh', 'shorterterm', 'yearand', 'disinfl', 'economists’', 'salestax', 'citywid', 'foresaw', 'nearest', 'inconceiv', 'shallow', 'fourtenth', 'nil', 'barth', 'unif', 'claw', 'lightvehicl', 'repriev', '1988the', 'gogo', 'months”', 'imped', 'offt

#### Creating WordCloud for headlines.

In [101]:
CLOUD_TEXT = text
tags = make_tags(get_tag_counts(CLOUD_TEXT), maxsize=60)

expansionth
yearsthat
longexpect
wrath
midpoint
minuscul
lick
overheat
thinner
thornton
ent
great
1997the
october
percentit
montha
1982the
2013the
quarterpercentagepoint
turnabout
skirmish
yearsom
d2
tempo
singlehandedli
customarili
moneyfund
beforeth
preval
bewar
crawl
shrivel
more
measureth
weekit
lockedin
economiststh
jobth
16month
monday
dateth
solvent
disregard
underfund
eaten
in1
conscious
unseen
fedfund
mid1970
ate
horribl
endth
ee
incept
americans
breakout
271
incometh
foray
halfpercentagepoint
laborforc
lockin
ratesit
unadjust
cometh
shorterterm
yearand
disinfl
economists
salestax
citywid
foresaw
nearest
inconceiv
shallow
fourtenth
nil
barth
unif
claw
lightvehicl
repriev
1988the
gogo
months
imped
offth
breakeven
stolen
belowinvestmentgrad
2019
yearthat
showedth
eclect
hy
fourfold
yearif
currentdollar
varianc
teaser
consumers
addon
highinterest
itthi
bias
1
twenti
timea
blizzard
230000
afterinfl
yardstick
monthsin
unmistak
omi
invari
deficitsin
mediumterm
setup
imperil
unseason

In [102]:
filename = 'word2vec_cloud.png'

In [103]:
create_tag_image(tags, filename, size=(900, 600), fontname='Lobster')

In [104]:
chrome_path = 'open -a /Applications/Google\ Chrome.app %s'

In [105]:
webbrowser.get(chrome_path).open('file://' + os.path.realpath(filename))

True

http://scikit-learn.org/stable/modules/model_persistence.html

In [108]:
from sklearn import svm
from sklearn import datasets
clf = svm.SVC()
iris = datasets.load_iris()
X, y = iris.data, iris.target
clf.fit(X, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [109]:
from sklearn.externals import joblib
joblib.dump(clf, 'filename.pkl') 

['filename.pkl']

In [110]:
clf_2 = joblib.load('filename.pkl') 

In [111]:
clf_2.predict(X)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])