# Unsupervised Learning - Creating a WordCloud.

Using LDA (LatentDirichletAllocation), we will apply it to the articles headlines to create a wordcloud.

The PyTagCloud library has been used (https://github.com/atizo/PyTagCloud) and updated a few lines of code to Python 3.

In [1]:
reset -fs

In [2]:
import os
import webbrowser
from pytagcloud import create_tag_image, make_tags
from pytagcloud.lang.counter import get_tag_counts
import matplotlib.pyplot as plt
%matplotlib inline
import string
import re
import numpy as np
import pandas as pd
import seaborn as sns
import nltk
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import label_binarize, MultiLabelBinarizer, binarize
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, roc_curve, mean_squared_error, r2_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition.online_lda import LatentDirichletAllocation
sns.set_style('white')

In [3]:
def clean_up_article(article):
    """
    Function to remove HTML characters </br> and replace n\\'t with not, it\\s with it is, and \\'s with s.
    """
    article = re.sub('</br>','', article)
    article = re.sub("(n\\'t)","not", article)
    article = re.sub("(it\\'s)","it is", article)
    article = re.sub("(\\'s)","s", article)
    article = re.sub("(\\t)"," ", article)
    return article

In [4]:
def process_dataframe_text(article):
    """
    Function that takes text from a dataframe and 1st removes punctuation, 2nd removes stopwords and converts
    into lower case, and finally using PorterStemmer, stems the words, returning a list of the processed text in lowercase.
    """
    stemmer = nltk.stem.PorterStemmer()
    
    # Removing punctuation by checking every character in the text passed to the function.
    remove_punctuation = [char for char in article if char not in string.punctuation]

    # Once punctuation has been removed, we join them again to form a string.
    remove_punctuation = ''.join(remove_punctuation)
    
    # Removing stopwords and converting to lower case.
    remove_stop = [word.lower() for word in remove_punctuation.split() if word.lower() not in nltk.corpus.stopwords.words('english')]
    
    # Stemming words using PorterStemmer.
    return [stemmer.stem(word) for word in remove_stop]
    
    # NOTE:
    # Below would be the return function without stemming.
    #return [word.lower() for word in remove_punctuation.split() if word.lower() not in nltk.corpus.stopwords.words('english')]

In [5]:
def to_string(text):
    """
    Function that will convert the items on a list to a string.
    """
    return ' '.join(text)

### Loading dataset into a pandas dataframe.

In [6]:
economic_df = pd.read_csv('Full-Economic-News-DFE-839861.csv', encoding='utf-8')

In [7]:
new_column_names = ['unit_id', 'golden', 'unit_state', 'trusted_judgments', 'last_judgment_at','positivity', 'positivity_confidence', 'relevance', 'relevance_confidence', 'article_id', 'article_date', 'article_headline', 'positivity_gold', 'relevance_gold', 'article_text']

#### Text pre-processing.

In [8]:
economic_df.columns = new_column_names

In [9]:
economic_df['article_headline'] = economic_df['article_headline'].apply(clean_up_article)

In [10]:
economic_df['article_text'] = economic_df['article_text'].apply(clean_up_article)

In [11]:
economic_df['headline_normalized'] = economic_df['article_headline'].apply(process_dataframe_text)

In [12]:
economic_df['article_normalized'] = economic_df['article_text'].apply(process_dataframe_text)

In [13]:
economic_df['headline_string'] = economic_df['headline_normalized'].apply(to_string)

In [14]:
economic_df['article_string'] = economic_df['article_normalized'].apply(to_string)

In [15]:
economic_df.head()

Unnamed: 0,unit_id,golden,unit_state,trusted_judgments,last_judgment_at,positivity,positivity_confidence,relevance,relevance_confidence,article_id,article_date,article_headline,positivity_gold,relevance_gold,article_text,headline_normalized,article_normalized,headline_string,article_string
0,842613455,False,finalized,3,12/5/2015 17:48:27,3.0,0.64,yes,0.64,wsj_398217788,1991-08-14,Yields on CDs Fell in the Latest Week,,,NEW YORK -- Yields on most certificates of dep...,"[yield, cd, fell, latest, week]","[new, york, yield, certif, deposit, offer, maj...",yield cd fell latest week,new york yield certif deposit offer major bank...
1,842613456,False,finalized,3,12/5/2015 16:54:25,,,no,1.0,wsj_399019502,2007-08-21,The Morning Brief: White House Seeks to Limit ...,,,The Wall Street Journal OnlineThe Morning Brie...,"[morn, brief, white, hous, seek, limit, child,...","[wall, street, journal, onlineth, morn, brief,...",morn brief white hous seek limit child insur p...,wall street journal onlineth morn brief look d...
2,842613457,False,finalized,3,12/5/2015 01:59:03,,,no,1.0,wsj_398284048,1991-11-14,Banking Bill Negotiators Set Compromise --- Pl...,,,WASHINGTON -- In an effort to achieve banking ...,"[bank, bill, negoti, set, compromis, plan, wid...","[washington, effort, achiev, bank, reform, sen...",bank bill negoti set compromis plan widen bank...,washington effort achiev bank reform senat neg...
3,842613458,False,finalized,3,12/5/2015 02:19:39,,0.0,no,0.675,wsj_397959018,1986-06-16,Managers Journal: Sniffing Out Drug Abusers Is...,,,The statistics on the enormous costs of employ...,"[manag, journal, snif, drug, abus, quick, fix]","[statist, enorm, cost, employe, drug, abus, we...",manag journal snif drug abus quick fix,statist enorm cost employe drug abus well know...
4,842613459,False,finalized,3,12/5/2015 17:48:27,3.0,0.3257,yes,0.64,wsj_398838054,2002-10-04,Currency Trading: Dollar Remains in Tight Rang...,,,NEW YORK -- Indecision marked the dollars tone...,"[currenc, trade, dollar, remain, tight, rang, ...","[new, york, indecis, mark, dollar, tone, trade...",currenc trade dollar remain tight rang amid wa...,new york indecis mark dollar tone trader paus ...


#### Using TfidfVectorizer.

In [16]:
vectorized = TfidfVectorizer(max_features=1000, max_df=0.95, min_df=2)

In [17]:
headline_tf = vectorized.fit_transform(economic_df['headline_string'])

In [18]:
lda = LatentDirichletAllocation(n_topics=20,
                                max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=42)

In [19]:
lda.fit(headline_tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_jobs=1, n_topics=20, perp_tol=0.1, random_state=42,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [20]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [21]:
def get_top_words(model, feature_names, n_top_words):
    text = ''
    for topic_idx, topic in enumerate(model.components_):
        text = text + " ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]])
    return text

In [22]:
print("Headline Topics in LDA model:\n")
tf_feature_names = vectorized.get_feature_names()
n_top_words = 20
print_top_words(lda, tf_feature_names, n_top_words)

Headline Topics in LDA model:

Topic #0:
plan hous secur time program fed rail seek brief would fight gop buy offer key interest rate studi chairman bank
Topic #1:
world nation china loan bank urg donot hill leader law japanes advis let brazil student land stabil us 30 septemb
Topic #2:
decemb saturday 1978 1980 toward age deriv note attack treasuri light 12 look april background post special row wait reagan
Topic #3:
way boom control america govern face capit counti target option question bid quiet beat buyer weekend problem best take mean
Topic #4:
ahead wall street tape report feder suppli roundup money live special journal challeng monday like week middl pain holiday local
Topic #5:
save region bet chief ford must cpi sustain name lift bond speech industri risk possibl right race latin greenspan lender
Topic #6:
mortgag home fall rate percent yet revers away cours reit increas profit boost price must ralli far dollar euro weigh
Topic #7:
stock dow market post gain price digest rise

#### Creating WordCloud for headlines.

In [23]:
tf_feature_names = vectorized.get_feature_names()
n_top_words = 20
text = get_top_words(lda, tf_feature_names, n_top_words)

In [24]:
CLOUD_TEXT = text
tags = make_tags(get_tag_counts(CLOUD_TEXT), maxsize=60)

plan
hous
secur
time
program
fed
rail
seek
brief
would
fight
gop
buy
offer
key
interest
rate
studi
chairman
bankworld
nation
china
loan
bank
urg
donot
hill
leader
law
japanes
advis
let
brazil
student
land
stabil
us
30
septembdecemb
saturday
1978
1980
toward
age
deriv
note
attack
treasuri
light
12
look
april
background
post
special
row
wait
reaganway
boom
control
america
govern
face
capit
counti
target
option
question
bid
quiet
beat
buyer
weekend
problem
best
take
meanahead
wall
street
tape
report
feder
suppli
roundup
money
live
special
journal
challeng
monday
like
week
middl
pain
holiday
localsave
region
bet
chief
ford
must
cpi
sustain
name
lift
bond
speech
industri
risk
possibl
right
race
latin
greenspan
lendermortgag
home
fall
rate
percent
yet
revers
away
cours
reit
increas
profit
boost
price
must
ralli
far
dollar
euro
weighstock
dow
market
post
gain
price
digest
rise
ralli
bond
trade
profit
declin
drop
sale
high
rate
point
industri
falljob
offic
invest
area
reserv
incom
financi
fore

In [25]:
filename = 'headlines_cloud.png'

In [26]:
create_tag_image(tags, filename, size=(900, 600), fontname='Lobster')

In [27]:
chrome_path = 'open -a /Applications/Google\ Chrome.app %s'

In [28]:
webbrowser.get(chrome_path).open('file://' + os.path.realpath(filename))

True

#### Creating WordCloud for articles.

In [29]:
article_tf = vectorized.fit_transform(economic_df['article_string'])

In [30]:
lda.fit(article_tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_jobs=1, n_topics=20, perp_tol=0.1, random_state=42,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [31]:
tf_feature_names = vectorized.get_feature_names()
n_top_words = 20
text = get_top_words(lda, tf_feature_names, n_top_words)

In [32]:
CLOUD_TEXT = text
tags = make_tags(get_tag_counts(CLOUD_TEXT), maxsize=60)

said
would
year
tax
bank
state
presid
feder
econom
hous
budget
percent
govern
one
nation
new
rate
compani
billion
usoption
stock
index
gold
year
compani
citi
global
canadian
nasdaq
fund
investor
dec
correct
chang
market
stockmarket
name
close
sayhome
treasuri
consum
said
may
yield
hous
indic
although
move
peopl
price
govern
econom
stock
commerc
often
name
continu
bondmr
provid
contract
univers
econom
quarter
cost
clinton
design
school
fourth
care
would
peopl
health
yield
one
student
billion
pricejapan
survey
quarter
compani
pay
japanes
buyer
us
recoveri
alway
trade
week
govern
save
deposit
consum
servic
germani
would
bankbank
trader
central
dollar
effort
us
interest
germani
currenc
yesterday
rais
struggl
quickli
eight
evid
rate
attempt
billion
caus
consumtrillion
health
100
studi
spend
18
yield
issu
expect
journal
jump
basi
grew
bond
center
reach
publish
privat
avoid
interestmillion
share
revenu
cent
net
compani
inc
earn
nasdaq
said
result
analyst
corp
billion
profit
oper
incom
rose
te

In [33]:
filename = 'articles_cloud.png'

In [34]:
create_tag_image(tags, filename, size=(900, 600), fontname='Lobster')

In [35]:
chrome_path = 'open -a /Applications/Google\ Chrome.app %s'

In [36]:
webbrowser.get(chrome_path).open('file://' + os.path.realpath(filename))

True