### Let's look into these comments and see if we can identify certain topics from it? 


In [47]:
import pandas as pd
pd.set_option('display.max_colwidth',1000)
#import numpy as np
import matplotlib.pyplot as plt

#NLP plugins required
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords 
import string


#Topic Modeling 
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation as LDA

#Import visualization tools for LDA models
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [2]:
df = pd.read_csv('nyt_comments.csv', index_col = 0)
df[1:3]

Unnamed: 0,index,commentID,userDisplayName,userLocation,commentBody,recommendations,replyCount,replies,editorsSelection,recommendedFlag,isAnonymous
1,1,107400445,AACNY,New York,"Not so ""peaceful"" in the Bronx last night.",1,0,[],False,0,False
2,2,107400536,karen,florida,I was awaiting a huge bolt of lightening to explode over Trump while he was mishandling the Bible.,2,0,[],False,0,False


Since we're only doing topic modeling, we only just need the text column. In this case it's the commentBody.

In [61]:
stop_words = stopwords.words('english')
def tokenizer(text):
    
    #All characters in this string will be converted to lowercase
    text = text.lower()
    
    #Removing sentence punctuations
    for punctuation_mark in string.punctuation:
        text = text.replace(punctuation_mark,'')
    
    #Creating our list of tokens
    list_of_tokens = text.split(' ')
    #Creating our cleaned tokens list 
    cleaned_tokens = []
    #Let us use a stemmer
    stemmer = SnowballStemmer(language = 'english')
    
    #Removing Stop Words in our list of tokens and any tokens that happens to be empty strings
    for token in list_of_tokens:
        if (not token in stop_words) and (token != ''):
            #Stem tokens
            token_stemmed = stemmer.stem(token)
            #appending our finalized cleaned token
            cleaned_tokens.append(token_stemmed)
    
    return cleaned_tokens

In [75]:
#Instatiate TFIDF model
tfidf = TfidfVectorizer(tokenizer = tokenizer, 
                           min_df = 0.05, 
                           ngram_range = (1,3))

In [76]:
token_matrix = tfidf.fit_transform(df.commentBody)

In [77]:
token_matrix

<1336x91 sparse matrix of type '<class 'numpy.float64'>'
	with 11020 stored elements in Compressed Sparse Row format>

In [78]:
#If you would like to see the matrix in a dataframe form
matrix_df = pd.DataFrame(token_matrix.toarray(),
                        columns = tfidf.get_feature_names())

In [79]:
matrix_df

Unnamed: 0,action,america,american,back,barr,bibl,black,call,church,citi,...,us,use,violenc,vote,want,way,white,white hous,would,year
0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,...,0.230176,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
1,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
2,0.0,0.0,0.0,0.000000,0.000000,0.857992,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
3,0.0,0.0,0.0,0.000000,0.000000,0.588851,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
4,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1331,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
1332,0.0,0.0,0.0,0.191946,0.302002,0.000000,0.0,0.0,0.156675,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.326875,0.389152,0.0,0.0
1333,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
1334,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0


In [101]:
%%time

num_topics = 5
#NMF Topic Modeling
NMF_model = NMF(n_components = num_topics)
NMF_model.fit(token_matrix)

CPU times: user 104 ms, sys: 7.32 ms, total: 111 ms
Wall time: 59.9 ms


NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=5, random_state=None, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [103]:
#I'm looking for the top 25 words for each topic
top_n_words = 20
token_names = tfidf.get_feature_names()

for topic_num, topic in enumerate(NMF_model.components_):
    top_tokens = [token_names[i] for i in topic.argsort()][::-1][:top_n_words] #Returns the indices that would sort an array
    top_n = ', '.join(top_tokens) 
    print(f'Topic {topic_num}: {top_n}')

Topic 0: polic, peopl, us, need, get, american, time, offic, go, like, would, countri, presid, mani, militari, one, right, nation, think, black
Topic 1: bibl, church, photo, photo op, op, presid, front, hold, tear, use, gas, would, tear gas, clear, could, show, last, go, white, order
Topic 2: barr, order, law, right, clear, know, action, state, like, offic, govern, take, tear gas, gas, time, first, citizen, tear, dont, presid
Topic 3: protest, peac, peac protest, right, gas, tear, loot, use, tear gas, violenc, clear, one, street, white, order, riot, hous, could, citizen, forc
Topic 4: trump, america, militari, support, like, call, american, white, want, year, countri, could, come, vote, hous, white hous, make, republican, thing, never


In [107]:
n_words = 25
token_names = tfidf.get_feature_names()
topic_list = []
for topic_num, topic in enumerate(NMF_model.components_):
    top_tokens = [token_names[i] for i in topic.argsort()][::-1][:n_words]
    top_n = ' '.join(top_tokens)
    topic_list.append(f"topic_{'_'.join(top_tokens[:3])}") 
    
    print(f'Topic {topic_num}: {top_n}')

Topic 0: polic peopl us need get american time offic go like would countri presid mani militari one right nation think black see know want way state
Topic 1: bibl church photo photo op op presid front hold tear use gas would tear gas clear could show last go white order even white hous hous man never
Topic 2: barr order law right clear know action state like offic govern take tear gas gas time first citizen tear dont presid peac us one use get
Topic 3: protest peac peac protest right gas tear loot use tear gas violenc clear one street white order riot hous could citizen forc white hous leader make law take
Topic 4: trump america militari support like call american white want year countri could come vote hous white hous make republican thing never take power us man first


In [None]:
terms, sizes = getTermsAndSizes(topics_display_list[0])

num_top_words = 30
fontsize_base = 30 / np.max(sizes) # font size for word with largest share in corpus

num_topics = 1

for t in range(num_topics):
    fig, ax = plt.subplots(1, num_topics, figsize=(6, 12))
    plt.ylim(0, num_top_words + 1.0)
    plt.xticks([])
    plt.yticks([])
    plt.title('Topic #{}'.format(t))

    for i, (word, share) in enumerate(zip(terms, sizes)):
        word = word + " (" + str(share) + ")"
        plt.text(0.3, num_top_words-i-1.0, word, fontsize=fontsize_base*share)

plt.tight_layout()


In [69]:
print(topic_list)

['topic_across_act_action', 'topic_riot_job_anyth', 'topic_reason_burn_dc', 'topic_becom_rememb_looter', 'topic_never_march_matter', 'topic_better_media_caus', 'topic_law_looter_rioter', 'topic_militari_long_month', 'topic_legal_dc_death', 'topic_everyon_protect_set', 'topic_societi_hold_probabl', 'topic_fail_fear_govern', 'topic_must_polit_right', 'topic_may_hold bibl_feder', 'topic_look_race_probabl']


# Using LDA Topic Modeling

In [87]:
%%time
#I want 15 topics generated
num_topics = 10

#instatiate LDA model
lda_model = LDA(n_components = num_topics, n_jobs = -1)
lda_model.fit(token_matrix)

CPU times: user 317 ms, sys: 168 ms, total: 486 ms
Wall time: 13.8 s


LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=10, n_jobs=-1,
                          perp_tol=0.1, random_state=None,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)

In [88]:
#I'm looking for the top 15 words for each topic
top_n_words = 25
token_names = tfidf.get_feature_names()

for topic_num, topic in enumerate(lda_model.components_):
    top_tokens = [token_names[i] for i in topic.argsort()][::-1][:top_n_words] #Returns the indices that would sort an array
    top_n = ', '.join(top_tokens) 
    print(f'Topic {topic_num}: {top_n}')

Topic 0: us, first, govern, year, peopl, trump, time, need, get, right, polic, black, offic, way, america, presid, vote, even, think, want, nation, countri, american, one, make
Topic 1: barr, trump, vote, republican, get, know, go, dont, must, like, order, action, elect, offic, way, one, take, make, peopl, come, mani, right, time, countri, us
Topic 2: protest, peac, peac protest, one, support, peopl, trump, show, last, right, need, good, violenc, take, make, countri, hold, look, clear, would, us, way, time, like, go
Topic 3: america, gas, tear, tear gas, come, trump, use, make, polic, never, presid, action, forc, peac, thing, protest, call, could, even, us, clear, law, would, countri, back
Topic 4: white, hous, man, white hous, trump, say, day, think, see, mani, peopl, black, offic, time, polic, one, power, want, go, make, presid, know, use, happen, would
Topic 5: would, leader, need, presid, like, trump, citi, countri, nation, call, time, much, bibl, peopl, polic, see, american, go, f

In [72]:
print('\033[1m' + 'Hello')

[1mHello


In [105]:
%%time
#Import visualization tools for LDA models
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

# Let us visualize these topics
pyLDAvis.sklearn.prepare(lda_model, token_matrix, tfidf)

CPU times: user 763 ms, sys: 272 ms, total: 1.04 s
Wall time: 20 s


In [106]:
pyLDAvis.sklearn.prepare(NMF_model, token_matrix, tfidf)

  return dists / dists.sum(axis=1)[:, None]


ValidationError: 
 * Not all rows (distributions) in doc_topic_dists sum to 1.