### Let's look into these comments and see if we can identify certain topics from it? 


In [83]:
import pandas as pd
pd.set_option('display.max_colwidth',1000)
import numpy as np
import matplotlib.pyplot as plt

#NLP plugins required
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords 


#Topic Modeling 
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation as LDA

#Import visualization tools for LDA models
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [13]:
df = pd.read_csv('nyt_comments.csv', index_col = 0)
df.head(3)

Unnamed: 0,index,commentID,userDisplayName,userLocation,commentBody,recommendations,replyCount,replies,editorsSelection,recommendedFlag,isAnonymous
0,0,107400406,Peter,New York,"Identity politics brings out the worst in people. Whether it is Amy Cooper, Jussie Smollett, riots where people attacks innocents to prove another point, all it shows is that the accusers are flawed, that victimhood is a very problematic position, fraught with unfair aggression. How quickly victims turn to perpetrators today is probably due to how the position is celebrated in media. No wonder. Victimhood allows you to spit in the face of others without consequences - it is the most entitled and privileged position there is today. What happens to a society when it only celebrates weakness? It is the beginning of the end. The downfall of true morality and individual happiness and trust. Because you cannot trust the person who solves life by accusing, not through overcoming and giving. People of color are becoming incredibly racist in the USA. I have read books about race presented with such bias that they made me imagine that this is how Mein Kampf was written, Hitler’s statement of...",0,0,[],False,0,False
1,1,107400445,AACNY,New York,"Not so ""peaceful"" in the Bronx last night.",1,0,[],False,0,False
2,2,107400536,karen,florida,I was awaiting a huge bolt of lightening to explode over Trump while he was mishandling the Bible.,2,0,[],False,0,False


Since we're only doing topic modeling, we only just need the text column. In this case it's the commentBody.

In [73]:
stop_words = stopwords.words('english')
stop_words.extend([' '])
def tokenizer(text):
    '''
    Simple tokenizer:
    1.) Removes stopwords
    2.) Use Snowball stemmer
    '''
    
    #Split each word up in text, which is a long string of words. 
    #These words are called tokens
    
    list_of_tokens = text.split(' ')
    
    #Let us use a stemmer
    stemmer = SnowballStemmer(language = 'english')
    
    #list of cleaned_tokens
    cleaned_tokens = []

    #Remove Stopwords
    for token in list_of_tokens:
        if (not token in stop_words):
            # Stemm words
            token_stemmed = stemmer.stem(token)
                
            cleaned_tokens.append(token_stemmed)
            
    return cleaned_tokens


In [74]:
#Instatiate TFIDF model
tfidf = TfidfVectorizer(tokenizer = tokenizer, 
                           min_df = 25, 
                           ngram_range = (1,3), 
                           lowercase = True)

In [75]:
matrix = tfidf.fit_transform(df.commentBody)

In [76]:
matrix_df = pd.DataFrame(matrix.toarray(),
                        columns = tfidf.get_feature_names())

In [77]:
%%time
#I want 15 topics generated
num_topics = 15

#NMF Topic Modeling
NMF_model = NMF(n_components = num_topics)
NMF_model.fit(matrix)

CPU times: user 711 ms, sys: 18.1 ms, total: 729 ms
Wall time: 238 ms


NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=15, random_state=None, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [78]:
n_words = 20
token_names = tfidf.get_feature_names()
topic_list = []
for topic_num, topic in enumerate(NMF_model.components_):
    top_tokens = [token_names[i] for i in topic.argsort()][::-1][:n_words]
    top_n = ' '.join(top_tokens)
    topic_list.append(f"topic_{'_'.join(top_tokens[:3])}") 
    
    print(f'Topic {topic_num}: {top_n}')

Topic 0:     trump countri want march back night care last sure senat country. that gop call anyon seem tri take
Topic 1: get go keep vote arrest way november. stop cop civil don't like state away democrat back let place sinc it
Topic 2: trump america call donald want donald trump man support  trump realli long people. well vote refus could demonstr without let president.
Topic 3: barr order trump. attorney law clear trump, attorney general author general action constitut wonder know park justic amend he mr. believ
Topic 4: protest peac peac protest gass right tear could clear - use gas tear gas violenc continu leader loot rubber attack violent thing
Topic 5: photo photo op op tear church use gas tear gas front clear st. protestor last rubber bullet gass demonstr way watch far
Topic 6: militari order use law american us state forc citizen unit general unit state follow law enforc attorney act enforc attack give attorney general
Topic 7: polic offic polic offic black cop georg brutal fl

In [79]:
print(topic_list)

['topic__ _ trump', 'topic_get_go_keep', 'topic_trump_america_call', 'topic_barr_order_trump.', 'topic_protest_peac_peac protest', 'topic_photo_photo op_op', 'topic_militari_order_use', 'topic_polic_offic_polic offic', 'topic_need_donald_support', 'topic_white_white hous_hous', 'topic_would_like_support', 'topic_peopl_live_good', 'topic_presid_state_unit state', 'topic_bibl_hold_church', 'topic_one_time_mani']


# Using LDA Topic Modeling

In [80]:
%%time
#instatiate LDA model
lda = LDA(n_components = num_topics, n_jobs = -1)
lda.fit(matrix)

CPU times: user 270 ms, sys: 20.7 ms, total: 291 ms
Wall time: 1.06 s


LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=15, n_jobs=-1,
                          perp_tol=0.1, random_state=None,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)

In [81]:
n_words = 15
token_names = tfidf.get_feature_names()
topic_list = []
for topic_num, topic in enumerate(lda.components_):
    top_tokens = [token_names[i] for i in topic.argsort()][::-1][:n_words]
    top_n = ' '.join(top_tokens)
    topic_list.append(f"topic_{'_'.join(top_tokens[:3])}") 
    
    print(f'Topic {topic_num}: {top_n}')

Topic 0: america trump law  trump, presid civil care great power order war action democrat make
Topic 1: state -- now. refus georg georg floyd floyd right sinc unit state trump unit way civil barr
Topic 2:  polic protest trump peopl peac right need american offic use first militari like black
Topic 3:  countri white hous hous white rule long trump country. final yesterday around take time fail
Topic 4: barr attorney general wonder attorney general anyon author republican order hear mr. it  park trump
Topic 5: peopl get go know trump  he one realli ever come demonstr mani real hope
Topic 6: still support photo want good trump don't photo op op stand st. it.  get american
Topic 7: gas tear tear gas  presid use protest peac state unit trump unit state forc rubber crowd
Topic 8: bibl church  photo front trump photo op op hold clear could gass hold bibl front church peac
Topic 9: leader nation guard vote nation guard march hope november. everi feder people. trump member protest thought
Topi

In [87]:
%%time
# Let us visualize these topics
pyLDAvis.sklearn.prepare(lda, matrix, tfidf)

CPU times: user 909 ms, sys: 124 ms, total: 1.03 s
Wall time: 4.65 s
