### Let's look into these comments and see if we can identify certain topics from it? 


In [14]:
import pandas as pd
pd.set_option('display.max_colwidth',1000)
import numpy as np
import matplotlib.pyplot as plt

#NLP plugins required
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords 


#Topic Modeling 
from sklearn.decomposition import NMF


In [13]:
df = pd.read_csv('nyt_comments.csv', index_col = 0)
df.head(3)

Unnamed: 0,index,commentID,userDisplayName,userLocation,commentBody,recommendations,replyCount,replies,editorsSelection,recommendedFlag,isAnonymous
0,0,107400406,Peter,New York,"Identity politics brings out the worst in people. Whether it is Amy Cooper, Jussie Smollett, riots where people attacks innocents to prove another point, all it shows is that the accusers are flawed, that victimhood is a very problematic position, fraught with unfair aggression. How quickly victims turn to perpetrators today is probably due to how the position is celebrated in media. No wonder. Victimhood allows you to spit in the face of others without consequences - it is the most entitled and privileged position there is today. What happens to a society when it only celebrates weakness? It is the beginning of the end. The downfall of true morality and individual happiness and trust. Because you cannot trust the person who solves life by accusing, not through overcoming and giving. People of color are becoming incredibly racist in the USA. I have read books about race presented with such bias that they made me imagine that this is how Mein Kampf was written, Hitler’s statement of...",0,0,[],False,0,False
1,1,107400445,AACNY,New York,"Not so ""peaceful"" in the Bronx last night.",1,0,[],False,0,False
2,2,107400536,karen,florida,I was awaiting a huge bolt of lightening to explode over Trump while he was mishandling the Bible.,2,0,[],False,0,False


Since we're only doing topic modeling, we only just need the text column. In this case it's the commentBody.

In [15]:
stop_words = stopwords.words('english')
def tokenizer(text):
    '''
    Simple tokenizer:
    1.) Removes stopwords
    2.) Use Snowball stemmer
    '''
    
    #Split each word up in text, which is a long string of words. 
    #These words are called tokens
    
    list_of_tokens = text.split(' ')
    
    #Let us use a stemmer
    stemmer = SnowballStemmer()
    
    #list of cleaned_tokens
    cleaned_tokens = []

    #Remove Stopwords
    for token in list_of_tokens:
        if (not token in stop_words):
            # Stemm words
            token_stemmed = stemmer.stem(token)
                
            cleaned_tokens.append(token_stemmed)
            
    return cleaned_tokens


In [16]:
#Instatiate TFIDF model
tfidf = TfidfVectorizer(tokenizer = tokenizer, 
                           min_df = 25, 
                           ngram_range = (1,3), 
                           lowercase = True)

In [21]:
matrix = tfidf.fit_transform(df.commentBody)

In [26]:
matrix_df = pd.DataFrame(matrix.toarray(),
                        columns = tfidf.get_feature_names())

In [33]:
#Topic Modeling

NMF_model = NMF(n_components = 20)
NMF_model.fit(matrix)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=20, random_state=None, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [34]:
n_words = 20
token_names = tfidf.get_feature_names()
topic_list = []
for topic_num, topic in enumerate(NMF_model.components_):
    top_tokens = [token_names[i] for i in topic.argsort()][::-1][:n_words]
    top_n = ' '.join(top_tokens)
    topic_list.append(f"topic_{'_'.join(top_tokens[:3])}") 
    
    print(f'Topic {topic_num}: {top_n}')

Topic 0:     trump countri want back march night care sure last senat that country. gop said take seem tri anyon
Topic 1: america great - come much republican carri civil know he power novemb world democraci help countri think war speak said
Topic 2: trump donald want donald trump support  trump people. realli well vote refus long demonstr let take could bring without year actual
Topic 3: barr trump. trump, author action know wonder clear attorney justic he mr. attorney general park constitut general believ tri two peac
Topic 4: protest peac peac protest gass - could violenc continu clear make looter violent leader loot cover anoth stand lead attack way
Topic 5: bibl hold church hold bibl front front church pose could show burn go look i'm even read gass stand yesterday anyon noth
Topic 6: militari use american us forc citizen unit state turn threaten attack - act shoot unit state power war come govern follow
Topic 7: polic offic polic offic brutal citi violenc cop protect violent forc

In [31]:
print(topic_list)

['topic__ _ trump', 'topic_peopl_get_go', 'topic_trump_america_call', 'topic_barr_order_trump.', 'topic_protest_peac_peac protest', 'topic_bibl_photo_church', 'topic_militari_order_presid', 'topic_polic_offic_polic offic', 'topic_need_donald_start', 'topic_white_gas_tear']
