### Let's look into these comments and see if we can identify certain topics from it? 


In [1]:
import pandas as pd
pd.set_option('display.max_colwidth',1000)
#import numpy as np
import matplotlib.pyplot as plt

#NLP plugins required
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords 


#Topic Modeling 
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation as LDA

#Import visualization tools for LDA models
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [2]:
df = pd.read_csv('nyt_comments.csv', index_col = 0)
df[1:3]

Unnamed: 0,index,commentID,userDisplayName,userLocation,commentBody,recommendations,replyCount,replies,editorsSelection,recommendedFlag,isAnonymous
1,1,107400445,AACNY,New York,"Not so ""peaceful"" in the Bronx last night.",1,0,[],False,0,False
2,2,107400536,karen,florida,I was awaiting a huge bolt of lightening to explode over Trump while he was mishandling the Bible.,2,0,[],False,0,False


Since we're only doing topic modeling, we only just need the text column. In this case it's the commentBody.

In [3]:
stop_words = stopwords.words('english')
def tokenizer(text):
    '''
    Simple tokenizer:
    1.) Removes stopwords
    2.) Use Snowball stemmer
    '''
    
    #Split each word up in text, which is a long string of words. 
    #These words are called tokens
    
    list_of_tokens = text.split(' ')
    
    #Let us use a stemmer
    stemmer = SnowballStemmer(language = 'english')
    
    #list of cleaned_tokens
    cleaned_tokens = []

    #Remove Stopwords
    for token in list_of_tokens:
        if (not token in stop_words):
            # Stemm words
            token_stemmed = stemmer.stem(token)
                
            cleaned_tokens.append(token_stemmed)
            
    return cleaned_tokens

In [4]:
#Instatiate TFIDF model
tfidf = TfidfVectorizer(tokenizer = tokenizer, 
                           min_df = 25, 
                           ngram_range = (1,3), 
                           lowercase = True)

In [10]:
token_matrix = tfidf.fit_transform(df.commentBody)

In [6]:
token_matrix

<1336x300 sparse matrix of type '<class 'numpy.float64'>'
	with 17406 stored elements in Compressed Sparse Row format>

In [7]:
#If you would like to see the matrix in a dataframe form
matrix_df = pd.DataFrame(token_matrix.toarray(),
                        columns = tfidf.get_feature_names())

In [11]:
matrix_df

Unnamed: 0,Unnamed: 1,Unnamed: 2,trump,-,--,across,act,action,actual,administr,...,white hous,without,wonder,word,work,world,would,year,yesterday,yet
0,0.000000,0.0,0.0,0.161496,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.176041,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1331,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1332,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.191121,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1333,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1334,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
%%time
#I want 15 topics generated
num_topics = 15

#NMF Topic Modeling
NMF_model = NMF(n_components = num_topics)
NMF_model.fit(matrix)

NameError: name 'matrix' is not defined

In [9]:
n_words = 15
token_names = tfidf.get_feature_names()
topic_list = []
for topic_num, topic in enumerate(NMF_model.components_):
    top_tokens = [token_names[i] for i in topic.argsort()][::-1][:n_words]
    top_n = ' '.join(top_tokens)
    topic_list.append(f"topic_{'_'.join(top_tokens[:3])}") 
    
    print(f'Topic {topic_num}: {top_n}')

AttributeError: 'NMF' object has no attribute 'components_'

In [None]:
print(topic_list)

# Using LDA Topic Modeling

In [12]:
%%time
#I want 15 topics generated
num_topics = 15

#instatiate LDA model
lda_model = LDA(n_components = num_topics, n_jobs = -1)
lda_model.fit(token_matrix)

CPU times: user 285 ms, sys: 138 ms, total: 423 ms
Wall time: 3.07 s


LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=15, n_jobs=-1,
                          perp_tol=0.1, random_state=None,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)

In [33]:
#
top_n_words = 15
token_names = tfidf.get_feature_names()
topic_list = []
for topic_num, topic in enumerate(lda_model.components_):
    top_tokens = [token_names[i] for i in topic.argsort()][::-1][:top_n_words] #Returns the indices that would sort an array
    top_n = ' '.join(top_tokens) 
    topic_list.append(f"topic_{'_'.join(top_tokens)}") 
    
    print(f'Topic {topic_num}: {top_n}')

Topic 0: militari trump know say  it time protect get polic don't citi american never realli
Topic 1: help much feder offic law enforc enforc law anyon civil american us outrag get arrest support
Topic 2:  peopl trump polic need black vote year would like donald get mani american countri
Topic 3:    care polic men cop last night georg - crimin without destroy follow away
Topic 4: hear need burn ever polic turn left support action them. mani white fear govern like
Topic 5: go must let see step happen i'm point way everyon report read white church time
Topic 6: loot well now. last elect republican old nation tri anoth looter around november. vote 
Topic 7:  trump protest polic peac use peopl white right american militari presid need - order
Topic 8: peopl follow free human good everyon busi go one protest offic respons think president. take
Topic 9: one hope across offic polic still black can't end us get polic offic may country. least
Topic 10: barr trump order wonder  that trump. got t

In [None]:
%%time
# Let us visualize these topics
pyLDAvis.sklearn.prepare(lda, matrix, tfidf)