# Online User Comments Analysis - Topic Modeling

In [50]:
import pandas as pd

df1 = pd.read_csv('data/patriot45_Alex2.0.csv')[:499]
df2 = pd.read_csv('data/HT.csv',encoding='latin1')[:151]
df3 = pd.read_csv('data/DrVinnie.csv',encoding='latin1')[:151]
df4 = pd.read_csv('data/FeelLucky.csv',encoding='latin1')[:151]
df5 = pd.read_csv('data/Acaciavet.csv',encoding='latin1')[:151]
df6 = pd.read_csv('data/mike_radant.csv',encoding='latin1')[:151]
df7 = pd.read_csv('data/TheGrandAdmiral.csv',encoding='latin1')[:151]
df8 = pd.read_csv('data/disrespekt.csv',encoding='latin1')[:151]
df9 = pd.read_csv('data/Pouncekitty.csv',encoding='latin1')[:151]
df10 = pd.read_csv('data/Reese_Witheredpoon.csv',encoding='latin1')[:151]

df = pd.concat([df1,df2,df3,df4,df5,
               df6,df7,df8,df9,df10])

## Text Data Preprocessing

In [51]:
import numpy as np
import pandas as pd
import texthero as hero
from texthero import preprocessing
import plotly.express
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# Segment dataset based on hate speech labels: hate/offensive/ordinary
hateDF = df.loc[df['Hate?'] == 1.0]
offensiveDF = df.loc[df['Hate?'] == 2.0]
ordinaryDF = df.loc[df['Hate?'] == 0.0]

# Clean the article date column
cleaned_date = []

for date in df['Article Date'].values.tolist():
    
    if str(date)[-4:-3] and int(str(date)[-4:-3]) > 3:
        cleaned_date.append( str(date)[2:-4] + '0' + str(date)[-4:-3] )
        
    else:
        cleaned_date.append( str(date)[2:-2] )
df['cleaned_date'] = cleaned_date
df['cleaned_date'] = pd.to_datetime(df.cleaned_date)

# Hate and offensive dataframes
hateDF = df.loc[df['Hate?'] == 1.0]
offensiveDF = df.loc[df['Hate?'] == 2.0]
ordinaryDF = df.loc[df['Hate?'] == 0.0]

hate_dates = hateDF['cleaned_date']#.values.tolist()
hate_vol = hateDF['Hate?']

offensive_dates = offensiveDF['cleaned_date']
offensive_vol = offensiveDF['Hate?']

ordinary_dates = ordinaryDF['cleaned_date']
ordinary_vol = ordinaryDF['Hate?']

hateDF['cleaned_date'] = pd.to_datetime(hateDF.cleaned_date)
offensiveDF['cleaned_date'] = pd.to_datetime(offensiveDF.cleaned_date)
ordinaryDF['cleaned_date'] = pd.to_datetime(ordinaryDF.cleaned_date)

## Topic Modeling Helper Functions

In [60]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF 
import numpy as np
import pandas as pd
import texthero as hero
from texthero import preprocessing
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import base64
import plotly.graph_objs as go
import plotly.offline as pyo
from IPython.core.display import display, HTML
import IPython

# NMF - top words per topic
def display_topic_keywords(model, feature_names, num_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx+1)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-num_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx+1)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-num_top_words - 1:-1]]
        
    return pd.DataFrame(topic_dict)

# NMF - top representative documents per topic
def display_topic_docs(topics_matrix, docs_matrix, feature_names, docs, num_top_words, num_top_docs):
    
    per_topic_comments = []
    all_comments = []
    
    all_topics_comments = []
    
    for topic_index, topic in enumerate(topics_matrix):
        #print('\n')
        #print("Topic %d:" % (topic_index+1))
        #print( " ".join([feature_names[i] for i in topic.argsort()[:-num_top_words-1 : -1]]))
        
        top_doc_idx = np.argsort( docs_matrix[:,topic_index] )[::-1][0:num_top_docs]
        
        all_topics_comments.append( 'topic_'+str(topic_index+1)+'_comments' )
        
        # Store all tweets per topic into a string
        all_topics_comments[topic_index] = []

        for doc_idx in top_doc_idx:
            #print( docs[doc_idx] )
            all_topics_comments[topic_index].append( docs[doc_idx] )
            
    return all_topics_comments

def get_descriptor(all_terms, H, topic_idx, num_top_words):
    # Reverse sort values to sort indices
    top_indices = np.argsort( H[topic_idx,:] )[::-1]
    
    top_terms = []
    for term_idx in top_indices[0:num_top_words]:
        top_terms.append( all_terms[term_idx] )
    return top_terms


# Text cleaning 
custom_pipeline = [preprocessing.lowercase,
                       preprocessing.remove_stopwords,
                       preprocessing.remove_digits,
                       preprocessing.remove_punctuation,
                       preprocessing.remove_diacritics,
                       preprocessing.remove_whitespace]
cleaned_text = hero.clean(hateDF['Comments'],
                                pipeline=custom_pipeline)
#pd.set_option('max_colwidth', 100000)
#cleaned_text = df2['cleaned'].to_string(index=False)


# TF-IDF
text_data = cleaned_text.values.tolist()
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, norm='l2') #, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(text_data)
X_tfidf = tfidf
#print( "Created %d X %d TF-IDF-normalized document-term matrix" % (tfidf.shape[0], tfidf.shape[1]) )

tfidf_terms = tfidf_vectorizer.get_feature_names()
#print( "Vocabulary has %d distinct terms" % len(tfidf_terms))
#print('\n\n\n\n')

# Hate Speech Topic Modeling - Nonnegative Matrix Factorization (NMF)
Each topic will be represented and described by the top 10 keywords and relevant comments, in addition to other useful data visualizations.

In [61]:
k = 10
num_top_words = 10
num_top_docs = 10
nmf_model = NMF(n_components=k, random_state=42, init='nndsvd', alpha=0.1, l1_ratio=0.5)
nmf_W = nmf_model.fit_transform( X_tfidf )
nmf_H = nmf_model.components_


# Topic Overview
keywords = '-------- Hate Speech Topics Overview --------'
bolded = "\033[1m" + keywords + "\033[0m"
print(bolded)
for topic_idx in range(k):
    descriptor = get_descriptor(tfidf_terms, nmf_H, topic_idx, num_top_words)
    str_descriptor = ", ".join( descriptor )
    print("Topic %02d: %s" % ( topic_idx+1, str_descriptor ))
print('\n')


# Visualize topic proportions
topics = pd.DataFrame(nmf_W.T).idxmax(axis=1, skipna=True)
fig = plt.figure(figsize=(13,7))
plt.bar(x=topics.index+1, height = topics.mul(100) / topics.sum() )
plt.gca().yaxis.set_major_formatter(PercentFormatter())
fig.suptitle('Topic Proportion', fontsize=20)
plt.xlabel('Topics', fontsize=18)
proportion_image = "topic_proportion_k=" + str(k) + ".png"
plt.savefig(proportion_image)
plt.close()

# Display proportion graph using base64
with open(proportion_image, 'rb') as propimg:
#proportion_graph_encoded = base64.b64encode(open(proportion_image, 'rb').read())
    proportion_graph_encoded = base64.b64encode(propimg.read())
    proportion_encoded_str = proportion_graph_encoded.decode('utf-8')

# create an image template
proportion_template = '<img align="left" src="data:image/png;base64, {image}">'

# create HTML object, using the string template
proportion_html = HTML(proportion_template.format(image=proportion_encoded_str))
display(proportion_html)
    
    
# Topic Proportion over time
# First convert dates to appropriate formats
# Then append to NMF dataframe
proportionsOverTime = []
dates = pd.to_datetime(hateDF['cleaned_date']).dt.strftime('%Y-%m-%d')
hate_dates = [d.strftime('%Y-%m-%d') if not pd.isnull(d) else '' for d in hateDF['cleaned_date']]
nmfDF = pd.DataFrame(nmf_W)
#nmfDF['date'] = dates
nmfDF['date'] = hate_dates
    
topicDistDF = nmfDF.groupby('date').aggregate('sum').apply(lambda x: x / sum(x))
for i in topicDistDF.columns:
    proportionsOverTime.append(go.Scatter(x = topicDistDF.index,
                                          y = topicDistDF[i],
                                          name = 'Topic ' + str(i+1)))

timeseriesHeader = '<h3> Topic Proportions Over Time </h3>'
display(HTML(timeseriesHeader))
layout = go.Layout(title='Topic Proportions Over Time')
fig = go.Figure(data=proportionsOverTime)
fig.show(renderer = "notebook_connected")

    
# Obtain topic proportions 
proportions = topics.mul(100) / topics.sum()
proportions_list = np.around(proportions, decimals=2)
    
all_topics_comments = display_topic_docs(nmf_H, nmf_W, tfidf_terms, df['Comments'].values.tolist(), #df['Content'].values.tolist(), 
                       num_top_words, num_top_docs)

# Topic Details
for topic_idx in range(k):
    descriptor = get_descriptor(tfidf_terms, nmf_H, topic_idx, num_top_words)
    str_descriptor = ", ".join( descriptor )
        
    # Topic index & proportion as header 
    # Topic keywords
    total = np.sum(topics)
    topic_prop = (topics[topic_idx]/total) * 100
        
    topic_name = ''.join( ("Topic %02d - Proportion %0.2f" % ( topic_idx+1, topic_prop ), '%') )
    bolded_topics = "\033[1m" + topic_name + "\033[0m"
    print(bolded_topics)
        
    keywords = 'Keywords: '
    bolded_keywords = "\033[1m" + keywords + "\033[0m"
    print(bolded_keywords, str_descriptor)
        
        
    # Key tweets per topic
    topic_idx_comments = all_topics_comments[topic_idx]
    for idx, comment in enumerate(topic_idx_comments):
        print("Comment %02d: %s" % ( idx+1, comment ))
    print('\n')


[1m-------- Hate Speech Topics Overview --------[0m
Topic 01: race, war, usa, really, remove, needs, plague, black, infestation, country
Topic 02: shoot, em, lizbeth, bastards, son, good, arrest, gitmo, cities, vermin
Topic 03: blacks, human, black, like, never, humans, people, white, women, society
Topic 04: kill, hopefully, liberal, every, find, family, help, cnn, level, bastards
Topic 05: charlie, foxtrot, kilo, uniform, lynch, black, primates, zulu, zulus, negros
Topic 06: need, reproduce, paying, welfare, end, assassinations, extinct, city, plague, start
Topic 07: one, every, stards, good, needs, name, bastards, another, black, see
Topic 08: needs, hung, murder, cuomo, nursing, mass, pos, murderer, hang, worthless
Topic 09: reggins, sick, vile, inhuman, gay, zulus, fired, filled, finally, find
Topic 10: antifa, blm, start, criminals, shooting, putting, killing, terrorists, sight, white




[1mTopic 01 - Proportion 10.57%[0m
[1mKeywords: [0m race, war, usa, really, remove, needs, plague, black, infestation, country
Comment 01: hell the CIA is probably taking money from him also.
Comment 02: The excuse of looting stores to protest a killing is pitiful no matter what color you are so if these guys are forced to defend the business it is not a racial problem but a criminal problem.  Put the blame on the criminals not the color of criminals.
Comment 03: Drop a few and take the air out of them and the rest will understand it ain' worth the money Soros is paying and leave.
Comment 04: When you are using your office for financial gain and using the voters of your district as fools diverting attention away from your inability to actually represent your constituents and re-focusing that attention by blatant lies on an opposing parties POTUS then she is as good as it gets for the democratic idiots who put her in office time and time again.  They certainly deserve her and her st

# Offensive Speech Topics

In [62]:
# Text cleaning 
df2 = pd.DataFrame()
custom_pipeline = [preprocessing.lowercase,
                       preprocessing.remove_stopwords,
                       preprocessing.remove_digits,
                       preprocessing.remove_punctuation,
                       preprocessing.remove_diacritics,
                       preprocessing.remove_whitespace]
cleaned_text = hero.clean(offensiveDF['Comments'],
                                pipeline=custom_pipeline)


# TF-IDF
text_data = cleaned_text.values.tolist()
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, norm='l2') #, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(text_data)
X_tfidf = tfidf
#print( "Created %d X %d TF-IDF-normalized document-term matrix" % (tfidf.shape[0], tfidf.shape[1]) )

tfidf_terms = tfidf_vectorizer.get_feature_names()
#print( "Vocabulary has %d distinct terms" % len(tfidf_terms))
#print('\n\n\n\n')


k = 13
num_top_words = 10
num_top_docs = 10
nmf_model = NMF(n_components=k, random_state=42, init='nndsvd', alpha=0.1, l1_ratio=0.5)
nmf_W = nmf_model.fit_transform( X_tfidf )
nmf_H = nmf_model.components_


# Topic Overview
keywords = '-------- Offensive Speech Topics Overview --------'
bolded = "\033[1m" + keywords + "\033[0m"
print(bolded)
for topic_idx in range(k):
    descriptor = get_descriptor(tfidf_terms, nmf_H, topic_idx, num_top_words)
    str_descriptor = ", ".join( descriptor )
    print("Topic %02d: %s" % ( topic_idx+1, str_descriptor ))
print('\n')


# Visualize topic proportions
topics = pd.DataFrame(nmf_W.T).idxmax(axis=1, skipna=True)
fig = plt.figure(figsize=(13,7))
plt.bar(x=topics.index+1, height = topics.mul(100) / topics.sum() )
plt.gca().yaxis.set_major_formatter(PercentFormatter())
fig.suptitle('Topic Proportion', fontsize=20)
plt.xlabel('Topics', fontsize=18)
proportion_image = "topic_proportion_k=" + str(k) + ".png"
plt.savefig(proportion_image)
plt.close()

# Display proportion graph using base64
with open(proportion_image, 'rb') as propimg:
#proportion_graph_encoded = base64.b64encode(open(proportion_image, 'rb').read())
    proportion_graph_encoded = base64.b64encode(propimg.read())
    proportion_encoded_str = proportion_graph_encoded.decode('utf-8')

# create an image template
proportion_template = '<img align="left" src="data:image/png;base64, {image}">'

# create HTML object, using the string template
proportion_html = HTML(proportion_template.format(image=proportion_encoded_str))
display(proportion_html)
    
    
# Topic Proportion over time
# First convert dates to appropriate formats
# Then append to NMF dataframe
proportionsOverTime = []
dates = pd.to_datetime(offensiveDF['cleaned_date']).dt.strftime('%Y-%m-%d')
hate_dates = [d.strftime('%Y-%m-%d') if not pd.isnull(d) else '' for d in offensiveDF['cleaned_date']]
nmfDF = pd.DataFrame(nmf_W)
#nmfDF['date'] = dates
nmfDF['date'] = hate_dates
    
topicDistDF = nmfDF.groupby('date').aggregate('sum').apply(lambda x: x / sum(x))
for i in topicDistDF.columns:
    proportionsOverTime.append(go.Scatter(x = topicDistDF.index,
                                          y = topicDistDF[i],
                                          name = 'Topic ' + str(i+1)))

timeseriesHeader = '<h3> Topic Proportions Over Time </h3>'
display(HTML(timeseriesHeader))
layout = go.Layout(title='Topic Proportions Over Time')
fig = go.Figure(data=proportionsOverTime)
fig.show(renderer = "notebook_connected")

    
# Obtain topic proportions 
proportions = topics.mul(100) / topics.sum()
proportions_list = np.around(proportions, decimals=2)
    
all_topics_comments = display_topic_docs(nmf_H, nmf_W, tfidf_terms, df['Comments'].values.tolist(), #df['Content'].values.tolist(), 
                       num_top_words, num_top_docs)

# Topic Details
for topic_idx in range(k):
    descriptor = get_descriptor(tfidf_terms, nmf_H, topic_idx, num_top_words)
    str_descriptor = ", ".join( descriptor )
        
    # Topic index & proportion as header 
    # Topic keywords
    total = np.sum(topics)
    topic_prop = (topics[topic_idx]/total) * 100
        
    topic_name = ''.join( ("Topic %02d - Proportion %0.2f" % ( topic_idx+1, topic_prop ), '%') )
    bolded_topics = "\033[1m" + topic_name + "\033[0m"
    print(bolded_topics)
        
    keywords = 'Keywords: '
    bolded_keywords = "\033[1m" + keywords + "\033[0m"
    print(bolded_keywords, str_descriptor)
        
        
    # Key tweets per topic
    topic_idx_comments = all_topics_comments[topic_idx]
    for idx, comment in enumerate(topic_idx_comments):
        print("Comment %02d: %s" % ( idx+1, comment ))
    print('\n')


[1m-------- Offensive Speech Topics Overview --------[0m
Topic 01: trump, dems, virus, people, country, would, america, war, stop, china
Topic 02: democraps, badly, nov, recover, years, destroy, must, takes, take, punish
Topic 03: charlie, uniform, kilo, tango, foxtrot, november, elect, unifrom, oscar, unity
Topic 04: black, people, get, loves, hate, moves, lot, uneducated, crime, gay
Topic 05: like, democrats, killed, sounds, sshole, california, nothing, one, support, toast
Topic 06: stupid, seem, tho, find, lead, better, rules, let, power, people
Topic 07: biden, get, covid, hour, debates, hero, virus, fight, election, recover
Topic 08: white, anti, tax, lives, guy, party, matter, people, time, basically
Topic 09: cities, leave, make, areas, residential, fools, rural, stay, private, come
Topic 10: democrat, privilege, described, thanks, party, suits, nothing, police, toast, alien
Topic 11: blm, antifa, needs, good, terrorists, control, hate, usa, love, nfl
Topic 12: bad, dude, corn

[1mTopic 01 - Proportion 3.53%[0m
[1mKeywords: [0m trump, dems, virus, people, country, would, america, war, stop, china
Comment 01: No there is no law saying that anyone must stand and pay reverence to the flag at that time for public gatherings.  Yes, no law but understanding and appreciating the fact the only reason you even have the opportunity to be at the event or have the decision to make at any time is because millions of patriots gave everything for you and i to be able to do as we please.  So it would seem that our youth or the individuals who would use these moments of reverence to kneel as a political or racial statement have forgotten or decided it was not worth the effort to give that remembrance to the heroes who gave all so we could have anything.  Being appreciative of blessings either from God Almighty or those who sacrificed their lives in war seems to be a small pittance compared to the actual cost.  God Bless America and her Falling Heroes
Comment 02: And exact

# Ordinary Speech Topics

In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF 
import numpy as np
import pandas as pd
import texthero as hero
from texthero import preprocessing

def get_descriptor(all_terms, H, topic_idx, num_top_words):
    # Reverse sort values to sort indices
    top_indices = np.argsort( H[topic_idx,:] )[::-1]
    
    top_terms = []
    for term_idx in top_indices[0:num_top_words]:
        top_terms.append( all_terms[term_idx] )
    return top_terms

# Text cleaning 
df2 = pd.DataFrame()
custom_pipeline = [preprocessing.lowercase,
                       preprocessing.remove_stopwords,
                       preprocessing.remove_digits,
                       preprocessing.remove_punctuation,
                       preprocessing.remove_diacritics,
                       preprocessing.remove_whitespace]
df2['cleaned'] = hero.clean(ordinaryDF['Comments'],
                                pipeline=custom_pipeline)
pd.set_option('max_colwidth', 100000)
cleaned_text = df2['cleaned'].to_string(index=False)


# TF-IDF
text_data = df2['cleaned'].values.tolist()
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, norm='l2') #, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(text_data)
X_tfidf = tfidf
#print( "Created %d X %d TF-IDF-normalized document-term matrix" % (tfidf.shape[0], tfidf.shape[1]) )

tfidf_terms = tfidf_vectorizer.get_feature_names()
#print( "Vocabulary has %d distinct terms" % len(tfidf_terms))

k = 10
num_top_words = 10
nmf_model = NMF(n_components=k, random_state=42, init='nndsvd', alpha=0.1, l1_ratio=0.5)
W = nmf_model.fit_transform( X_tfidf )
H = nmf_model.components_

print('-------- Ordinary Speech Topics --------')
for topic_idx in range(k):
    descriptor = get_descriptor(tfidf_terms, H, topic_idx, num_top_words)
    str_descriptor = ", ".join( descriptor )
    print("Topic %02d: %s" % ( topic_idx+1, str_descriptor ))
print('\n')

-------- Ordinary Speech Topics --------
Topic 01: trump, say, vote, wrong, dems, get, years, obama, perfect, busy
Topic 02: ha, kid, thing, death, slow, hillary, line, remember, fish, call
Topic 03: got, hair, funny, buddy, church, head, thru, use, least, going
Topic 04: exactly, nothing, news, president, durham, team, chose, put, knew, young
Topic 05: like, much, mom, looks, necessities, looked, people, war, damn, culture
Topic 06: go, already, light, time, love, george, guys, obama, want, behind
Topic 07: one, cares, dollars, named, especially, forgot, obama, power, laws, effect
Topic 08: would, anyone, right, bang, left, end, well, heard, front, white
Topic 09: man, mon, good, yet, nick, great, peace, long, hard, yeah
Topic 10: black, true, culture, said, sad, society, white, free, notice, basically


