In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from PIL import Image

from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim import corpora, models, similarities
from gensim.models import CoherenceModel

from gensim.test.utils import datapath
from collections import Counter

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
# Load the full dataset. Later cells read its text, opinion, gender, race, age,
# sentiment, description and texas columns.
data=pd.read_csv('all_data.csv')

### Identify most common words

In [None]:
# Count every space-delimited token across all texts. Splitting on a literal ' '
# (not on arbitrary whitespace) means consecutive spaces produce empty-string
# tokens, which are counted like any other word.
wordCounts = Counter(' '.join(data['text']).split(' '))

In [None]:
# Corpus-specific stop words: the 15 most frequent tokens plus a hand-picked list
# of noise terms. Unpacking the (word, count) pairs directly avoids the
# zip(*...)[0] idiom, which raises IndexError when the counter is empty.
commonStopWords = {word for word, _ in wordCounts.most_common(15)}
commonStopWords.update(['afghanistan','pet','petdeals','want','products','shop','lovers'])

In [None]:
def getTopicDist(df,lda, cdict, ntopics, mapping):
  """Print how the documents in ``df`` split across their dominant LDA topic.

  Each row's ``text`` is split on single spaces, converted to a bag of words
  with ``cdict`` and scored by ``lda``; the topic with the highest probability
  is the row's dominant topic.

  Parameters:
    df: DataFrame with a ``text`` column of strings.
    lda: topic model indexed as ``lda[bow]``; element 0 of the result must be a
      sequence of ``(topic_id, probability)`` pairs (this is what the models in
      this notebook return because they are built with ``per_word_topics=True``).
    cdict: dictionary object exposing ``doc2bow(tokens)``.
    ntopics: number of topics; ids ``0 .. ntopics-1`` are always reported.
    mapping: dict of theme name -> list of topic ids belonging to that theme.

  Prints the ``{topic_id: share_of_rows}`` dict, then one line per theme with the
  summed share as a percentage. Returns None.

  Rows for which the model reports no topic at all are skipped, so the shares
  may sum to less than 1.
  """
  counts = {i: 0 for i in range(ntopics)}
  total = df.shape[0]
  for text in df['text']:
    bow = cdict.doc2bow(text.split(' '))
    doc_topics = lda[bow][0]
    if not doc_topics:
      # No topic cleared the model's probability threshold for this row.
      continue
    # max() keeps the first pair on ties, matching np.argmax.
    topic = max(doc_topics, key=lambda pair: pair[1])[0]
    counts[topic] = counts.get(topic, 0) + 1
  # Divide once at the end: summing 1/total per row accumulates rounding error.
  tdict = {i: (n / total if total else 0.0) for i, n in counts.items()}
  print(tdict)
  for topic in mapping.keys():
    # .get tolerates theme ids that were never initialised or observed.
    print(topic, sum(tdict.get(i, 0) for i in mapping[topic])*100)

### Opinion - Support

In [None]:
# Tokenise every "Support" text on single spaces and drop the corpus stop words.
alltexts1 = [
    [token for token in doc.split(' ') if token not in commonStopWords]
    for doc in data.loc[data.opinion == 'Support', 'text'].tolist()
]

In [None]:
# Token <-> integer id vocabulary built from the "Support" token lists.
common_dictionary1 = Dictionary(alltexts1)

In [None]:
# Bag-of-words representation of each "Support" document.
common_corpus1 = list(map(common_dictionary1.doc2bow, alltexts1))

In [None]:
# 10-topic LDA model for the "Support" corpus, seeded via random_state.
# per_word_topics=True makes lda1[bow] return a tuple whose first element is the
# document's topic distribution (getTopicDist relies on that).
lda1 = models.LdaMulticore(
    corpus=common_corpus1,
    id2word=common_dictionary1,
    num_topics=10,
    random_state=100,
    chunksize=100,
    passes=10,
    per_word_topics=True,
)

In [None]:
# Show the top words of each "Support" topic; the theme mapping defined for this
# model refers to these topic ids.
lda1.print_topics()

In [None]:
# Theme -> topic ids of lda1. Topics 1, 2 and 6 belong to no theme.
# NOTE(review): the ids look hand-assigned from the print_topics() output —
# re-check them whenever the model is retrained.
mapping1={'choice':[0,4], 'unborn':[9,7], 'religion':[5], 'healthcare':[3,8]}

In [None]:
# c_v coherence of the "Support" model, scored against its own tokenised texts.
coherence_model_lda = CoherenceModel(
    model=lda1,
    texts=alltexts1,
    dictionary=common_dictionary1,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Dominant-topic shares over all "Support" rows.
df = data.loc[data.opinion == 'Support']
getTopicDist(df, lda1, common_dictionary1, ntopics=10, mapping=mapping1)

### Opinion - Against

In [None]:
# Tokenise every "Against" text on single spaces and drop the corpus stop words.
alltexts2 = [
    [token for token in doc.split(' ') if token not in commonStopWords]
    for doc in data.loc[data.opinion == 'Against', 'text'].tolist()
]

In [None]:
# Token <-> integer id vocabulary built from the "Against" token lists.
common_dictionary2 = Dictionary(alltexts2)

In [None]:
# Bag-of-words representation of each "Against" document.
common_corpus2 = list(map(common_dictionary2.doc2bow, alltexts2))

In [None]:
# 10-topic LDA model for the "Against" corpus, with the same hyper-parameters as
# the "Support" model.
lda2 = models.LdaMulticore(
    corpus=common_corpus2,
    id2word=common_dictionary2,
    num_topics=10,
    random_state=100,
    chunksize=100,
    passes=10,
    per_word_topics=True,
)

In [None]:
# Show the top words of each "Against" topic; the theme mapping defined for this
# model refers to these topic ids.
lda2.print_topics()

In [None]:
# Theme -> topic ids of lda2. Topics 1 and 6 belong to no theme.
# NOTE(review): the ids look hand-assigned from the print_topics() output —
# re-check them whenever the model is retrained.
mapping2={'nationalist':[0,5,2], 'unborn':[9,7], 'religion':[4], 'healthcare':[3,8]}

In [None]:
# c_v coherence of the "Against" model, scored against its own tokenised texts.
coherence_model_lda = CoherenceModel(
    model=lda2,
    texts=alltexts2,
    dictionary=common_dictionary2,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Dominant-topic shares over all "Against" rows.
df = data.loc[data.opinion == 'Against']
getTopicDist(df, lda2, common_dictionary2, ntopics=10, mapping=mapping2)

### What are males talking about?

In [None]:
# Male authors, split by stance: the *_1 names hold "Support" rows, *_0 "Against".
# The maletexts_* lists are flat, space-split tokens of the whole group.
males_1 = data.loc[(data.opinion == 'Support') & (data.gender == 'male')]
maletexts_1 = ' '.join(males_1['text'].tolist()).split(' ')

males_0 = data.loc[(data.opinion == 'Against') & (data.gender == 'male')]
maletexts_0 = ' '.join(males_0['text'].tolist()).split(' ')

In [None]:
# Dominant-topic shares of male "Support" rows under the Support model and its
# theme mapping.
getTopicDist(males_1,lda1, common_dictionary1, 10, mapping1)

In [None]:
# Dominant-topic shares of male "Against" rows under the Against model and its
# theme mapping.
getTopicDist(males_0,lda2, common_dictionary2, 10, mapping2)

### What are females talking about?

In [None]:
# Female authors, split by stance: the *_1 names hold "Support" rows, *_0 "Against".
# The femaletexts_* lists are flat, space-split tokens of the whole group.
females_1 = data.loc[(data.opinion == 'Support') & (data.gender == 'female')]
femaletexts_1 = ' '.join(females_1['text'].tolist()).split(' ')

females_0 = data.loc[(data.opinion == 'Against') & (data.gender == 'female')]
femaletexts_0 = ' '.join(females_0['text'].tolist()).split(' ')

In [None]:
# Dominant-topic shares of female "Support" rows under the Support model.
getTopicDist(females_1,lda1, common_dictionary1, 10, mapping1)

In [None]:
# Dominant-topic shares of female "Against" rows under the Against model.
getTopicDist(females_0,lda2, common_dictionary2, 10, mapping2)

### What are whites talking about?

In [None]:
# Rows with race == 'white', split by stance (the *_1 names hold "Support" rows,
# *_0 "Against"), plus flat token lists per group.
white_1 = data.loc[(data.opinion == 'Support') & (data.race == 'white')]
whitetexts_1 = ' '.join(white_1['text'].tolist()).split(' ')

white_0 = data.loc[(data.opinion == 'Against') & (data.race == 'white')]
whitetexts_0 = ' '.join(white_0['text'].tolist()).split(' ')

In [None]:
# Dominant-topic shares of white "Support" rows under the Support model.
getTopicDist(white_1,lda1, common_dictionary1, 10, mapping1)

In [None]:
# Dominant-topic shares of white "Against" rows under the Against model.
getTopicDist(white_0,lda2, common_dictionary2, 10, mapping2)

### What are non-whites talking about?

In [None]:
# Every row whose race is not 'white', split by stance. A != comparison is also
# True for missing values, so rows without a race are part of this group.
nonwhite_1 = data.loc[(data.opinion == 'Support') & (data.race != 'white')]
nonwhitetexts_1 = ' '.join(nonwhite_1['text'].tolist()).split(' ')

nonwhite_0 = data.loc[(data.opinion == 'Against') & (data.race != 'white')]
nonwhitetexts_0 = ' '.join(nonwhite_0['text'].tolist()).split(' ')

In [None]:
# Dominant-topic shares of non-white "Support" rows under the Support model.
getTopicDist(nonwhite_1,lda1, common_dictionary1, 10, mapping1)

In [None]:
# Dominant-topic shares of non-white "Against" rows under the Against model.
getTopicDist(nonwhite_0,lda2, common_dictionary2, 10, mapping2)

### What are adults talking about?

In [None]:
# Rows in the '>=40' age bucket, split by stance, plus flat token lists per group.
adult_1 = data.loc[(data.opinion == 'Support') & (data.age == '>=40')]
adulttexts_1 = ' '.join(adult_1['text'].tolist()).split(' ')

adult_0 = data.loc[(data.opinion == 'Against') & (data.age == '>=40')]
adulttexts_0 = ' '.join(adult_0['text'].tolist()).split(' ')

In [None]:
# Dominant-topic shares of '>=40' "Support" rows under the Support model.
getTopicDist(adult_1,lda1, common_dictionary1, 10, mapping1)

In [None]:
# Dominant-topic shares of '>=40' "Against" rows under the Against model.
getTopicDist(adult_0,lda2, common_dictionary2, 10, mapping2)

### What are middle-aged talking about?

In [None]:
# Middle-aged = every age bucket other than '>=40' and '<=18'. ~isin(...) is also
# True for missing ages, so rows without an age fall into this group.
ma_1=data[(~data.age.isin(['>=40','<=18'])) & (data.opinion=='Support')]
matexts_1 = list(ma_1['text'])
# Store the tokens under matexts_1 (not adulttexts_1) so the adult group's token
# list is left untouched and matexts_1 really holds split tokens.
matexts_1 = (' '.join(matexts_1)).split(' ')

ma_0=data[(~data.age.isin(['>=40','<=18'])) & (data.opinion=='Against')]
matexts_0 = list(ma_0['text'])
matexts_0 = (' '.join(matexts_0)).split(' ')

In [None]:
# Dominant-topic shares of middle-aged "Support" rows under the Support model.
getTopicDist(ma_1,lda1, common_dictionary1, 10, mapping1)

In [None]:
# Dominant-topic shares of middle-aged "Against" rows under the Against model.
getTopicDist(ma_0,lda2, common_dictionary2, 10, mapping2)

### What are kids talking about?

In [None]:
# Rows in the '<=18' age bucket, split by stance, plus flat token lists per group.
kid_1 = data.loc[(data.opinion == 'Support') & (data.age == '<=18')]
kidtexts_1 = ' '.join(kid_1['text'].tolist()).split(' ')

kid_0 = data.loc[(data.opinion == 'Against') & (data.age == '<=18')]
kidtexts_0 = ' '.join(kid_0['text'].tolist()).split(' ')

In [None]:
# Dominant-topic shares of '<=18' "Support" rows under the Support model.
getTopicDist(kid_1,lda1, common_dictionary1, 10, mapping1)

In [None]:
# Dominant-topic shares of '<=18' "Against" rows under the Against model.
getTopicDist(kid_0,lda2, common_dictionary2, 10, mapping2)

## Sentiment

### Positive

In [None]:
# Tokenise every positive-sentiment text and drop the corpus stop words.
alltexts_pos = [
    [token for token in doc.split(' ') if token not in commonStopWords]
    for doc in data.loc[data.sentiment == 'pos', 'text'].tolist()
]

In [None]:
# Token <-> integer id vocabulary built from the positive-sentiment token lists.
common_dictionaryp = Dictionary(alltexts_pos)

In [None]:
# Bag-of-words representation of each positive-sentiment document.
common_corpusp = list(map(common_dictionaryp.doc2bow, alltexts_pos))

In [None]:
# 10-topic LDA model for the positive-sentiment corpus, with the same
# hyper-parameters as the opinion models.
ldap = models.LdaMulticore(
    corpus=common_corpusp,
    id2word=common_dictionaryp,
    num_topics=10,
    random_state=100,
    chunksize=100,
    passes=10,
    per_word_topics=True,
)

In [None]:
# Show the top words of each positive-sentiment topic.
ldap.print_topics()

In [None]:
# Word cloud of all positive-sentiment text. Corpus stop words are removed from
# the text up front; WordCloud's built-in English list is applied by the renderer.
stopwords = set(STOPWORDS)
text = ' '.join(
    w
    for w in ' '.join(data.loc[data.sentiment == 'pos', 'text']).split(' ')
    if w not in commonStopWords
)
wordcloud = WordCloud(
    width=1500,
    height=800,
    background_color='white',
    stopwords=stopwords,
    min_font_size=10,
).generate(text)

# Borderless figure: no axes, no padding.
plt.figure(figsize=(15, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

### Negative

In [None]:
# Tokenise every negative-sentiment text and drop the corpus stop words.
alltexts_neg = [
    [token for token in doc.split(' ') if token not in commonStopWords]
    for doc in data.loc[data.sentiment == 'neg', 'text'].tolist()
]

In [None]:
# Token <-> integer id vocabulary built from the negative-sentiment token lists.
common_dictionaryn = Dictionary(alltexts_neg)

In [None]:
# Bag-of-words representation of each negative-sentiment document.
common_corpusn = list(map(common_dictionaryn.doc2bow, alltexts_neg))

In [None]:
# 10-topic LDA model for the negative-sentiment corpus, with the same
# hyper-parameters as the opinion models.
ldan = models.LdaMulticore(
    corpus=common_corpusn,
    id2word=common_dictionaryn,
    num_topics=10,
    random_state=100,
    chunksize=100,
    passes=10,
    per_word_topics=True,
)

In [None]:
# Show the top words of each negative-sentiment topic.
ldan.print_topics()

In [None]:
# Word cloud of all negative-sentiment text. Corpus stop words are removed from
# the text up front; WordCloud's built-in English list is applied by the renderer.
stopwords = set(STOPWORDS)
text = ' '.join(
    w
    for w in ' '.join(data.loc[data.sentiment == 'neg', 'text']).split(' ')
    if w not in commonStopWords
)
wordcloud = WordCloud(
    width=1500,
    height=800,
    background_color='white',
    stopwords=stopwords,
    min_font_size=10,
).generate(text)

# Borderless figure: no axes, no padding.
plt.figure(figsize=(15, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

## Global LDA + Sentiment

In [None]:
# Global topic model over every text, regardless of opinion or sentiment.
# Tokenise on single spaces and drop the corpus stop words.
alltexts = [
    [token for token in doc.split(' ') if token not in commonStopWords]
    for doc in data['text'].tolist()
]

# Vocabulary and bag-of-words corpus for the whole dataset.
common_dictionary = Dictionary(alltexts)
common_corpus = list(map(common_dictionary.doc2bow, alltexts))

# Same hyper-parameters as the per-opinion models.
lda = models.LdaMulticore(
    corpus=common_corpus,
    id2word=common_dictionary,
    num_topics=10,
    random_state=100,
    chunksize=100,
    passes=10,
    per_word_topics=True,
)

lda.print_topics()

In [None]:
# Theme -> topic ids of the global model. Topics 2, 4, 6 and 8 belong to no theme.
# NOTE(review): the ids look hand-assigned from the print_topics() output —
# re-check them whenever the model is retrained.
mapping={'choice':[0], 'unborn':[3], 'religion':[5,7], 'healthcare':[9], 'hate':[1]}

In [None]:
# Get dominant topic
# Label every row with the theme of its most probable topic under the global
# model; topics that belong to no theme (and rows for which the model reports no
# topic at all) are labelled 'other'.
topicNames = ['choice','unborn','religion','healthcare','hate speech']
# Global-model topic id -> theme label, built once instead of on every iteration.
topicIdToName = {
    0: 'choice',
    3: 'unborn',
    5: 'religion',
    7: 'religion',
    9: 'healthcare',
    1: 'hate speech',
}

dominantTopics = []

for text in data['text']:
    bow = common_dictionary.doc2bow(text.split(' '))
    # lda[bow] is a tuple because the model uses per_word_topics=True;
    # element 0 holds the (topic_id, probability) pairs.
    docTopics = lda[bow][0]
    if not docTopics:
        # Keep one label per row so the list still lines up with the DataFrame.
        dominantTopics.append('other')
        continue
    # max() keeps the first pair on ties, matching np.argmax.
    maxTopicId = max(docTopics, key=lambda pair: pair[1])[0]
    dominantTopics.append(topicIdToName.get(maxTopicId, 'other'))

In [None]:
# Attach the per-row theme label; dominantTopics holds one entry per row of data.
data['dominantTopic']=dominantTopics

In [None]:
# Images used as word-cloud masks, loaded as pixel arrays. `mask` also feeds
# ImageColorGenerator in the clouds below; `texas` is the mask for the Texans cloud.
mask = np.array(Image.open("4852757-middle.png"))
texas = np.array(Image.open("texas.jpg"))

### Covid

In [None]:
# Rows whose text contains the substring 'vaccin'.
coviddata = data[data.text.str.contains('vaccin')]
print(coviddata.shape[0])

colors = ImageColorGenerator(mask)

# Hide 'vaccinated' and 'vaccination' by adding them to WordCloud's stop words.
# This removes whole words only; a substring str.replace on the joined text also
# cuts them out of longer words (e.g. 'unvaccinated' -> 'un'), which then show up
# as stray fragments in the cloud.
stopwords = set(STOPWORDS) | {'vaccinated', 'vaccination'}
text = ' '.join([w for w in ' '.join(coviddata['text']).split(' ') if w not in commonStopWords])
wordcloud = WordCloud(width = 1500, height = 800,
                background_color ='white',
                stopwords = stopwords,
                min_font_size = 10,
                mask=mask,
                color_func=colors,
                max_words=200).generate(text)

plt.figure(figsize = (15, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)

plt.show()

### Texas Taliban

In [None]:
# Word cloud for rows whose text contains 'taliban', drawn inside the mask image
# and coloured from it.
ttdata = data.loc[data.text.str.contains('taliban')]
print(len(ttdata))

colors = ImageColorGenerator(mask)
stopwords = set(STOPWORDS)
text = ' '.join(
    w for w in ' '.join(ttdata['text']).split(' ') if w not in commonStopWords
)

wordcloud = WordCloud(
    width=1500,
    height=800,
    background_color='white',
    stopwords=stopwords,
    min_font_size=10,
    mask=mask,
    color_func=colors,
    max_words=200,
).generate(text)

# Borderless figure: no axes, no padding.
plt.figure(figsize=(15, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

### God

In [None]:
# Word cloud for rows whose text contains 'god', 'bible' or 'catholic' (substring
# match; str.contains treats the pattern as a regex, so '|' means "or").
ttdata = data.loc[data.text.str.contains('god|bible|catholic')]
print(len(ttdata))

colors = ImageColorGenerator(mask)
stopwords = set(STOPWORDS)
text = ' '.join(
    w for w in ' '.join(ttdata['text']).split(' ') if w not in commonStopWords
)

wordcloud = WordCloud(
    width=1500,
    height=800,
    background_color='white',
    stopwords=stopwords,
    min_font_size=10,
    mask=mask,
    color_func=colors,
    max_words=200,
).generate(text)

# Borderless figure: no axes, no padding.
plt.figure(figsize=(15, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

### Republican

In [None]:
# Rows whose text contains 'healthcare', 'fund' or 'donate' (substring match).
# NOTE(review): the surrounding section is titled "Republican" although the filter
# is about healthcare/funding terms — confirm the heading.
ttdata = data[data.text.str.contains('healthcare') | data.text.str.contains('fund') | data.text.str.contains('donate')]
print(ttdata.shape[0])

colors = ImageColorGenerator(mask)

# Hide 'human', 'woman' and 'think' through WordCloud's stop words. This removes
# whole words only; a substring str.replace on the joined text also cuts them out
# of longer words (e.g. 'humanity' -> 'ity', 'thinking' -> 'ing'). Inflected forms
# that should be hidden as well can be added to the set.
stopwords = set(STOPWORDS) | {'human', 'woman', 'think'}
text = ' '.join([w for w in ' '.join(ttdata['text']).split(' ') if w not in commonStopWords])

wordcloud = WordCloud(width = 1500, height = 800,
                background_color ='white',
                stopwords = stopwords,
                min_font_size = 10,
                mask=mask,
                color_func=colors).generate(text)

plt.figure(figsize = (15, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)

plt.show()

In [None]:
# Texans: the description mentions 'texas' or 'tx' (case-insensitive substring
# match) or the row is flagged texas == 1. na=False makes rows without a
# description evaluate to False instead of propagating NaN into the boolean mask,
# which replaces the separate ~isna() guard.
texans = data[data.description.str.lower().str.contains('texas', na=False)
 | data.description.str.lower().str.contains('tx', na=False)
 | (data.texas==1)]

# Rebuilt here but not passed to the WordCloud below, which uses default colours.
colors = ImageColorGenerator(mask)

# Hide 'state', 'think' and 'new' through WordCloud's stop words. This removes
# whole words only; a substring str.replace on the joined text also cuts them out
# of longer words (e.g. 'news' -> ' s', 'statement' -> 'ment'). Inflected forms
# that should be hidden as well can be added to the set.
stopwords = set(STOPWORDS) | {'state', 'think', 'new'}
text = ' '.join([w for w in ' '.join(texans['text']).split(' ') if w not in commonStopWords])

wordcloud = WordCloud(width = 1500, height = 800,
                background_color ='white',
                stopwords = stopwords,
                min_font_size = 10,
                mask=texas).generate(text)

plt.figure(figsize = (15, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)

plt.show()

In [None]:
# Save the Texan subset (without the index column) for use outside this notebook.
texans.to_csv('texans.csv',index=False)