In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

pd.options.display.max_colwidth = 500

In [2]:
train_text = pd.read_csv('./datasets/cleaned_donald_chappo_text.csv', index_col=0)
train_text.dropna(inplace=True)
train_text.head()

Unnamed: 0,author,body,date,score,subreddit
0,nixfu,no completed deals not doing business he was looking into it but never did it duh,2018-11-30,1,The_Donald
1,Crypulous,trump has been amazing on a lot of things but on the wall i agree with ann i also think trump responds to certain critics so coulter is actually doing him a favor by keeping the pressure on you can support someone without agreeing with everything they do no wall would be very bad for trump s legacy a wall and an end to anchor babies are key issues for a lot of voters,2018-11-30,1,The_Donald
2,Dueler312,actually fox news did show it,2018-11-30,1,The_Donald
3,soberlight,the excuse about being concerned about their response is pretty thin considering they re trampling over rule of law as it is we don t really have much of a tomorrow when it comes to the whoring of justice x b x b,2018-11-30,1,The_Donald
4,enterthewalrus,well arizona did not get martha mcsally but it looks like the did trade up to get sinema,2018-11-30,1,The_Donald


In [3]:
train_text.shape

(57710, 5)

In [4]:
X = train_text['body']
y = train_text['subreddit'].map(lambda x: 1 if x == 'The_Donald' else 0)

In [5]:
#baseline
y.value_counts(normalize=True)

1    0.511159
0    0.488841
Name: subreddit, dtype: float64

Analyzing Text with Logistic Regression (no stopwords)

In [6]:
def text_dataframe(text, n_features, sw=None):
    count = CountVectorizer(stop_words=sw, max_features=n_features)
    text = count.fit_transform(text)
    text = text.toarray()
    return pd.DataFrame(text, columns=count.get_feature_names())

In [7]:
X_vect = text_dataframe(X, n_features=40000, sw='english')

In [8]:
logit = LogisticRegression()

In [9]:
logit.fit(X_vect, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [10]:
logit.score(X_vect, y)

0.8540807485704384

Creating a dictonary with key values pairs that correspond to the odds and associated word as calculated by the logistic regressoin

In [11]:
coef_dict = {}

for i in zip(np.exp(logit.coef_[0]),X_vect.columns):
    coef_dict[float(i[0])] = i[1]

In [12]:
# taking the largest 15 odds, these correlate strongly text from The Donald
donald_top_ten = pd.Series(np.exp(logit.coef_[0])).sort_values(ascending=False).head(15)

In [13]:
# taking the smallest 15 odds, these correlate strongly text from ChapoTrapHouse
chapo_top_ten = pd.Series(np.exp(logit.coef_[0])).sort_values(ascending=True).head(15)

The top 15 words that correlate with The Donald and their associated odds

In [14]:
for i in donald_top_ten:
    print(coef_dict[i], '---', i)

pede --- 35.64945049729865
pedes --- 18.575974212978817
kek --- 18.264577438722885
seth --- 11.890973949991421
geotus --- 9.907162200279926
redacted --- 8.390915901586679
swamp --- 8.319730573413143
bigot --- 7.02894685563287
cnn --- 6.88520345436
msm --- 6.775028675390836
invaders --- 6.292425256350223
fraud --- 6.150562659472404
reeeeeee --- 5.938297442591888
bongino --- 5.92558642226683
doj --- 5.828860889645256


The top 15 words that correlate with ChapoTrapHouse and their associated odds (relative to being in The Donald)

In [15]:
for i in chapo_top_ten:
    print(coef_dict[i], '---', i)

hog --- 0.016530407379071032
chapo --- 0.017758700870684076
chud --- 0.020228481038927222
chuds --- 0.024050645155193093
hellworld --- 0.03845439688377442
praxis --- 0.04441243122423325
virgil --- 0.05910299916998219
comrade --- 0.06876201661473547
volcel --- 0.07055586846998575
creator --- 0.07689137885955226
felix --- 0.07865962697434863
neoliberal --- 0.07906274396118222
centrists --- 0.08220656490271964
hogs --- 0.08747933681358229
reactionary --- 0.09358096760763611


Analyzing Text with Logistic Regression (stopwords)

In [16]:
X_vect = text_dataframe(X, n_features=40000)

In [17]:
logit.fit(X_vect, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [18]:
coef_dict = {}

for i in zip(np.exp(logit.coef_[0]),X_vect.columns):
    coef_dict[float(i[0])] = i[1]

In [19]:
donald_top_ten = pd.Series(np.exp(logit.coef_[0])).sort_values(ascending=False).head(15)

In [20]:
chapo_top_ten = pd.Series(np.exp(logit.coef_[0])).sort_values(ascending=True).head(15)

In [21]:
for i in donald_top_ten:
    print(i, '---', coef_dict[i])

37.47783685968139 --- pede
19.782647791562685 --- kek
17.045163424797746 --- pedes
11.190944183111428 --- seth
9.958451451189601 --- geotus
8.231329461798067 --- swamp
7.422725562394931 --- redacted
7.120373659441393 --- bigot
6.865087504485365 --- rsbn
6.775433913468901 --- cnn
6.472031307695681 --- bongino
6.344885802804554 --- msm
6.237600762235194 --- invaders
6.230367424647058 --- reeeeeee
6.064776327180565 --- fraud


In [22]:
for i in chapo_top_ten:
    print(i, '---', coef_dict[i])

0.01787706445446757 --- chapo
0.017925598970587285 --- hog
0.019751544548353858 --- chud
0.02326630226393725 --- chuds
0.04075526840626377 --- hellworld
0.0446548971525446 --- praxis
0.06209126235926216 --- virgil
0.06805762319936069 --- comrade
0.07133231125673185 --- neoliberal
0.0760120969740782 --- volcel
0.07829353870981363 --- creator
0.08057010209022371 --- felix
0.08824406940995792 --- centrists
0.08891462278970465 --- hogs
0.09107684503462962 --- tankies


---

Analyzing the most frequent words in both subreddits

In [23]:
#breaking up the combined data by subreddit for simplicity
the_donald = train_text.loc[train_text['subreddit'] == 'The_Donald']
chapo = train_text.loc[train_text['subreddit'] == 'ChapoTrapHouse']

In [24]:
snapshillbot_mask = chapo.body.map(lambda x: True if 'SnapshillBot' in x else False)
chapo = chapo.loc[~snapshillbot_mask]

In [25]:
the_donald.shape

(29499, 5)

In [26]:
chapo.shape

(28211, 5)

In [27]:
#defining a function to return a dataframe with the total count of each word in the text
def word_frequency(text, sw=None, ngrams=(1,1)):
    count = CountVectorizer(stop_words=sw, ngram_range=ngrams)
    text = count.fit_transform(text)
    sums = text.toarray().sum(axis=0)
    return pd.DataFrame([sums], columns=count.get_feature_names()).T

In [28]:
chapo_no_s_words = word_frequency(chapo.body)

In [29]:
chapo_no_s_words.sort_values(0, ascending=False).head(10)

Unnamed: 0,0
the,26917
to,15976
and,13198
of,12916
that,10253
is,10090
it,9846
you,8264
in,8086
for,5592


In [30]:
chapo_s_words = word_frequency(chapo['body'], sw='english')

In [31]:
chapo_s_words.sort_values(0, ascending=False).head(10)

Unnamed: 0,0
like,3507
people,3240
just,3131
don,1978
think,1658
good,1439
right,1293
shit,1247
really,1155
know,1150


In [32]:
donald_no_s_words = word_frequency(the_donald['body'])

In [33]:
donald_no_s_words.sort_values(0, ascending=False).head(10)

Unnamed: 0,0
the,27364
to,16895
and,14035
of,11212
is,10389
it,10050
that,9504
in,8481
you,7294
they,7172


In [34]:
donald_s_words = word_frequency(the_donald['body'], sw='english')

In [35]:
donald_s_words.sort_values(0, ascending=False).head(10)

Unnamed: 0,0
like,2940
just,2832
people,2525
don,2141
trump,1824
think,1352
know,1302
time,1159
right,1119
good,1043


Calculating the number of unique words in the top 1,000 words for each subreddit

In [36]:
# the 1000 most common words in the donald
donald_top_1000 = list(donald_s_words.sort_values(0, ascending=False).head(1000).index)

In [37]:
# the 1000 most common words in chapo trap house
chapo_top_1000 = list(chapo_s_words.sort_values(0, ascending=False).head(1000).index)

In [38]:
# finding the number of these words that are unique to the donald
donald_specific = []
for word in donald_top_1000:
    if word not in chapo_top_1000:
        donald_specific.append(word)

In [39]:
len(donald_specific)

245

In [40]:
# finding the number of these words that are unique to the chapo trap house (this should be the same)
donald_specific = []
chapo_specific = []
for word in chapo_top_1000:
    if word not in donald_top_1000:
        chapo_specific.append(word)

In [41]:
len(chapo_specific)

245

For reference, repeating the process with the Zelda and Bitcoin comments to compare unique word counts

In [42]:
combined_data = pd.read_csv('./datasets/zelda-bitcoin.csv', index_col=0)
combined_data.dropna(inplace=True)
combined_data.shape

(13159, 6)

In [43]:
zelda = combined_data.loc[combined_data['subreddit'] == 'zelda']

In [44]:
zelda.shape

(4420, 6)

In [45]:
bitcoin = combined_data.loc[combined_data['subreddit'] == 'Bitcoin']

In [46]:
bitcoin.shape

(8739, 6)

In [47]:
zelda_words = word_frequency(zelda['title'], sw='english')
bitcoin_words = word_frequency(bitcoin.title, sw='english')

In [48]:
zelda_top_1000 = list(zelda_words.sort_values(0, ascending=False).head(1000).index)
bitcoin_top_1000 = list(bitcoin_words.sort_values(0, ascending=False).head(1000).index)

In [49]:
zelda_specific = []
for word in zelda_top_1000:
    if word not in bitcoin_top_1000:
        zelda_specific.append(word)

In [50]:
len(zelda_specific)

683

In [51]:
bitcoin_specific = []
for word in bitcoin_top_1000:
    if word not in zelda_top_1000:
        bitcoin_specific.append(word)

In [52]:
len(bitcoin_specific)

683

Significantly more unique words for the Zelda and Bitcoin subreddits, this makes sense and helps to explain why the classification of these subreddits had a significantly higher accuracy. 

---

In [53]:
# defining a function that returns the shared words out of the n most common words
def common_words(num):
    donald = list(donald_s_words.sort_values(0, ascending=False).head(num).index)
    chapo = list(chapo_s_words.sort_values(0, ascending=False).head(num).index)

    common = []
    for word in donald:
        if word in chapo:
            common.append(word)
            
    return common

In [54]:
# defining a function that returns the unique words for each subreddit out of the n most common words
def seperate_words(text_1, text_2, num):
    text_1 = list(text_1.sort_values(0, ascending=False).head(num).index)
    text_2 = list(text_2.sort_values(0, ascending=False).head(num).index)

    text_1_list = []
    for word in text_1:
        if word not in text_2:
            text_1_list.append(word)
            
    text_2_list = []
    for word in text_2:
        if word not in text_1:
            text_2_list.append(word)
            
    return text_1_list, text_2_list

---

Top phrases in each subreddit

In [55]:
donald_bigrams = word_frequency(the_donald['body'], ngrams=(2,2))
chapo_bigrams = word_frequency(chapo.body, ngrams=(2,2))

unique_donald_bigrams, unique_chapo_bigrams = seperate_words(donald_bigrams, chapo_bigrams, 1000)

In [56]:
unique_donald_bigrams[0:15]

['trump is',
 'president trump',
 'the president',
 'the wall',
 'fake news',
 'if if',
 'the law',
 'to win',
 'the border',
 'orange man',
 'our country',
 'we will',
 'they need',
 'did not',
 'allowed to']

In [57]:
unique_chapo_bigrams[0:15]

['right wing',
 'working class',
 'go on',
 'was created',
 'created by',
 'is pretty',
 'holy shit',
 'this message',
 'by bot',
 'message was',
 'contact creator',
 'bot contact',
 'your hog',
 'pretty much',
 'on chapo']

In [61]:
donald_trigrams = word_frequency(the_donald['body'], ngrams=(3,3))

In [62]:
chapo_trigrams = word_frequency(chapo.body, ngrams=(3,3))

In [63]:
unique_donald_trigrams, unique_chapo_trigrams = seperate_words(donald_trigrams, chapo_trigrams, 1000)

In [64]:
unique_donald_trigrams[:15]

['if if if',
 'we have the',
 'orange man bad',
 'the white house',
 'the deep state',
 'in the house',
 'build the wall',
 'have the best',
 'and they are',
 'the love of',
 'for the love',
 'all over the',
 'we don need',
 'love of god',
 'women and children']

In [65]:
unique_chapo_trigrams[:15]

['was created by',
 'message was created',
 'bot contact creator',
 'created by bot',
 'this message was',
 'by bot contact',
 'go on chapo',
 'the working class',
 'to own the',
 'the volcel police',
 'post your hog',
 'the guy who',
 'show us your',
 'you re just',
 'no nut november']

In [66]:
# looking at how the phrase orange man bad is used
orange_man_mask = the_donald.body.map(lambda x: True if 'orange man bad' in x else False)
the_donald.loc[orange_man_mask, :].head()

Unnamed: 0,author,body,date,score,subreddit
175,BiglyPepe,wow you should run for president seems like you have it all figured out orange man bad,2018-11-30,1,The_Donald
182,DoYouBelieveInMAGA,x b orange man bad x b,2018-11-30,1,The_Donald
745,CaptainFrosty88,the year is general obama now a genetically modified robot and colonel clinton have just entered the th year of the great orange man bad conflict the racist pig general trump has lead the right to march upon our lands california is the last remaining state our munitions are down to musket per men we have to share as we outlawed every gun except muskets to be safe meanwhile the right are cheating and using full autos the bastards i fear this day will be my last,2018-11-29,1,The_Donald
2009,gordo7054,while downing s discussions with the president s team violated no laws it could possibly help trump which should be illegal because orange man bad,2018-11-28,1,The_Donald
2169,cl1ft,orange man bad someone compliment orange man they re bad compliment must be false bad,2018-11-28,1,The_Donald


Modeling with custom stop words

In [67]:
common_10 = common_words(10)

In [68]:
stop_words = text.ENGLISH_STOP_WORDS.union(common_10)

In [69]:
X = train_text.body
y = train_text.subreddit.map(lambda x: 1 if x == 'The_Donald' else 0)

In [70]:
count = CountVectorizer(stop_words=common_10)
nb = MultinomialNB()

In [71]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .1, stratify = y, random_state=42)

In [72]:
X_train = count.fit_transform(X_train)
X_test = count.transform(X_test)

In [73]:
X_train.shape

(51939, 39942)

In [74]:
nb.fit(X_train, y_train)
nb.score(X_train, y_train)

0.8165155278307245

In [75]:
nb.score(X_test, y_test)

0.7336683417085427

No significant improvement 

In [76]:
common_100 = common_words(100)

In [77]:
stop_words = text.ENGLISH_STOP_WORDS.union(common_100)

In [78]:
X = train_text.body
y = train_text.subreddit.map(lambda x: 1 if x == 'The_Donald' else 0)

In [79]:
count = CountVectorizer(stop_words=common_100)
nb = MultinomialNB()

In [80]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .1, stratify = y, random_state=42)

In [81]:
X_train = count.fit_transform(X_train)
X_test = count.transform(X_test)

In [82]:
X_train.shape

(51939, 39874)

In [83]:
nb.fit(X_train, y_train)
nb.score(X_train, y_train)

0.8158031536995322

In [84]:
nb.score(X_test, y_test)

0.7282966556922543

Slight Decrease in Performance - Removing common words does not seem to improve the score