# Subreddit Classification - Preprocessing and EDA

### Importing necessary libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import time
import warnings
import regex as re

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
warnings.filterwarnings('ignore')
np.random.seed(824)
from bs4 import BeautifulSoup 
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB

## Grab cleaned dataframe

In [2]:
df = pd.read_csv('../datasets/preprocessed_data.csv')

In [3]:
df.drop(columns = 'Unnamed: 0', inplace = True)

In [4]:
df.head(2)

Unnamed: 0,title,selftext,subreddit,title_char_count,title_word_count,selftext_char_count,selftext_word_count,clean_title,clean_selftext,clean_title_word_count,clean_selftext_word_count,all_content,clean_content_word_count
0,7 rules for surviving The Crack,"“We have to rewrite it, there is no other poss...",1,31,6,6003,1138,rules surviving crack,rewrite possibility rewrite understand rewriti...,3,451,rules surviving crack rewrite possibility rewr...,1138
1,I was followed home one night and got a tape t...,It was a warm night in a town just outside of ...,1,61,13,2471,505,followed home night got tape morning,warm night town just outside london ontario co...,6,193,followed home night got tape morning warm nigh...,505


In [5]:
df.isnull().sum()

title                          7
selftext                       6
subreddit                      0
title_char_count               0
title_word_count               0
selftext_char_count            0
selftext_word_count            0
clean_title                  274
clean_selftext                42
clean_title_word_count         0
clean_selftext_word_count      0
all_content                    0
clean_content_word_count       0
dtype: int64

In [6]:
df = df.dropna()

In [7]:
df.isnull().sum()

title                        0
selftext                     0
subreddit                    0
title_char_count             0
title_word_count             0
selftext_char_count          0
selftext_word_count          0
clean_title                  0
clean_selftext               0
clean_title_word_count       0
clean_selftext_word_count    0
all_content                  0
clean_content_word_count     0
dtype: int64

## Calculate some quick counts of total characters and total words

# Single Word Trends

In [8]:
target = df['subreddit']

In [9]:
target.shape

(19691,)

In [10]:
cvec = CountVectorizer(stop_words = 'english', min_df=2, max_df=1.0, ngram_range=(1, 1))

In [11]:
term_mat = cvec.fit_transform(df['clean_title'])

In [12]:
len(cvec.get_feature_names())

4703

In [13]:
#cvec.get_feature_names()

['abandon',
 'abandoned',
 'abattoir',
 'abducted',
 'abduction',
 'ability',
 'able',
 'abomination',
 'abortion',
 'absolutely',
 'abuse',
 'abyss',
 'accept',
 'access',
 'accessing',
 'accident',
 'accidental',
 'accidentally',
 'account',
 'accounts',
 'accurate',
 'accused',
 'achieved',
 'acid',
 'acre',
 'act',
 'acting',
 'actions',
 'active',
 'activity',
 'actors',
 'actress',
 'actual',
 'actually',
 'ad',
 'add',
 'addict',
 'addicted',
 'addiction',
 'addictive',
 'address',
 'adirondacks',
 'admiral',
 'admit',
 'adopted',
 'adrenaline',
 'adult',
 'adults',
 'adventure',
 'adventures',
 'advice',
 'advised',
 'af',
 'affect',
 'afghanistan',
 'aficionados',
 'afraid',
 'africa',
 'afterlife',
 'aftermath',
 'afternoon',
 'age',
 'aged',
 'agency',
 'agenda',
 'agent',
 'ages',
 'aggressive',
 'ago',
 'agony',
 'agoraphobic',
 'agreed',
 'ai',
 'aicha',
 'aid',
 'ain',
 'air',
 'airbnb',
 'airways',
 'aita',
 'aka',
 'alarm',
 'alaskan',
 'albert',
 'album',
 'alert',
 '

In [14]:
term_df = pd.DataFrame(term_mat.toarray(), columns=cvec.get_feature_names())

In [15]:
term_df.head()

Unnamed: 0,abandon,abandoned,abattoir,abducted,abduction,ability,able,abomination,abortion,absolutely,...,yowie,yrs,zero,zombie,zombies,zone,zoo,zoom,zozo,zwooorp
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### We'll insert the target back in..

In [16]:
term_df.insert(0, 'my_subreddits', target)

In [17]:
term_df.head()

Unnamed: 0,my_subreddits,abandon,abandoned,abattoir,abducted,abduction,ability,able,abomination,abortion,...,yowie,yrs,zero,zombie,zombies,zone,zoo,zoom,zozo,zwooorp
0,1.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Let's look at some distributions...

We can use `groupby` to get some aggregates over our classes. `sum` will give us the total time a word occurs in a class, `mean` will give us the average.

In [18]:
term_df.groupby('my_subreddits').mean()

Unnamed: 0_level_0,abandon,abandoned,abattoir,abducted,abduction,ability,able,abomination,abortion,absolutely,...,yowie,yrs,zero,zombie,zombies,zone,zoo,zoom,zozo,zwooorp
my_subreddits,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,0.000315,0.004098,0.0,0.000315,0.000525,0.000105,0.000525,0.0,0.0,0.00042,...,0.0,0.00021,0.0,0.000315,0.00021,0.000105,0.00021,0.000105,0.000105,0.0
1.0,0.000101,0.003748,0.000405,0.000101,0.000203,0.000101,0.000203,0.000101,0.000203,0.000203,...,0.000304,0.000101,0.000304,0.000405,0.000101,0.000101,0.000709,0.000203,0.000101,0.000203


In [19]:
term_df.groupby('my_subreddits').sum()

Unnamed: 0_level_0,abandon,abandoned,abattoir,abducted,abduction,ability,able,abomination,abortion,absolutely,...,yowie,yrs,zero,zombie,zombies,zone,zoo,zoom,zozo,zwooorp
my_subreddits,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,3,39,0,3,5,1,5,0,0,4,...,0,2,0,3,2,1,2,1,1,0
1.0,1,37,4,1,2,1,2,1,2,2,...,3,1,3,4,1,1,7,2,1,2


In [20]:

#The code for frequency. I have a column in “term_df”
## Machine Learning frequency of "data"
#term_df[term_df['Subreddit']==0]['WORD IM INTERESTED IN'].value_counts().to_dict() 

By transposing the outputing and sorting by one of our groups, we can see what our most prevalent words are.

In [21]:
term_df.groupby('my_subreddits').mean().T.sort_values(1, ascending=False).head(20)


my_subreddits,0.0,1.0
think,0.021017,0.032212
night,0.029844,0.028768
house,0.038462,0.028059
story,0.126103,0.025527
rules,0.001261,0.024615
know,0.01797,0.022387
just,0.013556,0.020158
man,0.035519,0.01874
room,0.014502,0.017524
strange,0.009983,0.017423


In [22]:
term_df.groupby('my_subreddits').mean().T.sort_values(1, ascending=False).head(20)


my_subreddits,0.0,1.0
think,0.021017,0.032212
night,0.029844,0.028768
house,0.038462,0.028059
story,0.126103,0.025527
rules,0.001261,0.024615
know,0.01797,0.022387
just,0.013556,0.020158
man,0.035519,0.01874
room,0.014502,0.017524
strange,0.009983,0.017423


We can grab the list of the top 50 terms in each of our classes.

In [23]:
top_words_scarystories = list(term_df.groupby('my_subreddits').
     mean().T.sort_values(0, ascending=False).head(25).index)

top_words_nosleep = list(term_df.groupby('my_subreddits').
     mean().T.sort_values(1, ascending=False).head(25).index)

In [24]:
top_words_overlap = [nosleep for nosleep in top_words_nosleep if nosleep in top_words_scarystories]
top_words_overlap

['think',
 'night',
 'house',
 'story',
 'know',
 'just',
 'man',
 'room',
 'sleep',
 'friend',
 'saw',
 'weird']

In [25]:
from scipy.stats import ttest_ind

ttest_dict = {}
for word in top_words_overlap:
    ttest_dict[word] = ttest_ind(term_df[term_df['my_subreddits']==1][word], 
         term_df[term_df['my_subreddits']==0][word])

In [26]:
ttest_dict

{'think': Ttest_indResult(statistic=4.806622669215303, pvalue=1.5465134427985994e-06),
 'night': Ttest_indResult(statistic=-0.4371444428386487, pvalue=0.6620114673259707),
 'house': Ttest_indResult(statistic=-4.0003847974145454, pvalue=6.347429792105374e-05),
 'story': Ttest_indResult(statistic=-26.78679395124042, pvalue=3.0577244246769314e-155),
 'know': Ttest_indResult(statistic=2.1350806161338487, pvalue=0.03276692560864695),
 'just': Ttest_indResult(statistic=3.499835876328026, pvalue=0.0004665897131896592),
 'man': Ttest_indResult(statistic=-7.204372993495433, pvalue=6.044926817317988e-13),
 'room': Ttest_indResult(statistic=1.6479395019116825, pvalue=0.09938129192042443),
 'sleep': Ttest_indResult(statistic=0.07973445653659228, pvalue=0.9364492773457922),
 'friend': Ttest_indResult(statistic=0.7254275333371751, pvalue=0.468198543491948),
 'saw': Ttest_indResult(statistic=-1.0284982719559945, pvalue=0.3037283352828392),
 'weird': Ttest_indResult(statistic=-1.2287623719231209, pval

In [27]:
overlap_stats = pd.DataFrame.from_dict(ttest_dict).T

In [28]:
overlap_stats.rename(columns={0: "Statistic", 1: "p-value"}, inplace = True)

In [29]:
overlap_stats

Unnamed: 0,Statistic,p-value
think,4.806623,1.546513e-06
night,-0.437144,0.6620115
house,-4.000385,6.34743e-05
story,-26.786794,3.0577239999999998e-155
know,2.135081,0.03276693
just,3.499836,0.0004665897
man,-7.204373,6.044927e-13
room,1.64794,0.09938129
sleep,0.079734,0.9364493
friend,0.725428,0.4681985


In [30]:
term_df.groupby(by='my_subreddits').sum()[top_words_overlap].T

my_subreddits,0.0,1.0
think,200,318
night,284,284
house,366,277
story,1200,252
know,171,221
just,129,199
man,338,185
room,138,173
sleep,149,156
friend,137,155


In [31]:
title_single = term_df.groupby(by='my_subreddits').mean()[top_words_overlap].T
title_single.rename(columns = {0 : 'scarystories', 1 : 'nosleep'}, inplace = True)
title_single.to_csv('../datasets/title_top_single.csv')

In [32]:
overlap_stats.to_csv('../datasets/title_single_overlap_stats.csv')

In [33]:
# Overlap words that are NOT statistically significant
insig = overlap_stats[(overlap_stats['p-value'] > 0.05)]
insig.head(60)

Unnamed: 0,Statistic,p-value
night,-0.437144,0.662011
room,1.64794,0.099381
sleep,0.079734,0.936449
friend,0.725428,0.468199
saw,-1.028498,0.303728
weird,-1.228762,0.219176


In [34]:
# Overlap words that ARE statistically significant
signif = overlap_stats[(overlap_stats['p-value'] < 0.05)]
signif.head(60)

Unnamed: 0,Statistic,p-value
think,4.806623,1.546513e-06
house,-4.000385,6.34743e-05
story,-26.786794,3.0577239999999998e-155
know,2.135081,0.03276693
just,3.499836,0.0004665897
man,-7.204373,6.044927e-13


# Paired Word Trends

In [35]:
cvec = CountVectorizer(stop_words = 'english', min_df=2, max_df=1.0, ngram_range=(2, 2))

In [36]:
term_mat = cvec.fit_transform(df['clean_title'])

In [37]:
len(cvec.get_feature_names())

6213

In [38]:
# cvec.get_feature_names()

In [39]:
term_df = pd.DataFrame(term_mat.toarray(), columns=cvec.get_feature_names())

In [40]:
term_df.head()

Unnamed: 0,abandoned beach,abandoned building,abandoned camp,abandoned church,abandoned county,abandoned farm,abandoned hospital,abandoned house,abandoned howling,abandoned school,...,yokai jokes,young girl,youtube channel,youtube horror,youtube scary,youtube short,youtube video,yowie story,yrs old,zoo night
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### We'll insert the target back in..

In [41]:
term_df.insert(0, 'my_subreddits', target)

### Let's look at some distributions...

We can use `groupby` to get some aggregates over our classes. `sum` will give us the total time a word occurs in a class, `mean` will give us the average.

In [42]:
term_df.groupby('my_subreddits').mean()

Unnamed: 0_level_0,abandoned beach,abandoned building,abandoned camp,abandoned church,abandoned county,abandoned farm,abandoned hospital,abandoned house,abandoned howling,abandoned school,...,yokai jokes,young girl,youtube channel,youtube horror,youtube scary,youtube short,youtube video,yowie story,yrs old,zoo night
my_subreddits,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,0.0,0.000315,0.0,0.00021,0.000315,0.000315,0.000105,0.000946,0.00021,0.000315,...,0.000105,0.000105,0.003573,0.000315,0.000315,0.00021,0.000525,0.0,0.000105,0.0
1.0,0.000203,0.000101,0.000203,0.0,0.0,0.0,0.000304,0.000608,0.000304,0.0,...,0.000101,0.000304,0.000405,0.0,0.0,0.0,0.0,0.000304,0.000101,0.000203


In [43]:
term_df.groupby('my_subreddits').sum()

Unnamed: 0_level_0,abandoned beach,abandoned building,abandoned camp,abandoned church,abandoned county,abandoned farm,abandoned hospital,abandoned house,abandoned howling,abandoned school,...,yokai jokes,young girl,youtube channel,youtube horror,youtube scary,youtube short,youtube video,yowie story,yrs old,zoo night
my_subreddits,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,0,3,0,2,3,3,1,9,2,3,...,1,1,34,3,3,2,5,0,1,0
1.0,2,1,2,0,0,0,3,6,3,0,...,1,3,4,0,0,0,0,3,1,2


In [44]:
term_df.groupby('my_subreddits').mean().T.sort_values(0, ascending=False).head(20)


my_subreddits,0.0,1.0
true story,0.037831,0.002532
scary story,0.021438,0.001013
scary stories,0.016078,0.001418
sleep paralysis,0.009878,0.003748
horror story,0.009668,0.002431
haunted house,0.006515,0.00081
horror stories,0.005359,0.000608
true scary,0.004729,0.000405
know scary,0.004098,0.0
scary true,0.003573,0.0


In [45]:
term_df.groupby('my_subreddits').mean().T.sort_values(1, ascending=False).head(20)

my_subreddits,0.0,1.0
set rules,0.0,0.004862
list rules,0.000105,0.004457
years ago,0.001681,0.004153
sleep paralysis,0.009878,0.003748
amusement park,0.000105,0.003241
serial killer,0.001261,0.003039
best friend,0.001786,0.002938
security guard,0.00021,0.002938
need help,0.002732,0.002836
got job,0.0,0.002735


We can grab the list of the top 50 terms in each of our classes.

In [46]:
top_words_scarystories = list(term_df.groupby('my_subreddits').
     mean().T.sort_values(0, ascending=False).head(60).index)

top_words_nosleep = list(term_df.groupby('my_subreddits').
     mean().T.sort_values(1, ascending=False).head(60).index)

In [47]:
top_words_overlap = [nosleep for nosleep in top_words_nosleep if nosleep in top_words_scarystories]
top_words_overlap

['years ago',
 'sleep paralysis',
 'serial killer',
 'best friend',
 'need help',
 'true story',
 'horror story',
 'short story',
 'ouija board',
 'night shift',
 'scary stories']

In [48]:
ttest_dict = {}
for word in top_words_overlap:
    ttest_dict[word] = ttest_ind(term_df[term_df['my_subreddits']==1][word], 
         term_df[term_df['my_subreddits']==0][word])

In [49]:
overlap_stats = pd.DataFrame.from_dict(ttest_dict).T

In [50]:
overlap_stats.rename(columns={0: "Statistic", 1: "p-value"}, inplace = True)

In [51]:
term_df.groupby(by='my_subreddits').mean()[top_words_overlap].T


my_subreddits,0.0,1.0
years ago,0.001681,0.004153
sleep paralysis,0.009878,0.003748
serial killer,0.001261,0.003039
best friend,0.001786,0.002938
need help,0.002732,0.002836
true story,0.037831,0.002532
horror story,0.009668,0.002431
short story,0.001997,0.001925
ouija board,0.001576,0.001925
night shift,0.001997,0.001621


In [52]:
title_twoword = term_df.groupby(by='my_subreddits').mean()[top_words_overlap].T
title_twoword.rename(columns = {0 : 'scarystories', 1 : 'nosleep'}, inplace = True)
title_twoword.to_csv('../datasets/title_top_twoword.csv')

In [53]:
overlap_stats.to_csv('../datasets/title_twoword_overlap_stats.csv')

In [54]:
# Overlap words that ARE statistically significant
insig = overlap_stats[(overlap_stats['p-value'] < 0.05)]
insig.head(60)

Unnamed: 0,Statistic,p-value
years ago,3.178576,0.00148233
sleep paralysis,-5.212156,1.885718e-07
serial killer,2.662144,0.007770874
true story,-17.753837,5.7637829999999997e-70
horror story,-6.538935,6.35034e-11
scary stories,-11.010546,4.118309e-28


In [55]:
# Overlap words that are NOT statistically significant
insig = overlap_stats[(overlap_stats['p-value'] > 0.05)]
insig.head(60)

Unnamed: 0,Statistic,p-value
best friend,1.64703,0.099568
need help,0.134958,0.892646
short story,-0.113315,0.909782
ouija board,0.579508,0.562253
night shift,-0.616358,0.537666


# Three Word Trends

In [56]:
cvec = CountVectorizer(stop_words = 'english', min_df=2, max_df=1.0, ngram_range=(3, 3))

In [57]:
term_mat = cvec.fit_transform(df['clean_title'])

In [58]:
len(cvec.get_feature_names())

3114

In [59]:
# cvec.get_feature_names()

In [60]:
term_df = pd.DataFrame(term_mat.toarray(), columns=cvec.get_feature_names())

In [61]:
term_df.head()

Unnamed: 0,abandoned county hospital,abandoned hospital woods,abandoned house pear,abandoned howling village,abandoned shopping mall,abandoned son hasn,abandoned stroller rolled,access american psychiatric,accident occurred school,accidentally created worse,...,years tried escape,yellow door true,yggdrasil brother journal,yggdrasil roots threaded,yo tf did,young girl caught,youtube channel narrate,youtube scary stories,yowie story aussie,zoo night animal
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### We'll insert the target back in..

In [62]:
term_df.insert(0, 'my_subreddits', target)

### Let's look at some distributions...

We can use `groupby` to get some aggregates over our classes. `sum` will give us the total time a word occurs in a class, `mean` will give us the average.

In [63]:
term_df.groupby('my_subreddits').mean()

Unnamed: 0_level_0,abandoned county hospital,abandoned hospital woods,abandoned house pear,abandoned howling village,abandoned shopping mall,abandoned son hasn,abandoned stroller rolled,access american psychiatric,accident occurred school,accidentally created worse,...,years tried escape,yellow door true,yggdrasil brother journal,yggdrasil roots threaded,yo tf did,young girl caught,youtube channel narrate,youtube scary stories,yowie story aussie,zoo night animal
my_subreddits,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,0.000315,0.0,0.0,0.00021,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000105,0.0,0.0,0.0,0.0,0.00021,0.00021,0.0,0.0
1.0,0.0,0.000203,0.000405,0.000304,0.000203,0.000203,0.000203,0.000203,0.000304,0.000203,...,0.000304,0.000101,0.000203,0.000304,0.000304,0.000203,0.0,0.0,0.000304,0.000203


In [64]:
term_df.groupby('my_subreddits').sum()

Unnamed: 0_level_0,abandoned county hospital,abandoned hospital woods,abandoned house pear,abandoned howling village,abandoned shopping mall,abandoned son hasn,abandoned stroller rolled,access american psychiatric,accident occurred school,accidentally created worse,...,years tried escape,yellow door true,yggdrasil brother journal,yggdrasil roots threaded,yo tf did,young girl caught,youtube channel narrate,youtube scary stories,yowie story aussie,zoo night animal
my_subreddits,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,3,0,0,2,0,0,0,0,0,0,...,0,1,0,0,0,0,2,2,0,0
1.0,0,2,4,3,2,2,2,2,3,2,...,3,1,2,3,3,2,0,0,3,2


In [65]:
term_df.groupby('my_subreddits').mean().T.sort_values(0, ascending=False).head(20)


my_subreddits,0.0,1.0
know scary story,0.003363,0.0
true scary stories,0.002732,0.000101
sentence horror story,0.001892,0.0
native scary story,0.001681,0.0
true native scary,0.001681,0.0
horror game podcast,0.001576,0.0
think house haunted,0.001366,0.0
episode horror game,0.001261,0.0
scary true story,0.001156,0.0
short horror story,0.001051,0.000101


In [66]:
term_df.groupby('my_subreddits').mean().T.sort_values(1, ascending=False).head(20)

my_subreddits,0.0,1.0
working amusement park,0.0,0.002431
different kind police,0.0,0.002127
kind police officer,0.0,0.002127
strange set rules,0.0,0.001722
budget slasher film,0.0,0.001621
film think really,0.0,0.001621
slasher film think,0.0,0.001621
really killing stars,0.0,0.001621
actress set low,0.0,0.001621
think really killing,0.0,0.001621


We can grab the list of the top 50 terms in each of our classes.

In [67]:
top_words_scarystories = list(term_df.groupby('my_subreddits').
     mean().T.sort_values(0, ascending=False).head(715).index)

top_words_nosleep = list(term_df.groupby('my_subreddits').
     mean().T.sort_values(1, ascending=False).head(715).index)

In [68]:
top_words_overlap = [nosleep for nosleep in top_words_nosleep if nosleep in top_words_scarystories]
top_words_overlap

['happened years ago',
 'woods dead body',
 'true horror story',
 'girl scared half',
 'stopped trick treating',
 'public doesn know',
 'reason stopped trick',
 'scariest thing happened',
 'inunaki mura abandoned',
 'sleep paralysis demon',
 'howling village rural',
 'village rural japan',
 'night guard local']

In [69]:
ttest_dict = {}
for word in top_words_overlap:
    ttest_dict[word] = ttest_ind(term_df[term_df['my_subreddits']==1][word], 
         term_df[term_df['my_subreddits']==0][word])

In [70]:
overlap_stats = pd.DataFrame.from_dict(ttest_dict).T

In [71]:
overlap_stats.rename(columns={0: "Statistic", 1: "p-value"}, inplace = True)

In [72]:
term_df.groupby(by='my_subreddits').mean()[top_words_overlap].T

my_subreddits,0.0,1.0
happened years ago,0.00021,0.00081
woods dead body,0.000315,0.000405
true horror story,0.000525,0.000405
girl scared half,0.000105,0.000304
stopped trick treating,0.00021,0.000304
public doesn know,0.000315,0.000304
reason stopped trick,0.00021,0.000304
scariest thing happened,0.000841,0.000304
inunaki mura abandoned,0.00021,0.000304
sleep paralysis demon,0.000315,0.000304


In [73]:
title_threeword = term_df.groupby(by='my_subreddits').mean()[top_words_overlap].T
title_threeword.rename(columns = {0 : 'scarystories', 1 : 'nosleep'}, inplace = True)
title_threeword.to_csv('../datasets/title_top_threeword.csv')

In [74]:
overlap_stats.to_csv('../datasets/title_threeword_overlap_stats.csv')

In [75]:
# Overlap words that ARE statistically significant
insig = overlap_stats[(overlap_stats['p-value'] < 0.05)]
insig.head(60)

Unnamed: 0,Statistic,p-value


In [76]:
# Overlap words that are NOT statistically significant
insig = overlap_stats[(overlap_stats['p-value'] > 0.05)]
insig.head(60)

Unnamed: 0,Statistic,p-value
happened years ago,1.840152,0.065761
woods dead body,0.329482,0.741795
true horror story,-0.388556,0.697609
girl scared half,0.963511,0.335303
stopped trick treating,0.406257,0.684558
public doesn know,-0.044989,0.964116
reason stopped trick,0.406257,0.684558
scariest thing happened,-1.569185,0.116621
inunaki mura abandoned,0.406257,0.684558
sleep paralysis demon,-0.044989,0.964116
