## Modeling

In [55]:
# imports
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import (train_test_split, 
                                     cross_val_score)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import (RandomForestClassifier, 
                                BaggingClassifier)
from sklearn.model_selection import StratifiedKFold

from nltk.stem.porter import *
from  nltk.stem import PorterStemmer




In [56]:
def score(model, name, X, y):
    cv = StratifiedKFold(n_splits=10,
                         random_state=73,
                         shuffle=True)
    s = cross_val_score(model, X, y, cv=cv,
                        n_jobs=-1)
    print('{} Score: {:.2f} +- {:.3f}'.format(name, 
                                              s.mean(), 
                                              2 * s.std()))

### Set up

In [57]:
# load data
reddits = pd.read_csv('../data/reddit_clean.csv')

In [58]:
# create target
reddits['target'] = reddits.num_comments.\
    apply(lambda x: 0 if x < reddits.num_comments.median() else 1)

In [59]:
# baseline 0.5085933045532296
reddits.target.value_counts(normalize=True)[1]

0.5085933045532296

In [60]:
# instantiate vectorizer
tvec = TfidfVectorizer(stop_words = 'english',
                       strip_accents = 'ascii',
                       max_features = 200,
                       max_df = .60,
                       norm = 'l2'
                      )
tvec.fit(reddits.title)

TfidfVectorizer(max_df=0.6, max_features=200, stop_words='english',
                strip_accents='ascii')

In [61]:
# create vectorized dataframe
reddits_tvec = pd.DataFrame(tvec.transform(reddits.title).todense(),
                  columns = tvec.get_feature_names())

In [62]:
# concat vectorized and reddit dataframes
reddits_vect = pd.concat([reddits,reddits_tvec],axis=1)

In [63]:
# instatiate X and y
X = reddits_vect[['upvote_ratio', 'length_time','score',
        'num_words', 'title_length','depp','ukraine','school','police','gun',
        'amber','pride','man','2022','gun', 'new','year','years',
        'today','season','russian','hate','spoilers','free','buy',
        'actually','work','life','fuck','shares','dog']]

y = reddits_vect.target.values

In [64]:
# set up train test split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                   test_size=.2,
                                                   random_state=73)

In [65]:
# standard scale the data
ss = StandardScaler()
ss.fit(X_train, y_train)
X_train_scaled = ss.transform(X_train)
X_test_scaled = ss.transform(X_test)

### Logistic regression

In [66]:
lr = LogisticRegression(solver='liblinear',
                       C=100,
                       penalty='l2',
                       random_state=73
                       )

lr.fit(X_train_scaled, y_train)

LogisticRegression(C=100, random_state=73, solver='liblinear')

In [67]:
# score X_train
score(lr, 'Logistic regression', X_train_scaled, y_train)

Logistic regression Score: 0.72 +- 0.031


In [68]:
# score X_test
score(lr, 'Logistic regression', X_test_scaled, y_test)

Logistic regression Score: 0.71 +- 0.053


### Random forest

In [69]:
# instantiate and fit
rf = RandomForestClassifier(n_jobs=-1,
                            random_state=73,
                            n_estimators=200,
                            min_samples_leaf=5,
                            warm_start=True)
rf.fit(X_train, y_train)

RandomForestClassifier(min_samples_leaf=5, n_estimators=200, n_jobs=-1,
                       random_state=73, warm_start=True)

In [70]:
# score the X_train
score(rf, 'Random forest', X_train, y_train)

Random forest Score: 0.74 +- 0.032


In [71]:
# score the X_test
score(rf, 'Random forest', X_test, y_test)

Random forest Score: 0.71 +- 0.046


### Bagging

In [72]:
# instantiate 
dt = DecisionTreeClassifier(random_state=73,
                            class_weight='balanced',)
bdt = BaggingClassifier(dt,
                        random_state=73,
                        n_jobs=-1,
                        n_estimators=200)

bdt.fit(X_train, y_train)

In [74]:
# score X_train
score(bdt, 'Bagging', X_train, y_train)

Bagging Score: 0.72 +- 0.028


In [54]:
# score X_test
score(bdt, 'Bagging', X_test, y_test)

Bagging Score: 0.71 +- 0.037


### Stemmer

In [75]:
stemmer = PorterStemmer()

In [82]:
reddits_stem = reddits['title'].apply(lambda x: stemmer.stem(x))


In [83]:
reddits_stem.head()

0        banned from my street on friday is this allow
1    rift in this weeks iron banner is inherently n...
2               nadal5 defeats djokovic 1 62 46 62 764
3                                   what would you say
4                                             mosh pit
Name: title, dtype: object

In [81]:
from nltk.stem import WordNetLemmatizer
lem = WordNetLemmatizer()

In [84]:
reddits.title.apply(lambda x: lem.lemmatize(x))

0          Banned from my street on Friday Is this allowed
1        Rift in this weeks Iron Banner is inherently n...
2                   Nadal5 defeats Djokovic 1 62 46 62 764
3                                       What would you say
4                                                 Mosh pit
                               ...                        
12272                                                  100
12273                            Still Early But Not Wrong
12274          WuTang is for the children its for Apes too
12275    Made an alternate version of this meme templat...
12276    Keep digging hedgies U R FUK BUY HODL DRS Lock...
Name: title, Length: 12277, dtype: object

In [None]:
reddits_greater = reddits_vect[reddits_vect.target == 1].drop(['author', 
            'title','created', 'subreddit', 'num_comments',
            'score', 'upvote_ratio', 'length_time',
            'num_words', 'title_length', 'target'], axis=1).sum().to_frame('count1')

In [None]:
reddits_lesser = reddits_vect[reddits_vect.target == 0].drop(['author', 
            'title','created', 'subreddit', 'num_comments',
            'score', 'upvote_ratio', 'length_time',
            'num_words', 'title_length', 'target'], axis=1).sum().to_frame('count0')

In [None]:
test = pd.concat((reddits_greater, reddits_lesser), axis=1)

In [None]:
# test.sort_values('count0', ascending = False)

test['diff'] = test.count1 - test.count0

In [None]:
test.sort_values('diff', ascending=False).head(50)

In [None]:
test.to_csv('../data/test.csv')