In [109]:
import requests, json, time, re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import pickle
from sklearn.metrics import confusion_matrix

In [9]:
reddit = pd.read_csv('./datasets/reddit_4.csv')

In [13]:
X = reddit.drop('is_news', axis=1)
y = reddit['is_news']

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=24)

In [14]:
pickled = open('mnb_tfidf_gs_pkl', 'rb')

In [15]:
gs = pickle.load(pickled)

In [17]:
gs.best_params_

{'mnb__alpha': 0.1,
 'tfidf__max_df': 0.75,
 'tfidf__ngram_range': (1, 2),
 'tfidf__norm': 'l2',
 'tfidf__stop_words': None}

In [23]:
gs.best_estimator_

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.75, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('mnb',
                 MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True))],
         verbose=False)

In [24]:
mnb = MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [27]:
tfidf = TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.75, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)

In [30]:
train_raw = tfidf.fit_transform(X_train['text'])

In [31]:
train_df = pd.SparseDataFrame(train_raw, columns=tfidf.get_feature_names())

In [32]:
train_df.fillna(0, inplace=True)

In [33]:
train_df.isnull().sum().sum()

0

In [34]:
test_raw = tfidf.transform(X_test['text'])
test_df = pd.SparseDataFrame(test_raw, columns=tfidf.get_feature_names())
test_df.fillna(0, inplace=True)
test_df.head()

Unnamed: 0,000,000 000,000 americans,000 baby,000 brown,000 burning,000 cases,000 child,000 children,000 criminal,...,zoologists,zoologists thrilled,zuckerberg,zuckerberg has,zuckerberg lost,zuckerberg of,zuckerberg prepares,zuckerberg touts,zumtrel,zumtrel flooby
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
mnb.fit(train_df, y_train)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [39]:
mnb.score(train_df, y_train)

1.0

In [40]:
mnb.score(test_df, y_test)

0.8492849284928493

In [58]:
len(np.transpose(mnb.coef_))

29416

In [90]:
feat_importance = pd.DataFrame(np.transpose(mnb.coef_), train_df.columns, columns=['is_news'])

In [108]:
feat_importance.sort_values(by='is_news', ascending=True).head(10000)

Unnamed: 0,is_news
zumtrel flooby,-11.15327
spectacular,-11.15327
spectacular news,-11.15327
speculate,-11.15327
in rpg,-11.15327
in rowboat,-11.15327
in road,-11.15327
special meetings,-11.15327
in ringing,-11.15327
in rich,-11.15327


In [97]:
feat_importance['is_news'].sort_values(ascending=True).keys

<bound method Series.keys of zumtrel flooby     -11.153270
spectacular        -11.153270
spectacular news   -11.153270
speculate          -11.153270
in rpg             -11.153270
in rowboat         -11.153270
in road            -11.153270
special meetings   -11.153270
in ringing         -11.153270
in rich            -11.153270
in revealing       -11.153270
in retrospect      -11.153270
speculate about    -11.153270
speech is          -11.153270
in real            -11.153270
in ring            -11.153270
speech oprah       -11.153270
in same            -11.153270
special flights    -11.153270
in solidarity      -11.153270
in solely          -11.153270
in solar           -11.153270
in smaller         -11.153270
spazio sells       -11.153270
in sioux           -11.153270
special guest      -11.153270
speak to           -11.153270
speakers white     -11.153270
in septic          -11.153270
in search          -11.153270
                      ...    
marijuana           -7.507146
woman      

In [121]:
y_pred = mnb.predict(test_df)

In [122]:
cm = confusion_matrix(y_test, y_pred)

In [125]:
cm_df = pd.DataFrame(cm, columns=['pred r/TheOnion', 'pred r/news'], index=['actual r/TheOnion', 'actual r/news'])

In [126]:
cm_df

Unnamed: 0,pred r/TheOnion,pred r/news
actual r/TheOnion,475,59
actual r/news,78,297


In [136]:
cvec = CountVectorizer()

In [149]:
train_raw = cvec.fit_transform(X_train['text']).toarray()

In [141]:
train_df = pd.SparseDataFrame(train_raw, columns=cvec.get_feature_names())

In [145]:
train_df.fillna(0, inplace=True)

In [151]:
train_df = pd.DataFrame(train_raw, columns=cvec.get_feature_names())

In [165]:
train_df.sum().sort_values(ascending=False)

to            868
of            628
in            537
the           437
for           370
on            258
after         213
and           210
with          192
man           160
that          160
at            158
is            154
by            153
he            152
from          142
who           141
has           126
it            126
new           122
this          120
as            108
his           104
trump         103
be            101
out            90
just           88
year           82
have           80
up             76
             ... 
migrating       1
milano          1
miles           1
miley           1
milkshake       1
mill            1
millwall        1
milo            1
minaj           1
mets            1
metroid         1
metric          1
merit           1
melting         1
membership      1
meme            1
memorable       1
memorials       1
memories        1
mercifully      1
merkel          1
method          1
mesa            1
mess            1
messages  