In [1]:
import requests, json, time, re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import pickle

In [2]:
reddit = pd.read_csv('./datasets/reddit_5.csv')

In [3]:
X = reddit.drop('is_theO', axis=1)
y = reddit['is_theO']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=24)

In [5]:
pickled = open('mnb_tfidf_gs_pkl', 'rb')

In [6]:
gs = pickle.load(pickled)

In [7]:
gs.best_params_

{'mnb__alpha': 0.1,
 'tfidf__max_df': 0.75,
 'tfidf__ngram_range': (1, 1),
 'tfidf__norm': 'l2',
 'tfidf__stop_words': None}

In [8]:
gs.best_estimator_

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.75, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('mnb',
                 MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True))],
         verbose=False)

In [9]:
mnb = MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [12]:
tfidf = TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.75, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)

In [13]:
train_raw = tfidf.fit_transform(X_train['text'])

In [14]:
train_df = pd.SparseDataFrame(train_raw, columns=tfidf.get_feature_names())

In [15]:
train_df.fillna(0, inplace=True)

In [16]:
train_df.isnull().sum().sum()

0

In [17]:
test_raw = tfidf.transform(X_test['text'])
test_df = pd.SparseDataFrame(test_raw, columns=tfidf.get_feature_names())
test_df.fillna(0, inplace=True)
test_df.head()

Unnamed: 0,000,000 000,000 acre,000 baby,000 books,000 burning,000 cars,000 cases,000 child,000 days,...,zoo with,zoologists,zoologists thrilled,zuckerberg,zuckerberg has,zuckerberg lost,zuckerberg of,zuckerberg prepares,zumtrel,zumtrel flooby
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
mnb.fit(train_df, y_train)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [19]:
mnb.score(train_df, y_train)

1.0

In [20]:
mnb.score(test_df, y_test)

0.8529411764705882

In [21]:
len(np.transpose(mnb.coef_))

29439

In [22]:
feat_importance = pd.DataFrame(np.transpose(mnb.coef_), train_df.columns, columns=['is_theO'])

In [25]:
feat_importance.sort_values(by='is_theO', ascending=True).head(100)

Unnamed: 0,is_theO
his manhattan,-11.414306
trampled to,-11.414306
trampled,-11.414306
in 10,-11.414306
in 2017,-11.414306
in 2018,-11.414306
in 25,-11.414306
in 300,-11.414306
in acid,-11.414306
in alabama,-11.414306


In [24]:
feat_importance['is_theO'].sort_values(ascending=True).keys

<bound method Series.keys of his manhattan        -11.414306
trampled to          -11.414306
trampled             -11.414306
in 10                -11.414306
in 2017              -11.414306
in 2018              -11.414306
in 25                -11.414306
in 300               -11.414306
in acid              -11.414306
in alabama           -11.414306
in alice             -11.414306
in america           -11.414306
train 100            -11.414306
in annual            -11.414306
in any               -11.414306
in apparent          -11.414306
in ashley            -11.414306
in austin            -11.414306
in bagel             -11.414306
in bankruptcy        -11.414306
in bar               -11.414306
in barracks          -11.414306
in beating           -11.414306
in blow              -11.414306
trans                -11.414306
in bonuses           -11.414306
trans woman          -11.414306
imports              -11.414306
transgender troops   -11.414306
illinois             -11.414306
           