In [95]:
import requests, json, time, re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from sklearn.metrics import confusion_matrix

In [2]:
# read in the csv
reddit = pd.read_csv('./datasets/reddit_4.csv')

In [3]:
reddit.head()

Unnamed: 0,text,is_news
0,Scientist Stephen Hawking has died aged 76,1
1,F.C.C. Announces Plan to Repeal Net Neutrality,1
2,Report: Stan Lee dead at 95 - Story,1
3,Arizona Senator John McCain has passed away at...,1
4,"Apple admits it slows older iPhones, confirmin...",1


In [4]:
# use regex to remove characters, urls, and obvious words (news)
reddit['text'] = reddit.text.map(lambda x: re.sub('\s[\/]?r\/[^s]+', ' ', x))
reddit['text'] = reddit.text.map(lambda x: re.sub('http[s]?:\/\/[^\s]*', ' ', x))
reddit['text'] = reddit.text.map(lambda x: re.sub('(news:|news)[s]?', ' ', x, flags=re.I))

In [5]:
# create features and target variable
X = reddit.drop('is_news', axis=1)
y = reddit['is_news']

## Train Test Split

In [6]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=24)

In each of the models, I use a tfidf and countvectorizer pipe and gridsearch over parameters.  It is the same process for each model so no need to comment on each one.

## Random Forest

In [7]:
# instantiate the pipe
rf_tfidf_pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestClassifier())
])

In [8]:
# create a list of parameters that will be gridsearched
# I've narrowed down the gridsearch already, so that's why there's only one param for each
rf_tfidf_params = {
    'tfidf__max_df' : [.75],
    'tfidf__ngram_range' : [(1,1)],
    'tfidf__stop_words' : [None],
    'tfidf__norm' : ['l2'],
    'rf__n_estimators': [150],
    'rf__max_depth': [None],
}

In [9]:
# run the gridseach with a cross validation of 5 folds
gs = GridSearchCV(rf_tfidf_pipe, param_grid=rf_tfidf_params, return_train_score=True, cv=5)

In [10]:
# fit the gridsearch and get the best_score
gs.fit(X_train['text'], y_train)
gs.best_score_

0.7735849056603774

In [11]:
# get the best parameters
gs.best_params_

{'rf__max_depth': None,
 'rf__n_estimators': 150,
 'tfidf__max_df': 0.75,
 'tfidf__ngram_range': (1, 1),
 'tfidf__norm': 'l2',
 'tfidf__stop_words': None}

In [12]:
#score the test
gs.score(X_test['text'], y_test)

0.7799779977997799

In [13]:
rf_cvec_pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('rf', RandomForestClassifier())
])

In [14]:
rf_cvec_params = {
    'cvec__max_features': [1500],
    'cvec__stop_words': ['english'],
    'cvec__ngram_range': [(1,2)],
    'rf__n_estimators': [100, 150, 200],
    'rf__max_depth': [None, 1, 2, 3, 4, 5],
}

In [15]:
gs = GridSearchCV(rf_cvec_pipe, param_grid=rf_cvec_params, return_train_score=True, cv=5)

In [16]:
gs.fit(X_train['text'], y_train)
gs.best_score_

0.769811320754717

In [17]:
gs.best_params_

{'cvec__max_features': 1500,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': 'english',
 'rf__max_depth': None,
 'rf__n_estimators': 100}

In [18]:
gs.score(X_test['text'], y_test)

0.7634763476347635

## Support Vector Machine

In [19]:
svc_tfidf_pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svc', SVC())
])

In [22]:
svc_tfidf_params = {
    'tfidf__max_df' : [.75, 1],
    'tfidf__ngram_range' : [(1,1), (1,2)],
    'tfidf__stop_words' : [None, 'english'],
    'tfidf__norm' : ['l2'],
    'svc__kernel' : ['linear'],
    'svc__C' : [1, 10 , 100]
}

In [23]:
gs = GridSearchCV(svc_tfidf_pipe, param_grid=svc_tfidf_params, return_train_score=True, cv=5)

In [24]:
gs.fit(X_train['text'], y_train)
gs.best_score_

0.8471698113207548

In [25]:
gs.score(X_test['text'], y_test)

0.8404840484048405

In [26]:
gs.best_params_

{'svc__C': 10,
 'svc__kernel': 'linear',
 'tfidf__max_df': 0.75,
 'tfidf__ngram_range': (1, 2),
 'tfidf__norm': 'l2',
 'tfidf__stop_words': None}

In [27]:
svc_cvec_pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('svc', SVC())
])

In [28]:
svc_cvec_params = {
    'cvec__max_features': [1500],
    'cvec__stop_words': [None, 'english'],
    'cvec__ngram_range': [(1,1), (1,2)],
    'svc__kernel' : ['linear'],
    'svc__C' : [1, 10 , 100]
}

In [29]:
gs = GridSearchCV(svc_cvec_pipe, param_grid=svc_cvec_params, return_train_score=True, cv=5)

In [30]:
gs.fit(X_train['text'], y_train)
gs.best_score_

0.789622641509434

In [31]:
gs.score(X_test['text'], y_test)

0.7821782178217822

In [32]:
gs.best_params_

{'cvec__max_features': 1500,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': None,
 'svc__C': 1,
 'svc__kernel': 'linear'}

## Multinomail Naive Bayes

In [76]:
mnb_tfidf_pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('mnb', MultinomialNB())
])

In [77]:
mnb_tfidf_params = {
    'tfidf__max_df' : [.75, 1],
    'tfidf__ngram_range' : [(1,1), (1,2)],
    'tfidf__stop_words' : [None, 'english'],
    'tfidf__norm' : ['l2'],
    'mnb__alpha': [0.001, 0.1, 1, 10]
}

In [78]:
gs = GridSearchCV(mnb_tfidf_pipe, param_grid=mnb_tfidf_params, return_train_score=True, cv=5)

In [79]:
gs.fit(X_train['text'], y_train)
gs.best_score_

0.8344339622641509

In [80]:
gs.score(X_test['text'], y_test)

0.8503850385038504

In [81]:
gs.best_params_

{'mnb__alpha': 0.1,
 'tfidf__max_df': 0.75,
 'tfidf__ngram_range': (1, 2),
 'tfidf__norm': 'l2',
 'tfidf__stop_words': None}

In [82]:
gs.best_score_

0.8344339622641509

In [84]:
# pickel the best model to use on the next jupyter notebook
import pickle

In [85]:
best_model_filename = 'mnb_tfidf_gs_pkl'

In [86]:
best_model_pkl = open(best_model_filename, 'wb')

In [87]:
pickle.dump(gs, best_model_pkl)

In [88]:
mnb_cvec_pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('mnb', MultinomialNB())
])

In [89]:
mnb_cvec_params = {
    'cvec__max_features': [1500],
    'cvec__stop_words': [None, 'english'],
    'cvec__ngram_range': [(1,1), (1,2)],
    'mnb__alpha': [0.001, 0.1, 1, 10]
}

In [90]:
gs = GridSearchCV(mnb_cvec_pipe, param_grid=mnb_cvec_params, return_train_score=True, cv=5)

In [91]:
gs.fit(X_train['text'], y_train)
gs.best_score_

0.8278301886792453

In [92]:
gs.score(X_test['text'], y_test)

0.8162816281628162

In [93]:
gs.best_params_

{'cvec__max_features': 1500,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': None,
 'mnb__alpha': 1}

In [94]:
gs.best_estimator_

Pipeline(memory=None,
         steps=[('cvec',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=1500, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('mnb',
                 MultinomialNB(alpha=1, class_prior=None, fit_prior=True))],
         verbose=False)

## K-nearest Neighbors

In [59]:
knn_tfidf_pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('knn', KNeighborsClassifier())
])

In [67]:
knn_tfidf_params = {
    'tfidf__max_df' : [.75, 1],
    'tfidf__ngram_range' : [(1,1), (1,2)],
    'tfidf__stop_words' : [None, 'english'],
    'tfidf__norm' : ['l2'],
    'knn__n_neighbors' : [5, 15, 25, 201],
    'knn__weights' : ['uniform', 'distance'],
    'knn__algorithm' : ['auto', 'ball_tree']
}
    

In [68]:
gs = GridSearchCV(knn_tfidf_pipe, param_grid=knn_tfidf_params, return_train_score=True, cv=5)

In [69]:
gs.fit(X_train['text'], y_train)
gs.best_score_



















0.8089622641509434

In [70]:
gs.score(X_test['text'], y_test)

0.8294829482948295

In [71]:
knn_cvec_pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('knn', KNeighborsClassifier())
])

In [72]:
mnb_cvec_params = {
    'cvec__max_features': [1500],
    'cvec__stop_words': [None, 'english'],
    'cvec__ngram_range': [(1,1), (1,2)],
    'knn__n_neighbors' : [5, 15, 25, 201],
    'knn__weights' : ['uniform', 'distance'],
    'knn__algorithm' : ['auto', 'ball_tree']
}

In [73]:
gs = GridSearchCV(knn_tfidf_pipe, param_grid=knn_tfidf_params, return_train_score=True, cv=5)

In [74]:
gs.fit(X_train['text'], y_train)
gs.best_score_



















0.8089622641509434

In [75]:
gs.score(X_test['text'], y_test)

0.8294829482948295