# Project 3 - Reddit API and Classification

## Bayes Classifier Model vs. Logistic Regression Model

The goal is to use NLP to train a classifier on whether a given post came from the Cheap_Meals or EatCheapAndHealthy subreddit.

Baseline accuracy is __0.522__.

Highest accuracy score of __0.833__ attained using Multinomial Bayes Classifier Model with CountVectorizer.

Highest sensitivity score of __0.848__ attained using Gaussian Bayes Classifier Model with TF-IDF.

Models used:

* Multinomial Bayes Classifier Model with CountVectorizer
* Gaussian Bayes Classifier Model with TF-IDF
* Logistic Regression Model with CountVectorizer
* Logistic Regression Model with TF-IDF

In [39]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegressionCV

In [4]:
df = pd.read_csv("./Data/df_clean.csv")

In [5]:
df.head()

Unnamed: 0,title,subreddit,word_count,char_count,neg_sen,pos_sen
0,ancho chile puree braised chicken tacos taco ...,1,9,56,0.0,0.0
1,recipes for college student,1,4,27,0.0,0.0
2,fluffy apple roll cake,1,4,22,0.0,0.0
3,easy potato curry,1,3,17,0.0,0.592
4,budget friendly korean soft tofu stew,1,6,37,0.0,0.39


#### Baseline

In [7]:
df['subreddit'].value_counts(normalize=True)

0    0.522284
1    0.477716
Name: subreddit, dtype: float64

>Baseline accuracy is 0.522 for predicting that a given post belongs to the EatCheapAndHealthy subreddit, represented by 0.

#### Multinomial Bayes Classifier Model with CountVectorizer

In [8]:
X = df['title']
y = df['subreddit']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [9]:
pipe = Pipeline([('cvec', CountVectorizer()),
                 ('mnb', MultinomialNB())])

pipe_params = {'cvec__stop_words': [None, 'english'],
               'cvec__ngram_range': [(1,1), (1,2)],
               'cvec__max_features': [None, 400, 500]}

In [10]:
grid = GridSearchCV(pipe, param_grid=pipe_params, cv=5)

In [11]:
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('cvec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('mnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'cvec__stop_words': [None, 'english'], 'cvec__ngram_range': [(1, 1), (1, 2)], 'cvec__max_features': [None, 400, 500]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [12]:
grid.best_params_

{'cvec__max_features': None,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': None}

In [13]:
grid.best_score_

0.8495821727019499

In [15]:
grid.score(X_train, y_train)

0.978644382544104

In [14]:
grid.score(X_test, y_test)

0.8328690807799443

In [16]:
pred = grid.predict(X_test)

In [20]:
confusion_matrix(y_test, pred)

array([[162,  31],
       [ 29, 137]])

In [24]:
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)
print()
print (f'Accuracy: {round((tn+tp)/(tn+fp+fn+tp),3)}')
print (f'Miscalculation rate: {round(1-((tn+tp)/(tn+fp+fn+tp)),3)}')
print (f'Sensitivity: {round(tp/(tp + fn),3)}')
print (f'Specificity: {round(tn/(tn + fp),3)}')
print (f'Precision: {round((tp)/(tp + fp),3)}')

True Negatives: 162
False Positives: 31
False Negatives: 29
True Positives: 137

Accuracy: 0.833
Miscalculation rate: 0.167
Sensitivity: 0.825
Specificity: 0.839
Precision: 0.815


#### Gaussian Bayes Classifier Model with TF-IDF

In [31]:
tfidf = TfidfVectorizer()

df_2 = pd.DataFrame(tfidf.fit_transform(X).toarray(),
                   columns=tfidf.get_feature_names())

X = df_2
y = df['subreddit']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y,
                                                    random_state = 42)

gnb = GaussianNB()
gnb.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [32]:
gnb.score(X_train, y_train)

0.9721448467966574

In [33]:
gnb.score(X_test, y_test)

0.7520891364902507

In [34]:
pred = gnb.predict(X_test)

In [35]:
confusion_matrix(y_test, pred)

array([[125,  63],
       [ 26, 145]])

In [36]:
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)
print()
print (f'Accuracy: {round((tn+tp)/(tn+fp+fn+tp),3)}')
print (f'Miscalculation rate: {round(1-((tn+tp)/(tn+fp+fn+tp)),3)}')
print (f'Sensitivity: {round(tp/(tp + fn),3)}')
print (f'Specificity: {round(tn/(tn + fp),3)}')
print (f'Precision: {round((tp)/(tp + fp),3)}')

True Negatives: 125
False Positives: 63
False Negatives: 26
True Positives: 145

Accuracy: 0.752
Miscalculation rate: 0.248
Sensitivity: 0.848
Specificity: 0.665
Precision: 0.697


#### Logistic Regression Model with CountVectorizer

In [37]:
X = df['title']
y = df['subreddit']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [40]:
pipe = Pipeline([('cvec', CountVectorizer()),
                 ('lr', LogisticRegressionCV())])

pipe_params = {'cvec__stop_words': [None, 'english'],
               'cvec__ngram_range': [(1,1), (1,2)],
               'cvec__max_features': [None, 400]}

In [41]:
grid = GridSearchCV(pipe, param_grid=pipe_params, cv=5)

In [42]:
grid.fit(X_train, y_train)













GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('cvec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...    random_state=None, refit=True, scoring=None, solver='lbfgs',
           tol=0.0001, verbose=0))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'cvec__stop_words': [None, 'english'], 'cvec__ngram_range': [(1, 1), (1, 2)], 'cvec__max_features': [None, 400]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [43]:
grid.best_params_

{'cvec__max_features': None,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': None}

In [44]:
grid.best_score_

0.8180129990714949

In [45]:
grid.score(X_train, y_train)

0.9990714948932219

In [46]:
grid.score(X_test, y_test)

0.7910863509749304

In [47]:
pred = grid.predict(X_test)

In [48]:
confusion_matrix(y_test, pred)

array([[157,  36],
       [ 39, 127]])

In [49]:
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)
print()
print (f'Accuracy: {round((tn+tp)/(tn+fp+fn+tp),3)}')
print (f'Miscalculation rate: {round(1-((tn+tp)/(tn+fp+fn+tp)),3)}')
print (f'Sensitivity: {round(tp/(tp + fn),3)}')
print (f'Specificity: {round(tn/(tn + fp),3)}')
print (f'Precision: {round((tp)/(tp + fp),3)}')

True Negatives: 157
False Positives: 36
False Negatives: 39
True Positives: 127

Accuracy: 0.791
Miscalculation rate: 0.209
Sensitivity: 0.765
Specificity: 0.813
Precision: 0.779


#### Logistic Regression Model with TF-IDF

In [50]:
pipe = Pipeline([("tidif", TfidfVectorizer()),
                 ("lr", LogisticRegressionCV())])

pipe_params = {"tidif__stop_words": [None, "english"],
               "tidif__ngram_range": [(1,1), (1,2)],
               "tidif__max_features": [None, 400]}

In [51]:
grid = GridSearchCV(pipe, param_grid=pipe_params, cv=5)

In [52]:
grid.fit(X_train, y_train)











GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('tidif', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...    random_state=None, refit=True, scoring=None, solver='lbfgs',
           tol=0.0001, verbose=0))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'tidif__stop_words': [None, 'english'], 'tidif__ngram_range': [(1, 1), (1, 2)], 'tidif__max_features': [None, 400]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [53]:
grid.best_params_

{'tidif__max_features': None,
 'tidif__ngram_range': (1, 2),
 'tidif__stop_words': None}

In [54]:
grid.score(X_train, y_train)

0.9990714948932219

In [55]:
grid.score(X_test, y_test)

0.8161559888579387

In [56]:
pred = grid.predict(X_test)

In [57]:
confusion_matrix(y_test, pred)

array([[160,  33],
       [ 33, 133]])

In [58]:
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)
print()
print (f'Accuracy: {round((tn+tp)/(tn+fp+fn+tp),3)}')
print (f'Miscalculation rate: {round(1-((tn+tp)/(tn+fp+fn+tp)),3)}')
print (f'Sensitivity: {round(tp/(tp + fn),3)}')
print (f'Specificity: {round(tn/(tn + fp),3)}')
print (f'Precision: {round((tp)/(tp + fp),3)}')

True Negatives: 160
False Positives: 33
False Negatives: 33
True Positives: 133

Accuracy: 0.816
Miscalculation rate: 0.184
Sensitivity: 0.801
Specificity: 0.829
Precision: 0.801
