# Import Packages, Read Data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

import nltk
from nltk.stem import WordNetLemmatizer

%matplotlib inline

In [2]:
comb_df = pd.read_csv('./datasets/combined_df.csv')
#title_wordcount_df = pd.read_csv('./datasets/title_wordcount.csv')
#post_wordcount_df = pd.read_csv('./datasets/post_wordcount.csv')

In [3]:
#finalcheck of dataframe before proceeding
comb_df = comb_df.drop(columns='Unnamed: 0')
print(comb_df.isnull().sum().sum())
print(len(comb_df))

0
1915


# Train-Test Split

In [4]:
#2 key features are titles and posts
#Titles could be highly informative features given that they tend to be succinct summaries of the post content
#Lemmatised versions already made in previous steps, so they will be used

X_titles = comb_df['titles_lemmatized']
X_posts = comb_df['posts_lemmatized']
y = comb_df['subreddit']

#### Titles split

In [5]:
X_titles_train, X_titles_test, y_titles_train, y_titles_test = train_test_split(X_titles, y,stratify=y,
                                                                                random_state=1337)

#### Posts split

In [6]:
X_posts_train, X_posts_test, y_posts_train, y_posts_test = train_test_split(X_posts, y, stratify=y,
                                                                            random_state=1337)

In [7]:
all((y_titles_train == y_posts_train)==True)

True

In [8]:
all((y_titles_test == y_posts_test)==True)

True

# <span style = "color:green"> Baseline: Naive Bayes Classifier (Multinomial)</span>

#### Create a Naive Bayes Pipeline

In [92]:
naive_bayes = Pipeline([('vector',TfidfVectorizer()),     #get tfidf scores for each word in all documents 
                        ('multi_nb', MultinomialNB())])   #default values are used for baseline -- GridSearch will be done later

#### Naive Bayes: Titles

In [93]:
naive_bayes.fit(X_titles_train,y_titles_train)          #fit training data to naive bayes pipeline

naive_ypred = naive_bayes.predict(X_titles_test)        # predict y using X_test

print('accuracy %s' % accuracy_score(y_titles_test,naive_ypred))
print('\n_____Scores for Titles_____')
print(classification_report(y_titles_test,naive_ypred,target_names=['DaveRamsey','personalfinance']))

accuracy 0.7828810020876826

_____Scores for Titles_____
                 precision    recall  f1-score   support

     DaveRamsey       0.80      0.77      0.78       245
personalfinance       0.77      0.80      0.78       234

      micro avg       0.78      0.78      0.78       479
      macro avg       0.78      0.78      0.78       479
   weighted avg       0.78      0.78      0.78       479



#### Naive Bayes: Post Text

In [114]:
naive_bayes.fit(X_posts_train,y_posts_train)

naive_ypred = naive_bayes.predict(X_posts_test)
posts_nb_base_acc = accuracy_score(y_posts_test,naive_ypred)

print('accuracy %s' % posts_nb_base_acc)
print('\n_____Scores for Posts_____')
print(classification_report(y_posts_test,naive_ypred,target_names=['DaveRamsey','personalfinance']))

accuracy 0.8622129436325678

_____Scores for Posts_____
                 precision    recall  f1-score   support

     DaveRamsey       0.84      0.91      0.87       245
personalfinance       0.90      0.81      0.85       234

      micro avg       0.86      0.86      0.86       479
      macro avg       0.87      0.86      0.86       479
   weighted avg       0.87      0.86      0.86       479



# Baseline: Logistic Regression
    

In [115]:
logisticreg = Pipeline([('vector', TfidfVectorizer()),
                        ('logisticreg', LogisticRegression(solver='liblinear',random_state=1337))])

logisticreg.fit(X_titles_train, y_titles_train)

logistic_ypred = logisticreg.predict(X_titles_test)

print('accuracy %s' % accuracy_score(logistic_ypred, y_titles_test))
print(classification_report(y_titles_test, logistic_ypred,target_names=['DaveRamsey','personalfinance']))

accuracy 0.778705636743215
                 precision    recall  f1-score   support

     DaveRamsey       0.79      0.78      0.78       245
personalfinance       0.77      0.78      0.78       234

      micro avg       0.78      0.78      0.78       479
      macro avg       0.78      0.78      0.78       479
   weighted avg       0.78      0.78      0.78       479



In [116]:
logisticreg.fit(X_posts_train, y_posts_train)

logistic_ypred = logisticreg.predict(X_posts_test)
posts_logreg_base_acc = accuracy_score(logistic_ypred, y_posts_test)

print('accuracy %s' % posts_logreg_base_acc)
print(classification_report(y_posts_test, logistic_ypred,target_names=['DaveRamsey','personalfinance']))

accuracy 0.8622129436325678
                 precision    recall  f1-score   support

     DaveRamsey       0.87      0.87      0.87       245
personalfinance       0.86      0.86      0.86       234

      micro avg       0.86      0.86      0.86       479
      macro avg       0.86      0.86      0.86       479
   weighted avg       0.86      0.86      0.86       479



# Baseline: Support Vector Machine

In [117]:
SVM = Pipeline([('vector',TfidfVectorizer()),
                ('supvec',svm.SVC(C=1.0,
                                 kernel='linear',
                                 degree=3,
                                 gamma='auto',
                                 random_state=1337))
               ])

In [118]:
SVM.fit(X_titles_train,y_titles_train)
svm_ypred = SVM.predict(X_titles_test)

print('accuracy %s' % accuracy_score(svm_ypred, y_titles_test))
print(classification_report(y_titles_test, svm_ypred,target_names=['DaveRamsey','personalfinance']))

accuracy 0.7620041753653445
                 precision    recall  f1-score   support

     DaveRamsey       0.77      0.76      0.76       245
personalfinance       0.75      0.77      0.76       234

      micro avg       0.76      0.76      0.76       479
      macro avg       0.76      0.76      0.76       479
   weighted avg       0.76      0.76      0.76       479



In [119]:
SVM.fit(X_posts_train,y_posts_train)
SVM_ypred = SVM.predict(X_posts_test)
posts_SVM_base_acc = accuracy_score(SVM_ypred, y_posts_test)

print('accuracy %s' % posts_SVM_base_acc)
print(classification_report(y_posts_test, SVM_ypred,target_names=['DaveRamsey','personalfinance']))

accuracy 0.8538622129436325
                 precision    recall  f1-score   support

     DaveRamsey       0.88      0.83      0.85       245
personalfinance       0.83      0.88      0.85       234

      micro avg       0.85      0.85      0.85       479
      macro avg       0.85      0.85      0.85       479
   weighted avg       0.86      0.85      0.85       479



# Baseline: Stochastic Gradient Descent Classifier

In [120]:
stochastic = Pipeline([('vector', TfidfVectorizer()),
                       ('stoch', SGDClassifier(random_state=1337))])

stochastic.fit(X_titles_train, y_titles_train)

stochastic_ypred = stochastic.predict(X_titles_test)

print('accuracy %s' % accuracy_score(svm_ypred, y_titles_test))
print(classification_report(y_titles_test, stochastic_ypred,target_names=['DaveRamsey','personalfinance']))

accuracy 0.7620041753653445
                 precision    recall  f1-score   support

     DaveRamsey       0.75      0.73      0.74       245
personalfinance       0.72      0.74      0.73       234

      micro avg       0.73      0.73      0.73       479
      macro avg       0.73      0.73      0.73       479
   weighted avg       0.74      0.73      0.73       479





In [122]:
stochastic.fit(X_posts_train, y_posts_train)

stoch_ypred = stochastic.predict(X_posts_test)
posts_stoch_base_acc = accuracy_score(stoch_ypred, y_posts_test)

print('accuracy %s' % posts_stoch_base_acc)
print(classification_report(y_posts_test, stoch_ypred,target_names=['DaveRamsey','personalfinance']))

accuracy 0.8204592901878914
                 precision    recall  f1-score   support

     DaveRamsey       0.86      0.78      0.82       245
personalfinance       0.79      0.87      0.83       234

      micro avg       0.82      0.82      0.82       479
      macro avg       0.82      0.82      0.82       479
   weighted avg       0.82      0.82      0.82       479



### Tuning Hyperparameters

In [71]:
#GridSearchCV for all models

param_grid_nb = [{'vector__ngram_range':[(1,1),(1,2),(1,3),(1,4),(1,5)],
                  'vector__max_df':[0.9,0.95],
                  'vector__min_df':[0.0001,0.001,0.01],
                  'multi_nb__alpha':[1e-5*10**i for i in range(0,6)],
                  'multi_nb__fit_prior':[True,False]
                 }]

param_grid_logisticreg = [{'vector__ngram_range':[(1,1),(1,2),(1,3),(1,4),(1,5)],
                           'vector__max_df':[0.9,0.95],
                           'vector__min_df':[0.0001,0.001,0.01],
                           'logisticreg__penalty':['l1','l2'],
                           'logisticreg__C':list(np.linspace(1.0,1.5,6)),
                           'logisticreg__max_iter':[1e2,1e3,1e4,1e5]
                          }]

param_grid_SVM = [{'vector__ngram_range':[(1,1),(1,2),(1,3),(1,4),(1,5)],
                   'vector__max_df':[0.9,0.95],
                   'vector__min_df':[0.0001,0.001,0.01],
                   'supvec__C':list(np.linspace(1.0,1.5,6)),
                   'supvec__kernel':['rbf','linear','sigmoid']
                  }] 

param_grid_stochastic = [{'vector__ngram_range':[(1,1),(1,2),(1,3),(1,4),(1,5)],
                          'vector__max_df':[0.9,0.95],
                          'vector__min_df':[0.0001,0.001,0.01],
                          'stoch__penalty':['l1','l2'],
                          'stoch__alpha':[1e-6*10**i for i in range(0,8)],
                          #'stoch__max_iter':[10,100,1000]
                         }]

### Naive Bayes GridSearch

In [125]:
nb_grid = GridSearchCV(estimator = naive_bayes,
                       param_grid=param_grid_nb,
                       scoring='accuracy'
                      )

In [126]:
titles_grid_result_nb = nb_grid.fit(X_titles_train,y_titles_train)
titles_best_params_nb = nb_grid.best_params_
titles_best_acc_nb = nb_grid.best_score_



In [127]:
posts_grid_result_nb = nb_grid.fit(X_posts_train,y_posts_train)
posts_best_params_nb = nb_grid.best_params_
posts_best_acc_nb = nb_grid.best_score_



In [128]:
titles_best_params_nb

{'multi_nb__alpha': 1.0,
 'multi_nb__fit_prior': True,
 'vector__max_df': 0.9,
 'vector__min_df': 0.0001,
 'vector__ngram_range': (1, 1)}

In [129]:
titles_best_acc_nb

0.7458217270194986

In [130]:
posts_best_params_nb

{'multi_nb__alpha': 1.0,
 'multi_nb__fit_prior': False,
 'vector__max_df': 0.9,
 'vector__min_df': 0.0001,
 'vector__ngram_range': (1, 2)}

In [131]:
posts_best_acc_nb

0.8649025069637883

#### Logistic Regression GridSearch

In [132]:
logreg_grid = GridSearchCV(estimator = logisticreg,
                           param_grid=param_grid_logisticreg,
                           scoring='accuracy'
                          )

In [133]:
titles_grid_result_logreg = logreg_grid.fit(X_titles_train,y_titles_train)
titles_best_params_logreg = logreg_grid.best_params_
titles_best_acc_logreg = logreg_grid.best_score_



In [134]:
posts_grid_result_logreg = logreg_grid.fit(X_posts_train,y_posts_train)
posts_best_params_logreg = logreg_grid.best_params_
posts_best_acc_logreg = logreg_grid.best_score_



In [135]:
titles_best_params_logreg

{'logisticreg__C': 1.0,
 'logisticreg__max_iter': 100.0,
 'logisticreg__penalty': 'l2',
 'vector__max_df': 0.9,
 'vector__min_df': 0.0001,
 'vector__ngram_range': (1, 1)}

In [136]:
titles_best_acc_logreg

0.754874651810585

In [137]:
posts_best_params_logreg

{'logisticreg__C': 1.1,
 'logisticreg__max_iter': 100.0,
 'logisticreg__penalty': 'l2',
 'vector__max_df': 0.9,
 'vector__min_df': 0.0001,
 'vector__ngram_range': (1, 4)}

In [138]:
posts_best_acc_logreg

0.8586350974930362

### Support Vector Machine GridSearch

In [139]:
SVM_grid = GridSearchCV(estimator = SVM,
                        param_grid = param_grid_SVM,
                        scoring = 'accuracy'
                       )

In [140]:
titles_grid_result_SVM = SVM_grid.fit(X_titles_train,y_titles_train)
titles_best_params_SVM = SVM_grid.best_params_
titles_best_acc_SVM = SVM_grid.best_score_



In [141]:
posts_grid_result_SVM = SVM_grid.fit(X_posts_train,y_posts_train)
posts_best_params_SVM = SVM_grid.best_params_
posts_best_acc_SVM = SVM_grid.best_score_



In [142]:
titles_best_acc_SVM

0.7506963788300836

In [143]:
titles_best_params_SVM

{'supvec__C': 1.4,
 'supvec__kernel': 'linear',
 'vector__max_df': 0.9,
 'vector__min_df': 0.0001,
 'vector__ngram_range': (1, 3)}

In [144]:
posts_best_acc_SVM

0.8523676880222841

In [145]:
posts_best_params_SVM

{'supvec__C': 1.1,
 'supvec__kernel': 'linear',
 'vector__max_df': 0.9,
 'vector__min_df': 0.0001,
 'vector__ngram_range': (1, 5)}

### Stochastic Gradient Classifier GridSearch

In [72]:
stoch_grid = GridSearchCV(estimator = stochastic,
                           param_grid=param_grid_stochastic,
                           scoring='accuracy'
                          )

In [73]:
titles_grid_result_stoch = stoch_grid.fit(X_titles_train,y_titles_train)
titles_best_params_stoch = stoch_grid.best_params_
titles_best_acc_stoch = stoch_grid.best_score_



In [74]:
posts_grid_result_stoch = stoch_grid.fit(X_posts_train,y_posts_train)
posts_best_params_stoch = stoch_grid.best_params_
posts_best_acc_stoch = stoch_grid.best_score_



In [75]:
titles_best_params_stoch

{'stoch__alpha': 0.001,
 'stoch__penalty': 'l2',
 'vector__max_df': 0.9,
 'vector__min_df': 0.0001,
 'vector__ngram_range': (1, 1)}

In [76]:
titles_best_acc_stoch

0.7534818941504178

In [77]:
posts_best_params_stoch

{'stoch__alpha': 9.999999999999999e-05,
 'stoch__penalty': 'l2',
 'vector__max_df': 0.9,
 'vector__min_df': 0.0001,
 'vector__ngram_range': (1, 4)}

In [78]:
posts_best_acc_stoch

0.8530640668523677

# Final Modelling Runs

In [43]:
naive_bayes_fin = Pipeline([('vector',TfidfVectorizer(ngram_range=(1,2),
                                                     min_df=0.0001,
                                                     max_df=0.9)),     #get tfidf scores for each word in all documents 
                            ('multi_nb', MultinomialNB(alpha=1.0,
                                                       fit_prior=False))
                           ])   #default values are used for baseline -- GridSearch will be done later

#### Naive Bayes: Posts

In [147]:
naive_bayes_fin.fit(X_posts_train,y_posts_train)          #fit training data to naive bayes pipeline

naive_ypred_fin = naive_bayes_fin.predict(X_posts_test)        # predict y using X_test
nb_fin_acc = accuracy_score(y_posts_test,naive_ypred_fin)

print('accuracy %s' % nb_fin_acc)
print('\n_____Scores for Posts_____')
print(classification_report(y_posts_test,naive_ypred_fin,target_names=['DaveRamsey','personalfinance']))

accuracy 0.8768267223382046

_____Scores for Posts_____
                 precision    recall  f1-score   support

     DaveRamsey       0.87      0.89      0.88       245
personalfinance       0.89      0.86      0.87       234

      micro avg       0.88      0.88      0.88       479
      macro avg       0.88      0.88      0.88       479
   weighted avg       0.88      0.88      0.88       479



### Final Logistic Regression
    

In [149]:
logisticreg_fin = Pipeline([('vector', TfidfVectorizer(ngram_range=(1,4),
                                                      min_df=0.0001,
                                                      max_df=0.9)),
                            ('logisticreg', LogisticRegression(C=1.1,
                                                               max_iter=100,
                                                               penalty='l2',
                                                               random_state=1337))])

In [151]:
logisticreg_fin.fit(X_posts_train, y_posts_train)

logistic_ypred_fin = logisticreg_fin.predict(X_posts_test)
logistic_fin_acc = accuracy_score(logistic_ypred_fin, y_posts_test)

print('accuracy %s' % logistic_fin_acc)
print('\n_____Scores for Posts_____')
print(classification_report(y_posts_test, logistic_ypred_fin,target_names=['DaveRamsey','personalfinance']))

#Logistic regression may perform better on a larger dataset. https://medium.com/@sangha_deb/naive-bayes-vs-logistic-regression-a319b07a5d4c

accuracy 0.8810020876826722

_____Scores for Posts_____
                 precision    recall  f1-score   support

     DaveRamsey       0.89      0.87      0.88       245
personalfinance       0.87      0.89      0.88       234

      micro avg       0.88      0.88      0.88       479
      macro avg       0.88      0.88      0.88       479
   weighted avg       0.88      0.88      0.88       479



### Final Support Vector Machine

In [152]:
SVM_fin=Pipeline([('vector', TfidfVectorizer(ngram_range=(1, 5),
                                           min_df= 0.0001,
                                           max_df=0.9)),
                  ('supvec', svm.SVC(C=1.1,
                                     kernel='linear',
                                     gamma='auto',
                                     random_state=1337))
                ])

In [154]:
SVM_fin.fit(X_posts_train,y_posts_train)

SVM_fin_ypred = SVM_fin.predict(X_posts_test)
SVM_fin_acc = accuracy_score(SVM_fin_ypred, y_posts_test)

print('accuracy %s' %SVM_fin_acc)
print('\n_____Scores for Posts_____')
print(classification_report(y_posts_test, SVM_fin_ypred,target_names=['DaveRamsey','personalfinance']))

accuracy 0.8663883089770354

_____Scores for Posts_____
                 precision    recall  f1-score   support

     DaveRamsey       0.90      0.83      0.86       245
personalfinance       0.83      0.91      0.87       234

      micro avg       0.87      0.87      0.87       479
      macro avg       0.87      0.87      0.87       479
   weighted avg       0.87      0.87      0.87       479



### Final SGD Classifier

In [155]:
stochastic_fin = Pipeline([('vector', TfidfVectorizer(ngram_range=(1,5),
                                                     min_df=0.0001,
                                                     max_df=0.9)),
                           ('stoch', SGDClassifier(alpha=1e-04,
                                                   penalty='l2',
                                                   random_state=1337))])

In [156]:
stochastic_fin.fit(X_posts_train, y_posts_train)

stoch_ypred_fin = stochastic_fin.predict(X_posts_test)
stoch_fin_acc = accuracy_score(stoch_ypred_fin, y_titles_test)

print('accuracy %s' % stoch_fin_acc)
print('\n_____Scores for Posts_____')
print(classification_report(y_titles_test, stoch_ypred_fin,target_names=['DaveRamsey','personalfinance']))



accuracy 0.872651356993737

_____Scores for Posts_____
                 precision    recall  f1-score   support

     DaveRamsey       0.92      0.82      0.87       245
personalfinance       0.83      0.93      0.88       234

      micro avg       0.87      0.87      0.87       479
      macro avg       0.88      0.87      0.87       479
   weighted avg       0.88      0.87      0.87       479



# Score Tabulation

In [166]:
score_df = pd.DataFrame({'Baseline':[posts_nb_base_acc,posts_logreg_base_acc,posts_SVM_base_acc,posts_stoch_base_acc],
                         'GridSearchCV':[posts_best_acc_nb,posts_best_acc_logreg,posts_best_acc_SVM,posts_best_acc_stoch],
                         'Tuned':[nb_fin_acc,logistic_fin_acc,SVM_fin_acc,stoch_fin_acc],
                         'best_params':[posts_best_params_nb,posts_best_params_logreg,posts_best_params_SVM,posts_best_params_stoch]
                        },
                        index=['MultinomialNB','LogReg','Support Vector Machine','Stoch Grad Desc'])

In [167]:
score_df

Unnamed: 0,Baseline,GridSearchCV,Tuned,best_params
MultinomialNB,0.862213,0.864903,0.876827,"{'multi_nb__alpha': 1.0, 'multi_nb__fit_prior'..."
LogReg,0.862213,0.858635,0.881002,"{'logisticreg__C': 1.1, 'logisticreg__max_iter..."
Support Vector Machine,0.853862,0.852368,0.866388,"{'supvec__C': 1.1, 'supvec__kernel': 'linear',..."
Stoch Grad Desc,0.820459,0.853064,0.872651,"{'stoch__alpha': 9.999999999999999e-05, 'stoch..."


In [165]:
9.999999999999999e-05/1e-04

0.9999999999999999