# Model Refining

In [5]:
# Importing libraries for modeling and analysis 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import stop_words
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.datasets import make_classification
from sklearn.svm import SVC

In [3]:
# Load the first preprocessed Reddit dataset (the version without author information).
# The path is relative to the notebook's working directory.
reddits = pd.read_csv('./data/reddits_preprocessed.csv')

In [4]:
# Preview the first rows to confirm the expected columns loaded:
# subreddit (label), type, created_utc, and words (the text).
reddits.head()

Unnamed: 0,subreddit,type,created_utc,words
0,0,comment,1553281124,she didnt mention this when i asked her she...
1,0,comment,1553280963,i mean i was but not for the sole purpose of...
2,0,comment,1553280896,hardly the best talent around in podcast has...
3,0,comment,1553280716,she cant do season 2 because gimlet owns the...
4,0,comment,1553280571,search party really excellent and critically...


In [90]:
# I tried restricting the data to comments only (dropping submissions) to see whether the model improved.
# It did not make a significant difference, so the filter is left commented out as a record of the experiment.
# reddits = reddits[reddits['type'] == 'comment']

In [6]:
# Predictor (X): the text in the `words` column.
# Target (y): the `subreddit` column, i.e. which subreddit the text came from.
X, y = reddits['words'], reddits['subreddit']

In [7]:
# Hold out a test set. Stratifying on y keeps the subreddit proportions the same in
# both splits, and the fixed random_state makes the split repeatable. No test_size is
# given, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=22,
)

### Multinomial Naive Bayes was my best model, along with SVM. Here I tune its alpha hyperparameter to see whether it improves any further.

In [93]:
# Two-step pipeline: TF-IDF features feeding a Multinomial Naive Bayes classifier.
# The step names ('tfidf', 'nb') are the prefixes used as keys in the parameter grid.
pipe = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

# Only the Naive Bayes alpha is searched here: 1, 0.1 and 0.01.
pipe_params = {'nb__alpha': [1, 1e-1, 1e-2]}

# Grid search over the pipeline; cv and scoring are left at their defaults.
gs = GridSearchCV(estimator=pipe, param_grid=pipe_params)

# Fit on the training split only, so the test split stays unseen for evaluation.
gs.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                      

In [94]:
# Cross-validated score of the best alpha found by the grid search
# (computed on the training split only).
gs.best_score_

0.7949924775962207

In [95]:
# The best estimator, with an alpha of .01
# gs.best_estimator_

In [96]:
# Keep the best pipeline found by the grid search so it can be scored below.
gs_model = gs.best_estimator_

In [97]:
# Score of the best pipeline (alpha = .01) on the training split it was fit on.
gs_model.score(X_train, y_train)

0.9158455899852445

In [98]:
# Score of the same pipeline (alpha = .01) on the held-out test split.
# Compare with the training score above to gauge overfitting.
gs_model.score(X_test, y_test)

0.8075110666857062

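### Here I am adding `fit_prior` to the grid search for my Multinomial Naive Bayes model. This run also caps TF-IDF at 10,000 features, removes English stop words, and searches over `max_df`.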

In [99]:
# Pipeline for the fit_prior experiment. Unlike the first search, the vectorizer is
# capped at 10,000 unigram features and drops English stop words.
# stop_words='english' selects scikit-learn's built-in English list (the same set that
# ENGLISH_STOP_WORDS names) without going through the `sklearn.feature_extraction.stop_words`
# module, which is deprecated and not importable in newer scikit-learn releases.
pipe2 = Pipeline([('tfidf', TfidfVectorizer(max_features=10000,
                                           ngram_range=(1, 1),
                                           stop_words='english')),
                     ('nb', MultinomialNB()),
])
# Parameter grid: alpha is pinned to the value chosen by the previous search (.01);
# max_df and fit_prior are the settings actually being compared.
pipe_params2 = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'nb__alpha': [.01],
    'nb__fit_prior': [True, False],
}
# Grid search over the pipeline; cv and scoring are left at their defaults.
gs2 = GridSearchCV(pipe2,
                  param_grid=pipe_params2)
# Fit on the training split only.
gs2.fit(X_train, y_train)
# Cross-validated score of the best parameter combination.
gs2.best_score_

0.7889474877414673

In [100]:
# Keep the best pipeline from the fit_prior / max_df search for scoring below.
gs_model2 = gs2.best_estimator_

In [101]:
# Score of the second tuned pipeline on the training split it was fit on.
gs_model2.score(X_train, y_train)

0.8715312485125423

In [102]:
# Score of the second tuned pipeline on the held-out test split.
gs_model2.score(X_test, y_test)

0.7988005140654005

### Multinomial Naive Bayes Adding Authors

In [8]:
# Load the second preprocessed dataset: the version that includes author information.
# The path is relative to the notebook's working directory.
reddits2 = pd.read_csv('./data/reddits_preprocessed_authors.csv')

In [9]:
# Rebind X and y to the author-inclusive dataset. This overwrites the X and y built
# from `reddits` earlier, so everything below works on `reddits2`.
# NOTE(review): only `words` is used as a predictor, so the author information is
# presumably folded into that column — confirm against the preprocessing notebook.
X, y = reddits2['words'], reddits2['subreddit']

In [10]:
# Re-split using the author-inclusive X and y, with the same seed and stratification
# as the first split. This replaces the earlier X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=22,
)

In [11]:
# Same Naive Bayes pipeline as the fit_prior experiment, now fit on the
# author-inclusive data: 10,000 unigram TF-IDF features with English stop words removed.
# stop_words='english' selects scikit-learn's built-in English list (the same set that
# ENGLISH_STOP_WORDS names) without going through the `sklearn.feature_extraction.stop_words`
# module, which is deprecated and not importable in newer scikit-learn releases.
pipe3 = Pipeline([('tfidf', TfidfVectorizer(max_features=10000,
                                           ngram_range=(1, 1),
                                           stop_words='english')),
                     ('nb', MultinomialNB()),
])
# Parameter grid: alpha is pinned to 0.01; max_df and fit_prior are searched.
pipe_params3 = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'nb__alpha': [0.01],
    'nb__fit_prior': [True, False],
}
# Grid search over the pipeline; cv and scoring are left at their defaults.
gs3 = GridSearchCV(pipe3,
                  param_grid=pipe_params3)
# Fit on the training split only.
gs3.fit(X_train, y_train)
# Cross-validated score of the best parameter combination.
gs3.best_score_

0.9082775785688705

In [12]:
# Keep the best Naive Bayes pipeline from the author-inclusive search.
gs_model3 = gs3.best_estimator_

In [13]:
# Score on the training split (author-inclusive data).
gs_model3.score(X_train, y_train)

0.9541625017849493

In [14]:
# Score on the held-out test split (author-inclusive data).
gs_model3.score(X_test, y_test)

0.91360845351992

In [15]:
# Predicted subreddit labels for the held-out test split, used for the confusion matrix.
predictions = gs_model3.predict(X_test)

In [16]:
# Confusion matrix comparing the true test labels with the Naive Bayes predictions.
cm = confusion_matrix(y_test, predictions)

In [17]:
# Wrap the raw confusion matrix in a labelled DataFrame so it is readable when displayed.
# NOTE(review): the labels assume the first class in `subreddit` is gimlet and the
# second is maximum_fun — confirm against the preprocessing step.
cm_df = pd.DataFrame(
    data=cm,
    index=['actual gimlet', 'actual maximum_fun'],
    columns=['pred gimlet', 'pred maximum_fun'],
)
cm_df

Unnamed: 0,pred gimlet,pred maximum_fun
actual gimlet,3127,250
actual maximum_fun,355,3271


### SVM Model with Authors

In [113]:
# SVM pipeline on the author-inclusive data. The TF-IDF settings used in the earlier
# pipelines (10,000 unigram features, English stop words removed) are fixed on the
# vectorizer itself; only max_df is still searched in the grid below.
# stop_words='english' selects scikit-learn's built-in English list (the same set that
# ENGLISH_STOP_WORDS names) without going through the `sklearn.feature_extraction.stop_words`
# module, which is deprecated and not importable in newer scikit-learn releases.
pipe4 = Pipeline([('tfidf', TfidfVectorizer(max_features=10000,
                                           ngram_range=(1, 1),
                                           stop_words='english')),
                     ('svc', SVC(gamma='scale')),
])
# Parameter grid: only the TF-IDF max_df cutoff is varied.
pipe_params4 = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
}
# Grid search over the pipeline; cv and scoring are left at their defaults.
gs4 = GridSearchCV(pipe4,
                  param_grid=pipe_params4)
# Fit on the training split only.
gs4.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=10000,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                     

In [114]:
# Keep the best SVM pipeline found by the grid search.
gs_model4 = gs4.best_estimator_

In [121]:
# Score of the SVM pipeline on the training split it was fit on.
gs_model4.score(X_train, y_train)

0.9894806987481556

In [122]:
# Score of the SVM pipeline on the held-out test split.
gs_model4.score(X_test, y_test)

0.9008996144509496

In [123]:
# Predicted subreddit labels from the SVM pipeline for the held-out test split.
predictions2 = gs_model4.predict(X_test)

In [124]:
# Confusion matrix comparing the true test labels with the SVM predictions.
cm2 = confusion_matrix(y_test, predictions2)

In [128]:
# Wrap the SVM confusion matrix in a labelled DataFrame so it is readable when displayed.
# NOTE(review): the labels assume the first class in `subreddit` is gimlet and the
# second is maximum_fun — confirm against the preprocessing step.
cm_df2 = pd.DataFrame(
    data=cm2,
    index=['actual gimlet', 'actual maximum_fun'],
    columns=['pred gimlet', 'pred maximum_fun'],
)
cm_df2

Unnamed: 0,pred gimlet,pred maximum_fun
actual gimlet,3102,275
actual maximum_fun,419,3207
