In [1]:
# Data Frame and Math Imports
import pandas as pd
import numpy as np

# Visualization Imports
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Model Imports
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV

# Metric Imports
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
import imblearn.over_sampling

In [2]:
# Define the models evaluation function
def model_comparison(X, y):
    
    '''
    X : data set features
    y : data set target
    folds : number of cross-validation fold
    
    '''
    
    # Perform fit to each machine learning classifier
    log = log_model.fit(X, y)
    dtr = dtr_model.fit(X, y)
    rfc = rfc_model.fit(X, y)
    gnb = gnb_model.fit(X, y)
    bnb = bnb_model.fit(X, y)
    gbm = gbm_model.fit(X, y)
    gbm_tuned = gbm_tuned_model.fit(X, y)
    
    # Perform fit to each machine learning classifier
    log_predict = log_model.predict(X_test)
    dtr_predict = dtr_model.predict(X_test)
    rfc_predict = rfc_model.predict(X_test)
    gnb_predict = gnb_model.predict(X_test)
    bnb_predict = bnb_model.predict(X_test)
    gbm_predict = gbm_model.predict(X_test)
    gbm_tuned_predict = gbm_tuned_model.predict(X_test)
    
    # Create a data frame with the models perfomance metrics scores
    models_scores_table = pd.DataFrame({'Logistic Regression':[accuracy_score(y_test, log_predict),
                                                               precision_score(y_test, log_predict),
                                                               recall_score(y_test, log_predict),
                                                               f1_score(y_test, log_predict)],
                                       
                                      'Decision Tree':[accuracy_score(y_test, dtr_predict),
                                                        precision_score(y_test, dtr_predict),
                                                        recall_score(y_test, dtr_predict),
                                                        f1_score(y_test, dtr_predict)],
                                       
                                      'Random Forest':[accuracy_score(y_test, rfc_predict),
                                                        precision_score(y_test, rfc_predict),
                                                        recall_score(y_test, rfc_predict),
                                                        f1_score(y_test, rfc_predict)],
                                       
                                      'Bernoulli Naive Bayes':[accuracy_score(y_test, bnb_predict),
                                                                precision_score(y_test, bnb_predict),
                                                                recall_score(y_test, bnb_predict),
                                                                f1_score(y_test, bnb_predict)],
                                        
                                    'Gaussian Naive Bayes':[accuracy_score(y_test, gnb_predict),
                                                            precision_score(y_test, gnb_predict),
                                                            recall_score(y_test, gnb_predict),
                                                            f1_score(y_test, gnb_predict)],
                                       
                                       'XGradient Boost Classifier':[accuracy_score(y_test, gbm_predict),
                                                                     precision_score(y_test, gbm_predict),
                                                                     recall_score(y_test, gbm_predict),
                                                                     f1_score(y_test, gbm_predict)],
                                       
                               'XGradient Boost Classifier (Tuned)':[accuracy_score(y_test, gbm_tuned_predict),
                                                                     precision_score(y_test, gbm_tuned_predict),
                                                                     recall_score(y_test, gbm_tuned_predict),
                                                                     f1_score(y_test, gbm_tuned_predict)]},
                                       
                                       
                                      
                                      index = ['Accuracy', 'Precision', 'Recall', 'F1 Score'])
    
    # Add 'Best Score' column
    models_scores_table['Best Score'] = models_scores_table.idxmax(axis=1)
    
    # Print confusion matrix for each model
    print('Logistic Regression: \n', confusion_matrix(y_test, log_predict))
    print('Decision Tree Classifier: \n', confusion_matrix(y_test, dtr_predict))
    print('Random Forest Classifier: \n', confusion_matrix(y_test, rfc_predict))
    print('Bernoulli Naive Bayes: \n', confusion_matrix(y_test, bnb_predict))
    print('Gaussian Naive Bayes: \n', confusion_matrix(y_test, gnb_predict))
    print('Extreme Gradient Boost: \n', confusion_matrix(y_test, gbm_predict))
    print('Extreme Gradient Boost (Tuned): \n', confusion_matrix(y_test, gbm_tuned_predict))
    
    # Return models performance metrics scores data frame
    return(models_scores_table)

In [3]:
# Define the models evaluation function
def model_comparison_crossval(X, y, folds):
    
    '''
    X : data set features
    y : data set target
    folds : number of cross-validation fold
    
    '''
    
    # Define dictionary with performance metrics
    scoring = {'accuracy':make_scorer(accuracy_score), 
               'precision':make_scorer(precision_score),
               'recall':make_scorer(recall_score), 
               'f1_score':make_scorer(f1_score)}
    
    # Perform cross-validation to each machine learning classifier
    log = cross_validate(log_model, X, y, cv = folds, scoring = scoring)
    dtr = cross_validate(dtr_model, X, y, cv = folds, scoring = scoring)
    rfc = cross_validate(rfc_model, X, y, cv = folds, scoring = scoring)
    gnb = cross_validate(bnb_model, X, y, cv = folds, scoring = scoring)
    bnb = cross_validate(gnb_model, X, y, cv = folds, scoring = scoring)
    gbm = cross_validate(gbm_model, X, y, cv = folds, scoring = scoring)
    gbm_tuned = cross_validate(gbm_tuned_model, X, y, cv = folds, scoring = scoring)
    
    # Perform cross-validation prediction to each machine learning classifier
    log_predict = cross_val_predict(log_model, X, y, cv = folds)
    dtr_predict = cross_val_predict(dtr_model, X, y, cv = folds)
    rfc_predict = cross_val_predict(rfc_model, X, y, cv = folds)
    bnb_predict = cross_val_predict(bnb_model, X, y, cv = folds)
    gnb_predict = cross_val_predict(gnb_model, X, y, cv = folds)
    gbm_predict = cross_val_predict(gbm_model, X, y, cv = folds)
    gbm_tuned_predict = cross_val_predict(gbm_tuned_model, X, y, cv = folds)

    # Create a data frame with the models perfomance metrics scores
    models_scores_table = pd.DataFrame({'Logistic Regression':[log['test_accuracy'].mean(),
                                                               log['test_precision'].mean(),
                                                               log['test_recall'].mean(),
                                                               log['test_f1_score'].mean()],
                                       
                                      'Decision Tree':[dtr['test_accuracy'].mean(),
                                                       dtr['test_precision'].mean(),
                                                       dtr['test_recall'].mean(),
                                                       dtr['test_f1_score'].mean()],
                                       
                                      'Random Forest':[rfc['test_accuracy'].mean(),
                                                       rfc['test_precision'].mean(),
                                                       rfc['test_recall'].mean(),
                                                       rfc['test_f1_score'].mean()],
                                       
                                      'Bernoulli Naive Bayes':[bnb['test_accuracy'].mean(),
                                                               bnb['test_precision'].mean(),
                                                               bnb['test_recall'].mean(),
                                                               bnb['test_f1_score'].mean()],
                                        
                                    'Gaussian Naive Bayes':[gnb['test_accuracy'].mean(),
                                                            gnb['test_precision'].mean(),
                                                            gnb['test_recall'].mean(),
                                                            gnb['test_f1_score'].mean()],
                                       
                                       'XGradient Boost Classifier':[gbm['test_accuracy'].mean(),
                                                                     gbm['test_precision'].mean(),
                                                                     gbm['test_recall'].mean(),
                                                                     gbm['test_f1_score'].mean()],
                                       
                               'XGradient Boost Classifier (Tuned)':[gbm_tuned['test_accuracy'].mean(),
                                                                     gbm_tuned['test_precision'].mean(),
                                                                     gbm_tuned['test_recall'].mean(),
                                                                     gbm_tuned['test_f1_score'].mean()]},
                                      
                                      index = ['Accuracy', 'Precision', 'Recall', 'F1 Score'])
    
    # Add 'Best Score' column
    models_scores_table['Best Score'] = models_scores_table.idxmax(axis=1)
    
    # Return models performance metrics scores data frame
    return(models_scores_table)

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import random as sparse_random
from sklearn.random_projection import sparse_random_matrix
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
# Create two separate data frames for /r/onion and /r/nottheonion that are in EQUAL SAMPLE SIZE
df = pd.read_csv('data/2020/doc_topic_matrix_2020.csv')
df_tomerge = pd.read_csv('data/2020/merge_clean_2020.csv')

print('Shape of doc_topic_matrix: ', df.shape)
print('Shape of df to merge onto doc_topic_matrix: ', df_tomerge.shape)

Shape of doc_topic_matrix:  (36261, 6)
Shape of df to merge onto doc_topic_matrix:  (36261, 6)


In [6]:
df.head(2)

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6
0,0.0,1e-05,-0.0,3e-05,-1e-05,-0.0
1,-0.0,0.0,0.0,0.0,-0.0,-0.0


In [7]:
df_tomerge.head(2)

Unnamed: 0,Onion,Title,Positive_Sentiment,Negative_Sentiment,Neutral_Sentiment,Compound_Sentiment
0,1,Mentally Unbalanced Man Still Waiting For The ...,0.0,0.0,1.0,0.0
1,1,Trump Unable Produce Certificate Proving Not Pile,0.0,0.231,0.769,-0.5574


In [8]:
# Merge the two dataframes and do some fancy tricks to permit merging by resetting indexes
df = pd.merge(df.reset_index(),\
         df_tomerge[['Positive_Sentiment', 'Negative_Sentiment', 'Neutral_Sentiment', 'Onion']].\
         reset_index()).drop(columns='index')

In [9]:
df.head(2)

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,Positive_Sentiment,Negative_Sentiment,Neutral_Sentiment,Onion
0,0.0,1e-05,-0.0,3e-05,-1e-05,-0.0,0.0,0.0,1.0,1
1,-0.0,0.0,0.0,0.0,-0.0,-0.0,0.0,0.231,0.769,1


In [10]:
print('Amount of rows to /r/TheOnion (1): ', len(df[df['Onion'] == 1]))
print('Amount of rows to /r/NotTheOnion (0): ', len(df[df['Onion'] == 0]))

Amount of rows to /r/TheOnion (1):  1080
Amount of rows to /r/NotTheOnion (0):  35181


In [11]:
# remove_n = 70000
remove_n = 122890 # some factor of how many numbers to remove
df = df.drop(np.random.choice(df.index[df['Onion'] == 0], remove_n))

In [12]:
print('Amount of rows to /r/TheOnion (1): ', len(df[df['Onion'] == 1]))
print('Amount of rows to /r/NotTheOnion (0): ', len(df[df['Onion'] == 0]))

Amount of rows to /r/TheOnion (1):  1080
Amount of rows to /r/NotTheOnion (0):  1075


In [13]:
X = df.iloc[:, :len(df.columns)-1]
y = df.iloc[:, len(df.columns)-1]

X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, :len(df.columns)-1],
                                                    df.iloc[:, len(df.columns)-1], 
                                                    test_size = 0.2,
                                                    random_state = 42)

print('Features Shape (X):', X.shape)
print('Labels Shape (y):', y.shape)
print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', y_test.shape)

Features Shape (X): (2155, 9)
Labels Shape (y): (2155,)
Training Features Shape: (1724, 9)
Training Labels Shape: (1724,)
Testing Features Shape: (431, 9)
Testing Labels Shape: (431,)


In [14]:
X.head(2)

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,Positive_Sentiment,Negative_Sentiment,Neutral_Sentiment
0,0.0,1e-05,-0.0,3e-05,-1e-05,-0.0,0.0,0.0,1.0
1,-0.0,0.0,0.0,0.0,-0.0,-0.0,0.0,0.231,0.769


In [15]:
# # Setup for the ratio argument of RandomOverSampler initialization
# n_pos = np.sum(y_train == 1)
# n_neg = np.sum(y_train == 0)
# ratio = {1 : n_pos * 20, 0 : int(n_neg)}

In [16]:
# # Randomly oversample positive samples: create 10x as many 
# ROS = imblearn.over_sampling.RandomOverSampler(sampling_strategy = ratio, random_state = 42) 

# X_train_resample, y_train_resample = ROS.fit_sample(X_train, y_train)

In [17]:
# # Randomly oversample positive samples: create 10x as many 
# RUS = imblearn.under_sampling.RandomUnderSampler(sampling_strategy = ratio, random_state = 42) 

# X_train_resample, y_train_resample = RUS.fit_sample(X_train, y_train)

In [18]:
log_model = LogisticRegression(max_iter = 5000,
                               solver = 'liblinear',
                               penalty = 'l1',
                               C = 21.544346900318832)
dtr_model = DecisionTreeClassifier()
rfc_model = RandomForestClassifier()
gnb_model = GaussianNB()
bnb_model = BernoulliNB()
gbm_model = xgb.XGBClassifier(objective = "binary:logistic")
gbm_tuned_model = xgb.XGBClassifier(n_estimators = 5300,
                                    max_depth = 1,
                                    objective = "binary:logistic",
                                    learning_rate = 0.1, 
                                    subsample = 1,
                                    min_child_weight = 1,
                                    colsample_bytree = 0.2,
                                    scale_pos_weight = 1)

In [19]:
'''
Ask yourself: is minimizing false positives (not depressed, marked depressed--precision)
              or false negatives (depressed, but not marked depressed--recall) more important?

Answer: it is better to minimize false negatives and hence, a better RECALL score
'''

# Run basic model comparison with no oversampling

model_comparison(X_train, y_train)

Logistic Regression: 
 [[151  74]
 [114  92]]
Decision Tree Classifier: 
 [[103 122]
 [ 95 111]]
Random Forest Classifier: 
 [[106 119]
 [ 87 119]]
Bernoulli Naive Bayes: 
 [[ 91 134]
 [ 50 156]]
Gaussian Naive Bayes: 
 [[  7 218]
 [  5 201]]
Extreme Gradient Boost: 
 [[106 119]
 [ 66 140]]
Extreme Gradient Boost (Tuned): 
 [[106 119]
 [ 63 143]]


Unnamed: 0,Logistic Regression,Decision Tree,Random Forest,Bernoulli Naive Bayes,Gaussian Naive Bayes,XGradient Boost Classifier,XGradient Boost Classifier (Tuned),Best Score
Accuracy,0.563805,0.49652,0.522042,0.573086,0.482599,0.570766,0.577726,XGradient Boost Classifier (Tuned)
Precision,0.554217,0.476395,0.5,0.537931,0.479714,0.540541,0.545802,Logistic Regression
Recall,0.446602,0.538835,0.57767,0.757282,0.975728,0.679612,0.694175,Gaussian Naive Bayes
F1 Score,0.494624,0.505695,0.536036,0.629032,0.6432,0.602151,0.611111,Gaussian Naive Bayes


In [20]:
# ### Run basic model comparison with 10x oversampling

# model_comparison(X_train_resample, y_train_resample)

In [21]:
### Run basic model comparison with K-fold = 5

model_comparison_crossval(X_train, y_train, 10) #countvect

Unnamed: 0,Logistic Regression,Decision Tree,Random Forest,Bernoulli Naive Bayes,Gaussian Naive Bayes,XGradient Boost Classifier,XGradient Boost Classifier (Tuned),Best Score
Accuracy,0.546955,0.524997,0.552295,0.518568,0.556301,0.540086,0.5807,XGradient Boost Classifier (Tuned)
Precision,0.571969,0.528053,0.551688,0.513349,0.549068,0.538825,0.572162,XGradient Boost Classifier (Tuned)
Recall,0.424412,0.590556,0.62722,0.967973,0.700235,0.655747,0.693508,Bernoulli Naive Bayes
F1 Score,0.48604,0.55695,0.586254,0.670849,0.615072,0.591105,0.626712,Bernoulli Naive Bayes


### Model Tuning: XGBoost Classifier

In [24]:
'''
This is a setup for hyperparameter tuning.
It uses a random grid search to find the 'best' hyperparamaters to use for random forest classification.
'''

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 1000, stop = 11000, num = 101)]

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(1, 5, num = 5)]

scale_pos_weight = [int(x) for x in np.linspace(1, 20, num = 20)]

colsample_bytree = [round(x, 1) for x in np.linspace(0.2, 2.0, num = 10)]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'scale_pos_weight': scale_pos_weight,
               'colsample_bytree': colsample_bytree}

print(random_grid)

{'n_estimators': [1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000, 3100, 3200, 3300, 3400, 3500, 3600, 3700, 3800, 3900, 4000, 4100, 4200, 4300, 4400, 4500, 4600, 4700, 4800, 4900, 5000, 5100, 5200, 5300, 5400, 5500, 5600, 5700, 5800, 5900, 6000, 6100, 6200, 6300, 6400, 6500, 6600, 6700, 6800, 6900, 7000, 7100, 7200, 7300, 7400, 7500, 7600, 7700, 7800, 7900, 8000, 8100, 8200, 8300, 8400, 8500, 8600, 8700, 8800, 8900, 9000, 9100, 9200, 9300, 9400, 9500, 9600, 9700, 9800, 9900, 10000, 10100, 10200, 10300, 10400, 10500, 10600, 10700, 10800, 10900, 11000], 'max_depth': [1, 2, 3, 4, 5], 'scale_pos_weight': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], 'colsample_bytree': [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0]}


In [25]:
# This runs the hyperparameter tuning for random forest classification.
# It takes about 30 minutes to run.

# Use the random grid to search for best hyperparameters

# Random search of parameters, using 3 fold cross validation, 
# search across 150 different combinations, and use all available cores

scorers = {'precision_score': make_scorer(precision_score),
           'recall_score': make_scorer(recall_score),
           'accuracy_score': make_scorer(accuracy_score)}

gbm_search = RandomizedSearchCV(estimator = gbm_model,
                                param_distributions = random_grid,
                                n_iter = 200, cv = 10, verbose = 2,
                                random_state = 42,
                                scoring = scorers, #Specifically optimizes accuracy
                                refit = 'accuracy_score',
                                n_jobs = -1)
                                     
# Fit the random search model
gbm_search.fit(X_train, y_train)


Fitting 10 folds for each of 200 candidates, totalling 2000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:   24.9s
[Parallel(n_jobs=-1)]: Done 440 tasks      | elapsed:   58.6s
[Parallel(n_jobs=-1)]: Done 800 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 1173 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 1890 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed:  5.8min finished


RandomizedSearchCV(cv=10,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           gpu_id=-1, importance_type='gain',
                                           interaction_constraints='',
                                           learning_rate=0.300000012,
                                           max_delta_step=0, max_depth=6,
                                           min_child_weight=1, missing=nan,
                                           monotone_constraints='()',
                                           n_estimators=100, n_jobs=0,
                                           num_p...
                                                         2200, 2300, 2400, 2500,
                                                       

In [26]:
# Run this after the above cell is run to collect best parameters
gbm_search.best_params_

{'scale_pos_weight': 1,
 'n_estimators': 5300,
 'max_depth': 1,
 'colsample_bytree': 0.2}

### Model Tuning: Logistic Regression

In [22]:
# Hyperparameters to look through
C = np.logspace(0, 4, num = 10)
penalty = ['l1', 'l2']
solver = ['liblinear', 'saga']

# Create the random grid
hyperparameters = dict(C=C, penalty=penalty, solver=solver)

# Perform the search
logistic = linear_model.LogisticRegression()
gridsearch = GridSearchCV(logistic, hyperparameters)

best_model = gridsearch.fit(X_train, y_train)





LogisticRegression(C=166.81005372000593, penalty='l1', solver='saga')




In [27]:
print(best_model.best_estimator_)

LogisticRegression(C=166.81005372000593, penalty='l1', solver='saga')
