### Note: this notebook contains experiments with Random Search CV - which I did not end up using for my presentation

In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Import train_test_split. Crossval score. Gridsearch CV
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

# Import Standard Scaller
from sklearn.preprocessing import StandardScaler


# Import logistic regression. KNN.
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# Import metrics
from sklearn import metrics

#Import Pipeline
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


# imports

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, plot_confusion_matrix


# Import CountVectorizer and TFIDFVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [3]:
df = pd.read_csv('../data/text_count_sentiment.csv')

In [4]:
# Step 1: Split into training & testing sets
X = df['features']
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=13)

In [5]:
# Redefine training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    random_state=13)

In [6]:
y_test.value_counts(normalize = True)

1    0.501002
0    0.498998
Name: target, dtype: float64

In [7]:
X_train.shape

(1494,)

In [13]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'cvec__max_features':[2000, 3000, 4000, 5000],
               'cvec__min_df' : [2, 3],
               'cvec__max_df': [.9, .95],
               'cvec__ngram_range': [(1, 1), (1, 2)],
               'rf__n_estimators': n_estimators,
               'rf__max_features': max_features,
               'rf__max_depth': max_depth,
               'rf__min_samples_split': min_samples_split,
               'rf__min_samples_leaf': min_samples_leaf,
               'rf__bootstrap': bootstrap}
print(random_grid)

{'cvec__max_features': [2000, 3000, 4000, 5000], 'cvec__min_df': [2, 3], 'cvec__max_df': [0.9, 0.95], 'cvec__ngram_range': [(1, 1), (1, 2)], 'rf__n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'rf__max_features': ['auto', 'sqrt'], 'rf__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'rf__min_samples_split': [2, 5, 10], 'rf__min_samples_leaf': [1, 2, 4], 'rf__bootstrap': [True, False]}


In [16]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('rf', RandomForestClassifier())
])


In [17]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(pipe, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  3.1min finished


RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                             ('rf', RandomForestClassifier())]),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'cvec__max_df': [0.9, 0.95],
                                        'cvec__max_features': [2000, 3000, 4000,
                                                               5000],
                                        'cvec__min_df': [2, 3],
                                        'cvec__ngram_range': [(1, 1), (1, 2)],
                                        'rf__bootstrap': [True, False],
                                        'rf__max_depth': [10, 20, 30, 40, 50,
                                                          60, 70, 80, 90, 100,
                                                          110, None],
                                        'rf__max_features': ['auto', 'sqrt'],
                                      

In [18]:
rf_random.best_params_


{'rf__n_estimators': 1000,
 'rf__min_samples_split': 10,
 'rf__min_samples_leaf': 2,
 'rf__max_features': 'auto',
 'rf__max_depth': 40,
 'rf__bootstrap': False,
 'cvec__ngram_range': (1, 1),
 'cvec__min_df': 3,
 'cvec__max_features': 2000,
 'cvec__max_df': 0.9}

In [19]:
# Score model on training set.
print(f'Train accuracy = {rf_random.score(X_train, y_train)}')
# Score model on testing set.
print(f'Test accuracy = {rf_random.score(X_test, y_test)}')
# Get predictions
preds = rf_random.predict(X_test)
# Save confusion matrix values
tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()
# Calculate the specificity
print(f'Specificity = {tn / (tn + fp)}')
# Calculate the recall
print(f'Recall = {tp / (tp + fn)}')

Train accuracy = 0.9223560910307899
Test accuracy = 0.7615230460921844
Specificity = 0.7389558232931727
Recall = 0.784


# TFIDF

In [32]:
# Instantiate the transformer.
tvec = TfidfVectorizer()

In [33]:
# Step 1: Split into training & testing sets
X = df['features']
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=13)

In [34]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = { 'tvec__max_features': [2000, 3000, 4000, 5000],
               'tvec__stop_words': [None, 'english'],
               'tvec__min_df' : [2, 3],
               'tvec__max_df': [.9, .95],
               'tvec__ngram_range': [(1,1), (1,2)],
               'rf__n_estimators': n_estimators,
               'rf__max_features': max_features,
               'rf__max_depth': max_depth,
               'rf__min_samples_split': min_samples_split,
               'rf__min_samples_leaf': min_samples_leaf,
               'rf__bootstrap': bootstrap}
print(random_grid)

{'tvec__max_features': [2000, 3000, 4000, 5000], 'tvec__stop_words': [None, 'english'], 'tvec__min_df': [2, 3], 'tvec__max_df': [0.9, 0.95], 'tvec__ngram_range': [(1, 1), (1, 2)], 'rf__n_estimators': [50, 100, 150, 200, 250, 300, 350, 400, 450, 500], 'rf__max_features': ['auto', 'sqrt'], 'rf__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'rf__min_samples_split': [2, 5, 10], 'rf__min_samples_leaf': [1, 2, 4], 'rf__bootstrap': [True, False]}


In [35]:
# Let's set a pipeline up with two stages:
# 1. tf-idf vectorizer (transformer)
# 2. Multinomial Naive Bayes (estimator)
pipe_tvec = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('rf', RandomForestClassifier())
])


In [36]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(pipe_tvec, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   41.0s finished


RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                             ('rf', RandomForestClassifier())]),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'rf__bootstrap': [True, False],
                                        'rf__max_depth': [10, 20, 30, 40, 50,
                                                          60, 70, 80, 90, 100,
                                                          110, None],
                                        'rf__max_features': ['auto', 'sqrt'],
                                        'rf__min_samples_leaf': [1, 2, 4],
                                        'rf__min_samples_split': [2, 5, 10],
                                        'rf__n_estimators': [50, 100, 150, 200,
                                                             250, 300, 350, 400,
                                                             450, 500],
                

In [37]:
# Score model on training set.
rf_random.best_params_

{'tvec__stop_words': 'english',
 'tvec__ngram_range': (1, 1),
 'tvec__min_df': 3,
 'tvec__max_features': 5000,
 'tvec__max_df': 0.95,
 'rf__n_estimators': 350,
 'rf__min_samples_split': 2,
 'rf__min_samples_leaf': 4,
 'rf__max_features': 'auto',
 'rf__max_depth': 80,
 'rf__bootstrap': False}

In [38]:
# Score model on training set.
print(f'Train accuracy = {rf_random.score(X_train, y_train)}')
# Score model on testing set.
print(f'Test accuracy = {rf_random.score(X_test, y_test)}')
# Get predictions
preds = rf_random.predict(X_test)
# Save confusion matrix values
tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()
# Calculate the specificity
print(f'Specificity = {tn / (tn + fp)}')
# Calculate the recall
print(f'Recall = {tp / (tp + fn)}')

Train accuracy = 0.8775100401606426
Test accuracy = 0.7274549098196392
Specificity = 0.7450199203187251
Recall = 0.7096774193548387


# G boost

In [53]:
from sklearn.model_selection import RandomizedSearchCV
learning_rate = [0.15,0.1,0.05,0.01,0.005,0.001]
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'tvec__max_features': [2000, 3000, 4000, 5000],
               'tvec__stop_words': [None, 'english'],
               'tvec__min_df' : [2, 3],
               'tvec__max_df': [.9, .95],
               'tvec__ngram_range': [(1,1), (1,2)],
               'gb__learning_rate': learning_rate,
               'gb__n_estimators': n_estimators,
               'gb__max_features': max_features,
               'gb__max_depth': max_depth,
               'gb__min_samples_split': min_samples_split,
               'gb__min_samples_leaf': min_samples_leaf}
print(random_grid)

{'tvec__max_features': [2000, 3000, 4000, 5000], 'tvec__stop_words': [None, 'english'], 'tvec__min_df': [2, 3], 'tvec__max_df': [0.9, 0.95], 'tvec__ngram_range': [(1, 1), (1, 2)], 'gb__learning_rate': [0.15, 0.1, 0.05, 0.01, 0.005, 0.001], 'gb__n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'gb__max_features': ['auto', 'sqrt'], 'gb__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'gb__min_samples_split': [2, 5, 10], 'gb__min_samples_leaf': [1, 2, 4]}


In [54]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
# Let's set a pipeline up with two stages:
# 1. tf-idf vectorizer (transformer)
# 2. Multinomial Naive Bayes (estimator)
pipe_tvec = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('gb', GradientBoostingClassifier())
])


In [55]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
#rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
gb_random = RandomizedSearchCV(pipe_tvec, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
gb_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   38.0s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed: 23.1min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 43.2min finished


RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                             ('gb',
                                              GradientBoostingClassifier())]),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'gb__learning_rate': [0.15, 0.1, 0.05,
                                                              0.01, 0.005,
                                                              0.001],
                                        'gb__max_depth': [10, 20, 30, 40, 50,
                                                          60, 70, 80, 90, 100,
                                                          110, None],
                                        'gb__max_features': ['auto', 'sqrt'],
                                        'gb__min_samples_leaf': [1, 2, 4],
                                        'gb__min_samples_split': [2, 5, 10],
                                        'gb__n_

In [56]:
# Score model on training set.
gb_random.best_params_

{'tvec__stop_words': 'english',
 'tvec__ngram_range': (1, 1),
 'tvec__min_df': 2,
 'tvec__max_features': 4000,
 'tvec__max_df': 0.95,
 'gb__n_estimators': 600,
 'gb__min_samples_split': 10,
 'gb__min_samples_leaf': 4,
 'gb__max_features': 'sqrt',
 'gb__max_depth': 40,
 'gb__learning_rate': 0.01}

In [46]:
gb = GradientBoostingClassifier()
gb.get_params().keys()

dict_keys(['ccp_alpha', 'criterion', 'init', 'learning_rate', 'loss', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_iter_no_change', 'presort', 'random_state', 'subsample', 'tol', 'validation_fraction', 'verbose', 'warm_start'])

In [58]:
print('Train score: ', gb_random.score(X_train, y_train))
print('Test score: ', gb_random.score(X_test, y_test))
preds = gb_random.predict(X_test)
# Save confusion matrix values
tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()
# Calculate the specificity
print(f'Specificity = {tn / (tn + fp)}')
# Calculate the recall
print(f'Recall = {tp / (tp + fn)}')

Train score:  0.964524765729585
Test score:  0.7234468937875751
Specificity = 0.7290836653386454
Recall = 0.717741935483871
