In [41]:
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import pickle

In [2]:
pwd

'C:\\Users\\ajdou\\Desktop\\Springboard\\assignments\\Capstone Project 2'

In [3]:
#load in dataset
df = pd.read_csv('data\\model_df.csv', index_col = 0)

In [4]:
df.head()

Unnamed: 0,label,text_wc,title_wc,av_word_len_text,av_word_len_title,upper_text_wc,upper_title_wc,numerics_text,numerics_title,exclam_text,...,text_ngram_worker,text_ngram_working,text_ngram_world,text_ngram_would,text_ngram_wrong,text_ngram_wrote,text_ngram_year,text_ngram_yet,text_ngram_york,text_ngram_young
0,1,495,12,4.80404,5.583333,5,0,4,0,6,...,0.0,0.0,0.0,0.042374,0.051551,0.0,0.39707,0.0,0.0,0.0
1,1,305,8,5.213115,7.625,3,0,0,0,0,...,0.0,0.0,0.075998,0.0,0.0,0.0,0.0,0.0,0.174931,0.0
2,1,580,15,5.168966,5.0,42,0,0,0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,444,14,5.18018,4.571429,5,1,5,0,0,...,0.0,0.05844,0.0,0.087214,0.0,0.0,0.034052,0.0,0.0,0.0
4,1,420,11,4.554762,5.363636,0,0,0,0,0,...,0.0,0.0,0.065641,0.0,0.0,0.0,0.0,0.076686,0.0,0.092756


In [5]:
#separate features and label
X = df.drop('label', axis=1)
y = df.label

In [6]:
#scale data
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
#split data into 75% training set and 25% testing set
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=.25, random_state= 42)

### Naive Bayes

In [7]:
#initialize model
mnb = MultinomialNB()
#fit model to training data and caclulate training time
t1 = time.time()
mnb.fit(X_train,y_train)
time.time() - t1

0.11793184280395508

In [8]:
#training accuracy
mnb.score(X_train, y_train)

0.9436374795417348

In [10]:
#make predictions on test set
mnb_preds = mnb.predict(X_test)
#testing metrics
print(classification_report(y_test, mnb_preds))
print(confusion_matrix(y_test, mnb_preds))

              precision    recall  f1-score   support

           0       0.95      0.94      0.95      5304
           1       0.93      0.94      0.94      4472

    accuracy                           0.94      9776
   macro avg       0.94      0.94      0.94      9776
weighted avg       0.94      0.94      0.94      9776

[[5006  298]
 [ 260 4212]]


### Random Forest

In [11]:
#split unscaled data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state= 42)

In [12]:
#initialize model
rfc = RandomForestClassifier()
#fit model to training data and caclulate training time
t1 = time.time()
rfc.fit(X_train, y_train)
time.time() - t1

33.69265604019165

In [13]:
#training accuracy
rfc.score(X_train, y_train)

1.0

In [15]:
#make predictions on test set
rfc_preds = rfc.predict(X_test)
#testing metrics
print(classification_report(y_test, rfc_preds))
print(confusion_matrix(y_test, rfc_preds))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      5304
           1       0.99      0.99      0.99      4472

    accuracy                           0.99      9776
   macro avg       0.99      0.99      0.99      9776
weighted avg       0.99      0.99      0.99      9776

[[5279   25]
 [  39 4433]]


In [16]:
#Tune hyperparameters with RandomizedSearchCV
params = {'n_estimators':[10,50,100,150,200], 'criterion': ['gini', 'entropy'], 'max_depth':[2,5,10, None]}
rf_cv = RandomizedSearchCV(RandomForestClassifier(), param_distributions=params, cv=3, scoring='roc_auc', n_jobs=-1, verbose=1)
rf_cv.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  2.3min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [17]:
#make dataframe of random search results
rf_cv_results = pd.DataFrame(rf_cv.cv_results_)
rf_cv_results[['param_n_estimators', 'param_max_depth', 'param_criterion', 'mean_test_score', 'rank_test_score']].sort_values(by='rank_test_score').head(5)

Unnamed: 0,param_n_estimators,param_max_depth,param_criterion,mean_test_score,rank_test_score
0,200,,gini,0.999681,1
3,50,,gini,0.999469,2
5,100,10.0,gini,0.999219,3
4,150,5.0,gini,0.998325,4
8,100,5.0,entropy,0.998176,5


In [18]:
#find best parameters
print(rf_cv.best_params_)
print(rf_cv.best_score_)

{'n_estimators': 200, 'max_depth': None, 'criterion': 'gini'}
0.9996807526079238


In [39]:
#build a second random forest model with the best parameters
rfc2 = RandomForestClassifier(**rf_cv.best_params_)
rfc2.fit(X_train, y_train)
rfc2_preds = rfc2.predict(X_test)
#test metrics
print(classification_report(y_test, rfc2_preds))
print(confusion_matrix(y_test, rfc2_preds))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5304
           1       0.99      0.99      0.99      4472

    accuracy                           0.99      9776
   macro avg       0.99      0.99      0.99      9776
weighted avg       0.99      0.99      0.99      9776

[[5275   29]
 [  38 4434]]


In [40]:
#make a dataframe of the top 20 most important features in the model
pd.DataFrame(rfc2.feature_importances_, index=df.drop('label', axis=1).columns, columns=['importance']).sort_values(by='importance', ascending=False).head(20)

Unnamed: 0,importance
upper_title_wc,0.097296
text_ngram_said,0.079227
title_wc,0.074212
text_ngram_image,0.061524
qmark_text,0.035558
title_ngram_video,0.033167
stop_p,0.030983
at_sign,0.02513
exclam_text,0.015843
av_word_len_text,0.014964


### Logistic Regression

In [21]:
#scale and split data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=.25, random_state= 42)

In [22]:
#initizlize and train model
lr = LogisticRegression(max_iter=200)
t1 = time.time()
lr.fit(X_train, y_train)
time.time() - t1

3.4320287704467773

In [23]:
#training accuracy
lr.score(X_train,y_train)

1.0

In [24]:
#make prediction and calculate testing metrics
lr_preds = lr.predict(X_test)
print(classification_report(y_test, lr_preds))
print(confusion_matrix(y_test, lr_preds))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5304
           1       0.99      0.98      0.99      4472

    accuracy                           0.99      9776
   macro avg       0.99      0.99      0.99      9776
weighted avg       0.99      0.99      0.99      9776

[[5252   52]
 [  70 4402]]


In [25]:
#tune model with RandomizedSearchCV
params = {'penalty':['l1', 'l2', 'elasticnet'], 'C':np.logspace(-2,2,5), 'solver':['lbfgs', 'sag', 'saga']}
lr_cv = RandomizedSearchCV(LogisticRegression(), param_distributions=params, cv=3, scoring='roc_auc', n_jobs=-1, verbose=1)
lr_cv.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  4.1min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=LogisticRegression(C=1.0, class_weight=None,
                                                dual=False, fit_intercept=True,
                                                intercept_scaling=1,
                                                l1_ratio=None, max_iter=100,
                                                multi_class='auto', n_jobs=None,
                                                penalty='l2', random_state=None,
                                                solver='lbfgs', tol=0.0001,
                                                verbose=0, warm_start=False),
                   iid='deprecated', n_iter=10, n_jobs=-1,
                   param_distributions={'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
                                        'penalty': ['l1', 'l2', 'elasticnet'],
                                        'solver': ['lbfgs', 'sag', 'saga']},
                   pre_dispatch='

In [26]:
#make a dataframe of search results sorted by best score
lr_cv_results = pd.DataFrame(lr_cv.cv_results_)
lr_cv_results[['param_solver', 'param_penalty', 'param_C', 'mean_test_score', 'rank_test_score']].sort_values(by='rank_test_score')

Unnamed: 0,param_solver,param_penalty,param_C,mean_test_score,rank_test_score
5,lbfgs,l2,0.01,0.999211,1
4,sag,l2,0.01,0.99921,2
9,saga,l2,0.01,0.999207,3
8,saga,l1,10.0,0.999141,4
1,saga,l2,10.0,0.999135,5
0,lbfgs,l1,100.0,,6
2,sag,elasticnet,10.0,,7
3,saga,elasticnet,0.01,,8
6,lbfgs,elasticnet,10.0,,9
7,sag,l1,0.01,,10


In [27]:
#find best score and best parameters
print(lr_cv.best_score_)
print(lr_cv.best_params_)

0.9992112152421515
{'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.01}


In [28]:
#build a 2nd logisitic regression model with best parameters
lr2 = LogisticRegression(**lr_cv.best_params_, max_iter=200)
lr2.fit(X_train, y_train)

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [29]:
#make predictions and calculate metrics
lr2_preds = lr2.predict(X_test)
print(classification_report(y_test, lr2_preds))
print(confusion_matrix(y_test, lr2_preds))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5304
           1       0.99      0.99      0.99      4472

    accuracy                           0.99      9776
   macro avg       0.99      0.99      0.99      9776
weighted avg       0.99      0.99      0.99      9776

[[5272   32]
 [  59 4413]]


In [30]:
#dataframe of model coefficients to find most important features
pd.DataFrame(lr2.coef_.T, index=df.drop('label', axis=1).columns, columns=['coef']).sort_values(by='coef', ascending=False).head(10)

Unnamed: 0,coef
upper_title_wc,1.255792
title_wc,1.007022
text_ngram_image,0.683703
title_ngram_video,0.520196
title_ngram_breaking,0.375867
exclam_title,0.368532
text_ngram_gop,0.365698
text_ngram_even,0.258577
text_ngram_president trump,0.230691
at_sign,0.23004


### Support Vector Machine

In [31]:
#initialize, fit, and time model
svc = SVC()
t1 = time.time()
svc.fit(X_train, y_train)
time.time() - t1

687.2539291381836

In [32]:
#training accuracy
svc.score(X_train, y_train)

0.9990793780687398

In [34]:
#make predictions and calculate metrics
svc_preds = svc.predict(X_test)
print(classification_report(y_test, svc_preds))
print(confusion_matrix(y_test, svc_preds))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5304
           1       0.99      0.99      0.99      4472

    accuracy                           0.99      9776
   macro avg       0.99      0.99      0.99      9776
weighted avg       0.99      0.99      0.99      9776

[[5277   27]
 [  51 4421]]


### KNN

In [35]:
#initialize, fit, and time model
knn = KNeighborsClassifier()
t1=time.time()
knn.fit(X_train, y_train)
t2 = time.time() - t1
t2

19.049558639526367

In [37]:
#training accuracy
knn.score(X_train, y_train)

0.7208128750681942

In [38]:
#make predictions and calculate metrics
knn_preds=knn.predict(X_test)
print(classification_report(y_test, knn_preds))
print(confusion_matrix(y_test, knn_preds))

              precision    recall  f1-score   support

           0       0.95      0.43      0.60      5304
           1       0.59      0.97      0.74      4472

    accuracy                           0.68      9776
   macro avg       0.77      0.70      0.67      9776
weighted avg       0.79      0.68      0.66      9776

[[2306 2998]
 [ 123 4349]]


In [43]:
#save best model
filename = 'rf_model.sav'
pickle.dump(rfc2, open(filename, 'wb'))