# Week 6 - Classifying Text

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

## Get the Data
Unlike the book, I'm not fetching from the NLTK dataset. I have my own version, which is a little cleaner than the one on NLTK.

In [2]:
data_df= pd.read_csv('data/fetch_20newsgroups.csv')
print('Original shape:', data_df.shape)
print(data_df.head(10))

Original shape: (18541, 3)
                                             Article  Target Label  \
0  \n\nI am sure some bashers of Pens fans are pr...          10.0   
1  My brother is in the market for a high-perform...           3.0   
2  \n\n\n\n\tFinally you said what you dream abou...          17.0   
3  \nThink!\n\nIt's the SCSI card doing the DMA t...           3.0   
4  1)    I have an old Jasmine drive which I cann...           4.0   
5  \n\nBack in high school I worked as a lab assi...          12.0   
6  \n\nAE is in Dallas...try 214/241-6060 or 214/...           4.0   
7  \n[stuff deleted]\n\nOk, here's the solution t...          10.0   
8  \n\n\nYeah, it's the second one.  And I believ...          10.0   
9  \nIf a Christian means someone who believes in...          19.0   

                Target Name  
0          rec.sport.hockey  
1  comp.sys.ibm.pc.hardware  
2     talk.politics.mideast  
3  comp.sys.ibm.pc.hardware  
4     comp.sys.mac.hardware  
5           sci.electr

## Data Preprocessing and Normalization

In [3]:
total_nulls = data_df[data_df.Article.str.strip() == ''].shape[0]
print('Empty documents:', total_nulls)

data_df = data_df[~(data_df.Article.str.strip() == '')]
print('New shape:', data_df.shape)

Empty documents: 130
New shape: (18411, 3)


## Starting on page 290 - follow my code! Author's code won't work.

In [15]:
import nltk
# There are several ways to get folders visible in Python. This way isn't the most elegant
# but it works consistently. Replace my path with yours. The path you append to should be the
# folder where your tokenizer Python class is located.
import sys
sys.path.append(r'YOUR PATH TO THE TEXT NORMALIZER')
from text_normalizer import TextNormalizer # this will probably not be your path

# create the normalizer object
tn = TextNormalizer()

# normalize the corpus
import time
start = time.time()
norm_corpus = tn.normalize_corpus(corpus=data_df['Article'], html_stripping=True,
                                  contraction_expansion=True, accented_char_removal=True,
                                  text_lower_case=True, text_lemmatization=True,
                                  special_char_removal=True, remove_digits=True,
                                  stopword_removal=True)
full_time = round(time.time() - start, 2)
print('Normalizing finished in ', str(full_time), '\n')

data_df['Clean Article'] = norm_corpus

# view sample data
data_df = data_df[['Article', 'Clean Article', 'Target Label', 'Target Name']]
print(data_df.head(), '\n')

data_df = data_df.dropna(axis=0, how='any').reset_index(drop=True)
print(data_df.info())

Starting TextNormalizer
Done strip
Done lower
Done stopword
Done char remove
Done contract exp
Done text lemm
Done spec char remove
Normalizing finished in  34.18 

                                             Article  \
0  \n\nI am sure some bashers of Pens fans are pr...   
1  My brother is in the market for a high-perform...   
2  \n\n\n\n\tFinally you said what you dream abou...   
3  \nThink!\n\nIt's the SCSI card doing the DMA t...   
4  1)    I have an old Jasmine drive which I cann...   

                                       Clean Article  Target Label  \
0  sure bashers pens fans pretty confused lack ki...          10.0   
1  brother market highperformance video card supp...           3.0   
2  finally said dream about mediterranean new are...          17.0   
3  think scsi card dma transfers disks scsi card ...           3.0   
4   old jasmine drive cannot use new system under...           4.0   

                Target Name  
0          rec.sport.hockey  
1  comp.sys.ibm.p

## Save the cleaned file

In [16]:
data_df.to_csv('data/clean_newsgroups.csv', index=False)

## Confusion Matrix - Starting on Page 292
Building test and train data sets

In [18]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# load the data save to .csv from a_data.py
# this line isn't necessary unless you start from here
data_df = pd.read_csv('data/clean_newsgroups.csv')

train_corpus, test_corpus, train_label_nums, test_label_nums, \
    train_label_names, test_label_names = train_test_split(np.array(data_df['Clean Article']),
                                                          np.array(data_df['Target Label']),
                                                          np.array(data_df['Target Name']),
                                                          test_size=0.33, random_state=42)

from collections import Counter
trd  = dict(Counter(train_label_names))
tsd = dict(Counter(test_label_names))

print((pd.DataFrame([[key, trd[key], tsd[key]] for key in trd],
              columns=['Target Label', 'Train Count',
                       'Test Count']).sort_values(by=['Train Count', 'Test Count'], ascending=False)), '\n')

                Target Label  Train Count  Test Count
0                    sci.med          652         294
4         rec.sport.baseball          649         297
16            comp.windows.x          643         321
10              misc.forsale          642         300
12          rec.sport.hockey          640         320
14  comp.sys.ibm.pc.hardware          639         311
1              comp.graphics          638         303
6                  rec.autos          634         287
19     comp.sys.mac.hardware          634         284
13    soc.religion.christian          632         329
7                  sci.space          631         313
15           sci.electronics          627         315
3            rec.motorcycles          623         332
2      talk.politics.mideast          620         284
8                  sci.crypt          617         320
9    comp.os.ms-windows.misc          606         317
17        talk.politics.guns          582         291
11               alt.atheism

## Evaluating Classification Models
For some strange reason, the author uses a breast cancer dataset to show the model evaluation methods (versus the newsgroup dataset). 

### Confusion Matrix - Starting on Page 310

In [19]:
from sklearn import linear_model
from sklearn.datasets import load_breast_cancer
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=0.75,
                                                    test_size=0.25,
                                                    random_state=1234)

# train and build the model
logistic = linear_model.LogisticRegression(solver='lbfgs', max_iter=5000)
logistic.fit(X_train, y_train)
# predict on test data and view confusion matrix
y_pred = logistic.predict(X_test)

# note this is a standard package, not the one in the book on page 310
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
print('Confusion matrix: \n', confusion_matrix)

# Performance Metrics, starting on page 312
positive_class = 1
TP = confusion_matrix[1, 1]
FP = confusion_matrix[0, 1]
TN = confusion_matrix[0, 0]
FN = confusion_matrix[1, 0]
print(TP, FP, TN, FN, '\n')

Confusion matrix: 
 [[49  6]
 [ 2 86]]
86 6 49 2 



### Accuracy, Precision, and Recall

In [20]:
# accuracy
from sklearn.metrics import accuracy_score # standard
print('Framework Accuracy:', round(accuracy_score(y_test, y_pred), 5))
mc_acc = round((TP + TN) / (TP + TN + FP + FN), 5)
print('Manually Computed Accuracy:', mc_acc, '\n')

# precision
from sklearn.metrics import precision_score
print('Framework Precision:', round(precision_score(y_test, y_pred), 5))
mc_prec = round((TP) / (TP + FP), 5)
print('Manually Computed Precision:', mc_prec, '\n')

# recall
from sklearn.metrics import recall_score
print('Framework Recall:', round(recall_score(y_test, y_pred), 5))
mc_rec = round((TP) / (TP + FN), 5)
print('Manually computed Recall:', mc_rec, '\n')

from sklearn.metrics import f1_score
print('Framework F1-Score:', round(f1_score(y_test, y_pred), 5))
mc_f1 = round((2*mc_prec*mc_rec) / (mc_prec+mc_rec), 5)
print('Manually Computed F1-Score:', mc_f1)

Framework Accuracy: 0.94406
Manually Computed Accuracy: 0.94406 

Framework Precision: 0.93478
Manually Computed Precision: 0.93478 

Framework Recall: 0.97727
Manually computed Recall: 0.97727 

Framework F1-Score: 0.95556
Manually Computed F1-Score: 0.95555


## BOW Features with Classification Models - starting on page 315
We go back to the newsgroup dataset - you don't need to reload it, but I do so for completeness.

In [35]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

clean_df = pd.read_csv('data/clean_newsgroups.csv')

# cleaning the articles created some NaNs - get rid of those
data_df = clean_df[clean_df['Clean Article'].notna()]

# split to test and train
train_corpus, test_corpus, train_label_nums, test_label_nums, \
    train_label_names, test_label_names = train_test_split(np.array(data_df['Clean Article']),
                                                          np.array(data_df['Target Label']),
                                                          np.array(data_df['Target Name']),
                                                          test_size=0.33, random_state=42)

### Build BOW features on train articles

In [40]:
cv = CountVectorizer(binary=False, min_df=0.0, max_df=1.0)
cv_train_features = cv.fit_transform(train_corpus)

# transform test articles into features
cv_test_features = cv.transform(test_corpus)

print('Train features shape:', cv_train_features.shape)
print('Test features shape:', cv_test_features.shape, '\n')

Train features shape: (12072, 67312)
Test features shape: (5946, 67312) 



## Other Models - Starting on Page 316
This takes a while to run.

In [42]:
print('Naive Bayes:')
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB(alpha=1)
mnb.fit(cv_train_features, train_label_names)
mnb_bow_cv_scores = cross_val_score(mnb, cv_train_features, train_label_names, cv=5)
mnb_bow_cv_mean_score = np.mean(mnb_bow_cv_scores)
print('CV Accuracy (5-fold):', mnb_bow_cv_scores)
print('Mean CV Accuracy:', mnb_bow_cv_mean_score)
mnb_bow_test_score = mnb.score(cv_test_features, test_label_names)
print('Test Accuracy:', mnb_bow_test_score, '\n')

# Logistic Regression - starting on page 316
# This takes quite a while to run, be patient.
print('Logistic Regression:')
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty='l2', max_iter=1000, solver='lbfgs',
                        C=1, random_state=42, multi_class='auto')
lr.fit(cv_train_features, train_label_names)
lr_bow_cv_scores = cross_val_score(lr, cv_train_features, train_label_names, cv=5)
lr_bow_cv_mean_score = np.mean(lr_bow_cv_scores)
print('CV Accuracy (5-fold):', lr_bow_cv_scores)
print('Mean CV Accuracy:', lr_bow_cv_mean_score)
lr_bow_test_score = lr.score(cv_test_features, test_label_names)
print('Test Accuracy:', lr_bow_test_score, '\n')

# Support Vector Machines
print('SVM:')
from sklearn.svm import LinearSVC

svm = LinearSVC(penalty='l2', max_iter=1000, C=1, random_state=42)
svm.fit(cv_train_features, train_label_names)
svm_bow_cv_scores = cross_val_score(svm, cv_train_features, train_label_names, cv=5)
svm_bow_cv_mean_score = np.mean(svm_bow_cv_scores)
print('CV Accuracy (5-fold):', svm_bow_cv_scores)
print('Mean CV Accuracy:', svm_bow_cv_mean_score)
svm_bow_test_score = svm.score(cv_test_features, test_label_names)
print('Test Accuracy:', svm_bow_test_score, '\n')

# Random Forest
print('Random Forest')
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=10, random_state=42)
rfc.fit(cv_train_features, train_label_names)
rfc_bow_cv_scores = cross_val_score(rfc, cv_train_features, train_label_names, cv=5)
rfc_bow_cv_mean_score = np.mean(rfc_bow_cv_scores)
print('CV Accuracy (5-fold):', rfc_bow_cv_scores)
print('Mean CV Accuracy:', rfc_bow_cv_mean_score)
rfc_bow_test_score = rfc.score(cv_test_features, test_label_names)
print('Test Accuracy:', rfc_bow_test_score, '\n')

# Gradient Boosting Machines
# This takes quite a while to run...
print('Gradient Boosting')
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(n_estimators=10, random_state=42)
gbc.fit(cv_train_features, train_label_names)
gbc_bow_cv_scores = cross_val_score(gbc, cv_train_features, train_label_names, cv=5)
gbc_bow_cv_mean_score = np.mean(gbc_bow_cv_scores)
print('CV Accuracy (5-fold):', gbc_bow_cv_scores)
print('Mean CV Accuracy:', gbc_bow_cv_mean_score)
gbc_bow_test_score = gbc.score(cv_test_features, test_label_names)
print('Test Accuracy:', gbc_bow_test_score)

Naive Bayes:
CV Accuracy (5-fold): [0.71546392 0.69549028 0.69014085 0.70705394 0.69534497]
Mean CV Accuracy: 0.7006987905228245
Test Accuracy: 0.7233434241506895 

Logistic Regression:
CV Accuracy (5-fold): [0.68123711 0.65742656 0.65824358 0.65975104 0.67414796]
Mean CV Accuracy: 0.6661612510293118
Test Accuracy: 0.6907164480322906 

SVM:
CV Accuracy (5-fold): [0.63793814 0.62019032 0.62551781 0.62448133 0.63923525]
Mean CV Accuracy: 0.6294725697373328
Test Accuracy: 0.6560713084426505 

Random Forest
CV Accuracy (5-fold): [0.53030928 0.53868432 0.52278376 0.52946058 0.56109726]
Mean CV Accuracy: 0.536467039383467
Test Accuracy: 0.5613858055835856 

Gradient Boosting
CV Accuracy (5-fold): [0.52783505 0.51634257 0.52236951 0.55352697 0.54862843]
Mean CV Accuracy: 0.5337405072102663
Test Accuracy: 0.5455768583921964


## TF-IDF Features with Classification Models - Starting on page 319

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

# build BOW features on train articles
tv = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0)
tv_train_features = tv.fit_transform(train_corpus)

# transform test articles into features
tv_test_features = tv.transform(test_corpus)

print('Train features shape:', tv_train_features.shape)
print(' Test features shape:', tv_test_features.shape, '\n')

Train features shape: (12072, 67312)
 Test features shape: (5946, 67312) 



## Building Models with TF-IDF Features
This takes a while to run...

In [45]:
# Naive Bayes
print('Naive Bayes:')
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

mnb = MultinomialNB(alpha=1)
mnb.fit(tv_train_features, train_label_names)
mnb_tfidf_cv_scores = cross_val_score(mnb, tv_train_features, train_label_names, cv=5)
mnb_tfidf_cv_mean_score = np.mean(mnb_tfidf_cv_scores)
print('CV Accuracy (5-fold):', mnb_tfidf_cv_scores)
print('Mean CV Accuracy:', mnb_tfidf_cv_mean_score)
mnb_tfidf_test_score = mnb.score(tv_test_features, test_label_names)
print('Test Accuracy:', mnb_tfidf_test_score, '\n')

# Logistic Regression
print('Logistic Regression:')
# This takes quite a while to run, be patient.
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty='l2', max_iter=100, solver='lbfgs',
                        C=1, random_state=42, multi_class='auto')
lr.fit(tv_train_features, train_label_names)
lr_tfidf_cv_scores = cross_val_score(lr, tv_train_features, train_label_names, cv=5)
lr_tfidf_cv_mean_score = np.mean(lr_tfidf_cv_scores)
print('CV Accuracy (5-fold):', lr_tfidf_cv_scores)
print('Mean CV Accuracy:', lr_tfidf_cv_mean_score)
lr_tfidf_test_score = lr.score(tv_test_features, test_label_names)
print('Test Accuracy:', lr_tfidf_test_score, '\n')

# Support Vector Machines
print('Support Vector Machines:')
from sklearn.svm import LinearSVC

svm = LinearSVC(penalty='l2', C=1, random_state=42)
svm.fit(tv_train_features, train_label_names)
svm_tfidf_cv_scores = cross_val_score(svm, tv_train_features, train_label_names, cv=5)
svm_tfidf_cv_mean_score = np.mean(svm_tfidf_cv_scores)
print('CV Accuracy (5-fold):', svm_tfidf_cv_scores)
print('Mean CV Accuracy:', svm_tfidf_cv_mean_score)
svm_tfidf_test_score = svm.score(tv_test_features, test_label_names)
print('Test Accuracy:', svm_tfidf_test_score, '\n')

# SVM with Stochastic Gradient Descent
print('SVM With Gradient Descent:')
from sklearn.linear_model import SGDClassifier

svm_sgd = SGDClassifier(loss='hinge', penalty='l2', max_iter=50, tol=1e-3)
svm_sgd.fit(tv_train_features, train_label_names)
svmsgd_tfidf_cv_scores = cross_val_score(svm_sgd, tv_train_features, train_label_names, cv=5)
svmsg_tfidf_cv_mean_score = np.mean(svmsgd_tfidf_cv_scores)
print('CV Accuracy (5-fold):', svmsgd_tfidf_cv_scores)
print('Mean CV Accuracy:', svmsg_tfidf_cv_mean_score)
svmsgd_tfidf_test_score = svm_sgd.score(tv_test_features, test_label_names)
print('Test Accuracy:', svmsgd_tfidf_test_score, '\n')

# Random Forest
print('Random Forest:')
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=10, random_state=42) 
rfc.fit(tv_train_features, train_label_names)
rfc_tfidf_cv_scores = cross_val_score(rfc, tv_train_features, train_label_names, cv=5)
rfc_tfidf_cv_mean_score = np.mean(rfc_tfidf_cv_scores)
print('CV Accuracy (5-fold):', rfc_tfidf_cv_scores)
print('Mean CV Accuracy:', rfc_tfidf_cv_mean_score)
rfc_tfidf_test_score = rfc.score(tv_test_features, test_label_names)
print('Test Accuracy:', rfc_tfidf_test_score, '\n')

# Gradient Boosting
print('Grandient Boosting:')
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(n_estimators=10, random_state=42)
gbc.fit(tv_train_features, train_label_names)
gbc_tfidf_cv_scores = cross_val_score(gbc, tv_train_features, train_label_names, cv=5)
gbc_tfidf_cv_mean_score = np.mean(gbc_tfidf_cv_scores)
print('CV Accurance (5-fold):', gbc_tfidf_cv_scores)
print('Mean CV Accuracy:', gbc_tfidf_cv_mean_score)
gbc_tfidf_test_score = gbc.score(tv_test_features, test_label_names)
print('Test Accuracy:', gbc_tfidf_test_score, '\n')

Naive Bayes:
CV Accuracy (5-fold): [0.70721649 0.69962764 0.68889809 0.70788382 0.6957606 ]
Mean CV Accuracy: 0.6998773285585533
Test Accuracy: 0.7119071644803229 

Logistic Regression:
CV Accuracy (5-fold): [0.74762887 0.72776169 0.72783761 0.74273859 0.72942643]
Mean CV Accuracy: 0.7350786382136094
Test Accuracy: 0.7514295324587958 

Support Vector Machines:
CV Accuracy (5-fold): [0.75051546 0.74141498 0.73695112 0.75809129 0.74729842]
Mean CV Accuracy: 0.7468542533119572
Test Accuracy: 0.767238479650185 

SVM With Gradient Descent:
CV Accuracy (5-fold): [0.76123711 0.74886223 0.74233637 0.76348548 0.74480466]
Mean CV Accuracy: 0.752145168535528
Test Accuracy: 0.7684157416750756 

Random Forest:
CV Accuracy (5-fold): [0.54927835 0.51592884 0.54598177 0.54149378 0.5361596 ]
Mean CV Accuracy: 0.5377684675678408
Test Accuracy: 0.5593676421123445 

Grandient Boosting:
CV Accurance (5-fold): [0.53690722 0.51592884 0.52568351 0.55186722 0.54904406]
Mean CV Accuracy: 0.535886168636141
Test 

### Comparative Model Performance - starting on page 322
This also take a while to run.

In [46]:
# Uses only TF-IDF
print(pd.DataFrame(
    [['NB', mnb_tfidf_cv_mean_score, mnb_tfidf_test_score],
    ['LR', lr_tfidf_cv_mean_score, lr_tfidf_test_score],
    ['SVM', svm_tfidf_cv_mean_score, svm_tfidf_test_score],
    ['SGD', svmsg_tfidf_cv_mean_score, svmsgd_tfidf_test_score],
    ['RF', rfc_tfidf_cv_mean_score, rfc_tfidf_test_score],
    ['GBM', gbc_tfidf_cv_mean_score, gbc_tfidf_test_score]],
    columns=['Model', 'CV Score (TF-IDF)', 'Test Score (TF-IDF)']))

  Model  CV Score (TF-IDF)  Test Score (TF-IDF)
0    NB           0.699877             0.711907
1    LR           0.735079             0.751430
2   SVM           0.746854             0.767238
3   SGD           0.752145             0.768416
4    RF           0.537768             0.559368
5   GBM           0.535886             0.545745


## Embeddings - Starting on page 323

This next block of code is not part of the book, but avoids loading the entire Tokenizer class.

In [47]:
# this is not in the book but is part of the author's normalization.py file
def tokenize_text(text):
    tokens = nltk.word_tokenize(text) 
    tokens = [token.strip() for token in tokens]
    return tokens

### Word2Vec Embeddings with Classification Models
This takes a while to run.

In [48]:
def document_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)

    def average_word_vectors(words, model, vocabulary, num_features):
        feature_vector = np.zeros((num_features,), dtype='float64')
        num_words = 0. 

        for word in words:
            if word in vocabulary:
                num_words = num_words + 1. 
                feature_vector = np.add(feature_vector, model.wv[word])
        if num_words:
            feature_vector = np.divide(feature_vector, num_words)
        
        return feature_vector
    
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                for tokenized_sentence in corpus]
    return np.array(features)

# tokenize corpus
tokenized_train = [tokenize_text(text) for text in train_corpus]
tokenized_test = [tokenize_text(text) for text in test_corpus]

# generate word2vec word embeddings
import gensim
# build word2vec model
w2v_num_features = 1000
w2v_model = gensim.models.Word2Vec(tokenized_train, size=w2v_num_features, window=100, 
                                   min_count=2, sample=1e-3, sg=1, iter=5, workers=10)

### Generate document level embeddings
Remember we only use train dataset vocabulary embeddings so that test dateset truly remains an unseen dataset

#### Generate averaged word vector features from word2vec model

In [49]:
avg_wv_train_features = document_vectorizer(corpus=tokenized_train, model=w2v_model, 
                                            num_features=w2v_num_features)
avg_wv_test_features = document_vectorizer(corpus=tokenized_test, model=w2v_model, 
                                           num_features=w2v_num_features)

print('Word2VTrain features shape:', avg_wv_train_features.shape)
print('Test features shape:', avg_wv_test_features.shape, '\n')

Word2VTrain features shape: (12072, 1000)
Test features shape: (5946, 1000) 



## Create SGDClassiferi Model with Word2Vec

In [50]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier

svm = SGDClassifier(loss='hinge', penalty='l2', max_iter=50, tol=1e-3)
svm.fit(avg_wv_train_features, train_label_names)
svm_w2v_cv_scores = cross_val_score(svm, avg_wv_train_features, train_label_names, cv=5)
svm_w2v_cv_mean_score = np.mean(svm_w2v_cv_scores)
print('CV Accuracy (5-fold):', svm_w2v_cv_scores)
print('Mean CV Accuracy:', svm_w2v_cv_mean_score)
svm_w2v_test_score = svm.score(avg_wv_test_features, test_label_names)
print('Test Accuracy:', svm_w2v_test_score, '\n')

CV Accuracy (5-fold): [0.72907216 0.73438146 0.72576636 0.72365145 0.72693267]
Mean CV Accuracy: 0.7279608226137078
Test Accuracy: 0.7221661621257989 



Skipping GloVe and FastText - same ideas as W2V and take a while to run

Skipping the neural network - also computationally expensive and hasn't been introduced

## Model Tuning and Performance - Starting on Page 329
### Tuning our Multinomial Naive Bayes Model - this take a while to run (but not too long)

In [51]:
import numpy as np
import nltk
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

mnb_pipeline = Pipeline([('tfidf', TfidfVectorizer()), ('mnb', MultinomialNB())])
param_grid = {'tfidf__ngram_range': [(1, 1), (1, 2)], 'mnb__alpha': [1e-5, 1e-4, 1e-2, 1e-1, 1]}

gs_mnb = GridSearchCV(mnb_pipeline, param_grid, cv=5, verbose=2)
gs_mnb = gs_mnb.fit(train_corpus, train_label_names)

print(gs_mnb.best_estimator_.get_params(), '\n')

cv_results = gs_mnb.cv_results_
results_df = pd.DataFrame({'rank': cv_results['rank_test_score'],
                            'params': cv_results['params'], 
                            'cv score (mean)': cv_results['mean_test_score'],
                            'cv score (std)': cv_results['std_test_score']})
results_df = results_df.sort_values(by=['rank'], ascending=True)
pd.set_option('display.max_colwidth', 100)
print('Modeling tuning results DF:', results_df, '\n')

best_mnb_test_score = gs_mnb.score(test_corpus, test_label_names)
print('Test Accuracy:', best_mnb_test_score, '\n')

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] mnb__alpha=1e-05, tfidf__ngram_range=(1, 1) .....................
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] ...... mnb__alpha=1e-05, tfidf__ngram_range=(1, 1), total=   0.9s
[CV] mnb__alpha=1e-05, tfidf__ngram_range=(1, 1) .....................
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.6s remaining:    0.0s
[CV] ...... mnb__alpha=1e-05, tfidf__ngram_range=(1, 1), total=   0.9s
[CV] mnb__alpha=1e-05, tfidf__ngram_range=(1, 1) .....................
[CV] ...... mnb__alpha=1e-05, tfidf__ngram_range=(1, 1), total=   0.9s
[CV] mnb__alpha=1e-05, tfidf__ngram_range=(1, 1) .....................
[CV] ...... mnb__alpha=1e-05, tfidf__ngram_range=(1, 1), total=   0.9s
[CV] mnb__alpha=1e-05, tfidf__ngram_range=(1, 1) .....................
[CV] ...... mnb__alpha=1e-05, tfidf__ngram_range=(1, 1), total=   0.9s
[CV] mnb__alpha=1e-05, tfidf__ngram_range=(1, 2) ...................

## Tuning our Logistic Regression Model
Also takes a while to run

In [52]:
from sklearn.linear_model import LogisticRegression
lr_pipeline = Pipeline([('tfidf', TfidfVectorizer()), ('lr', LogisticRegression(penalty='l2', 
                          max_iter=100, random_state=42))])
param_grid = {'tfidf__ngram_range': [(1, 1), (1, 2)], 'lr__C': [1, 5, 10]}

gl_lr = GridSearchCV(lr_pipeline, param_grid, cv=5, verbose=2)
gl_lr = gl_lr.fit(train_corpus, train_label_names)

# evaluate best tuned model on the test dataset
best_lr_test_score = gl_lr.score(test_corpus, test_label_names)
print('Test Accuracy:', best_lr_test_score, '\n')

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] lr__C=1, tfidf__ngram_range=(1, 1) ..............................
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] ............... lr__C=1, tfidf__ngram_range=(1, 1), total=   4.2s
[CV] lr__C=1, tfidf__ngram_range=(1, 1) ..............................
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.8s remaining:    0.0s
[CV] ............... lr__C=1, tfidf__ngram_range=(1, 1), total=   4.1s
[CV] lr__C=1, tfidf__ngram_range=(1, 1) ..............................
[CV] ............... lr__C=1, tfidf__ngram_range=(1, 1), total=   4.3s
[CV] lr__C=1, tfidf__ngram_range=(1, 1) ..............................
[CV] ............... lr__C=1, tfidf__ngram_range=(1, 1), total=   4.2s
[CV] lr__C=1, tfidf__ngram_range=(1, 1) ..............................
[CV] ............... lr__C=1, tfidf__ngram_range=(1, 1), total=   4.1s
[CV] lr__C=1, tfidf__ngram_range=(1, 2) .............................

## Tuning the Linear SVM Model
Also takes a while to run

In [53]:
from sklearn.svm import LinearSVC
svm_pipeline = Pipeline([('tfidf', TfidfVectorizer()), ('svm', LinearSVC(random_state=42))])
param_grid = {'tfidf__ngram_range': [(1, 1), (1, 2)], 'svm__C': [0.01, 0.1, 1, 5]}

gs_svm = GridSearchCV(svm_pipeline, param_grid, cv=5, verbose=2)
gs_svm = gs_svm.fit(train_corpus, train_label_names)

# evaluating best tuned model on the data set
best_svm_test_score = gs_svm.score(test_corpus, test_label_names)
print('Test Accuracy:', best_svm_test_score)

mnb_predictions = gs_mnb.predict(test_corpus)
unique_classes = list(set(test_label_names))
print('Unique classes:', unique_classes, '\n')

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] svm__C=0.01, tfidf__ngram_range=(1, 1) ..........................
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 1), total=   1.5s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 1) ..........................
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.1s remaining:    0.0s
[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 1), total=   1.4s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 1) ..........................
[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 1), total=   1.5s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 1) ..........................
[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 1), total=   1.5s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 1) ..........................
[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 1), total=   1.5s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 2) .........................