# Part 1. Using ReviewText Only for Classification

In [1]:
import pandas as pd
import numpy as np
import time
import nltk
#nltk.download('stopwords')    # this is done just once
import sklearn
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV, KFold
from sklearn import metrics
from platform import python_version
print('Python {}'.format(python_version()))
print('Scikit-learn {}.'.format(sklearn.__version__))

Python 3.7.1
Scikit-learn 0.20.1.


#### 1. Read in json, get only the text data and labels cols; change labels to 'neg', 'mixed', and 'pos' only

In [2]:
%%time
path=''
file='kindle_reviews.json'
df = pd.read_json(path_or_buf=path+file, lines=True, encoding='utf-8')    #, orient=None, typ='frame', dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=True, numpy=False, precise_float=False, date_unit=None, encoding=None, chunksize=None, compression='infer')
print('Length of text: {}'.format(len(df)))

Length of text: 982619
Wall time: 13.1 s


In [3]:
df.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,B000F83SZQ,"[0, 0]",5,I enjoy vintage books and movies so I enjoyed ...,"05 5, 2014",A1F6404F1VG29J,Avidreader,Nice vintage story,1399248000
1,B000F83SZQ,"[2, 2]",4,This book is a reissue of an old one; the auth...,"01 6, 2014",AN0N05A9LIJEQ,critters,Different...,1388966400
2,B000F83SZQ,"[2, 2]",4,This was a fairly interesting read. It had ol...,"04 4, 2014",A795DMNCJILA6,dot,Oldie,1396569600
3,B000F83SZQ,"[1, 1]",5,I'd never read any of the Amy Brewster mysteri...,"02 19, 2014",A1FV0SX13TWVXQ,"Elaine H. Turley ""Montana Songbird""",I really liked it.,1392768000
4,B000F83SZQ,"[0, 1]",4,"If you like period pieces - clothing, lingo, y...","03 19, 2014",A3SPTOKDG7WBLN,Father Dowling Fan,Period Mystery,1395187200


In [4]:
len(df.asin.unique())

61934

In [5]:
df_text = df[['reviewText', 'overall']].copy()        # copy only certain columns to another df
#df = None                                             # release memory

In [6]:
df_text.head()

Unnamed: 0,reviewText,overall
0,I enjoy vintage books and movies so I enjoyed ...,5
1,This book is a reissue of an old one; the auth...,4
2,This was a fairly interesting read. It had ol...,4
3,I'd never read any of the Amy Brewster mysteri...,5
4,"If you like period pieces - clothing, lingo, y...",4


In [7]:
# convert 5 ratings to 3 ('neg', 'mixed', and 'pos')
df_text['overall'] = df_text['overall'].apply(lambda x: 'pos' if x > 3 else 'neg' if x < 3 else 'mixed')

In [8]:
df_text.head()

Unnamed: 0,reviewText,overall
0,I enjoy vintage books and movies so I enjoyed ...,pos
1,This book is a reissue of an old one; the auth...,pos
2,This was a fairly interesting read. It had ol...,pos
3,I'd never read any of the Amy Brewster mysteri...,pos
4,"If you like period pieces - clothing, lingo, y...",pos


In [9]:
print('Total length of the dataset is {}'.format(len(df_text)))

Total length of the dataset is 982619


#### 2. Reduce size of data, get reduced-size dataset and labels

In [10]:
# get a random sample from the dataframe whose size is manageable for cross-validation and grid search
# with more computing resources and/or time, this can be done on a larger data set
length = len(df_text)
df_text = df_text.sample(n=length)
df_text_short = df_text.sample(n=int(length/23))
print('Length of the sample is {}'.format(len(df_text_short)))

Length of the sample is 42722


In [11]:
# count of unique labels
df_text_short.overall.value_counts()

pos      35988
mixed     4158
neg       2576
Name: overall, dtype: int64

In [12]:
# this is our train set and labels
data = df_text_short['reviewText'].values
labels = df_text_short['overall'].values

#### 3. Experimenting with different lists of stopwords and selecting the most efficient one

In [13]:
# GENERATING A LIST OF STOPWORDS
# these various stopword lists and the combined joint list were tested on the same classifier (MultinomialNB) with the same
# parameters, and it was found that the lemur list and the combined list, the latter includes the former, were the most
# efficient ones

from sklearn.feature_extraction import stop_words    
from nltk.corpus import stopwords                    
 
print('Sklearn:')
stopwords_sklearn = list(stop_words.ENGLISH_STOP_WORDS)        # 318 words
print(len(stopwords_sklearn))
print(stopwords_sklearn)

print('\nNLTK:')
stopwords_nltk = list(stopwords.words('english'))              # 180 words
print(len(stopwords_nltk))
print(stopwords_nltk)

print('\nLemur')                                               # 430 words
stopwords_lemur = []
with open('lemur_stopwords.txt') as f:
    for line in f:
        line = line.strip()
        stopwords_lemur.append(line)
print(len(stopwords_lemur))
print(stopwords_lemur)

print('\nOther:')                                              # 153 words
stopwords_other = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]
print(len(stopwords_other))
print(stopwords_other)

print('\nCOMBINED:')                                           # 579 words
stopwords_combined = list(set(stopwords_sklearn + stopwords_nltk + stopwords_lemur + stopwords_other))
print(len(stopwords_combined))
print(stopwords_combined)

Sklearn:
318
['her', 'most', 'hundred', 'please', 'of', 'much', 'take', 'same', 'toward', 'otherwise', 'from', 'neither', 'such', 'who', 'get', 'under', 'together', 'whether', 'two', 'beyond', 'and', 'whence', 'bill', 'further', 'above', 'elsewhere', 'three', 'third', 'besides', 'whoever', 'upon', 'herein', 'yourself', 'done', 'into', 'hasnt', 'an', 'because', 'can', 'these', 'eleven', 'with', 'were', 're', 'your', 'about', 'meanwhile', 'four', 'now', 'no', 'mostly', 'part', 'while', 'my', 'what', 'off', 'first', 'also', 'hence', 'therefore', 'serious', 'already', 'amount', 'his', 'beside', 'whereby', 'cant', 'down', 'across', 'whole', 'de', 'all', 'will', 'should', 'after', 'any', 'there', 'ourselves', 'anyhow', 'least', 'through', 'against', 'bottom', 'nor', 'nothing', 'hereupon', 'thereby', 'every', 'thru', 'is', 'might', 'thereupon', 'fifty', 'ten', 'nine', 'system', 'being', 'back', 'again', 'the', 'myself', 'below', 'in', 'becomes', 'thin', 'too', 'why', 'none', 'nowhere', 'him',

#### 4. Selecting classifier(s)

In [14]:
# SELECTING THE MOST EFFICIENT CLASSIFIER ON A LIMITED DATASET

# potential candidates
clfs = [MultinomialNB(),
        svm.LinearSVC(),
        LogisticRegression(),
        KNeighborsClassifier(n_neighbors=3),
        GradientBoostingClassifier(),
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        SGDClassifier()]

# vectorize data
vectorizer = TfidfVectorizer(analyzer='word', stop_words=stopwords_combined, min_df=5, max_df=0.25, ngram_range=(1, 2))
matrix = vectorizer.fit_transform(data)

# try each classifier on the data
for clf in clfs:
    #scoring = ['precision_macro', 'recall_macro']        # if using this, add scoring=scoring to cross_validate()
    scores = cross_validate(clf, matrix, labels, cv=3)
    print('---------------------------------')
    print(str(clf))
    print('-----------------------------------')
    for key, values in scores.items():
            print(key,' mean ', values.mean())
            print(key,' std ', values.std())

---------------------------------
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
-----------------------------------
fit_time  mean  0.10594948132832845
fit_time  std  0.0036903773744526307
score_time  mean  0.015373150507609049
score_time  std  0.0018912696447948786
test_score  mean  0.8424465172431659
test_score  std  8.267247775353001e-05
train_score  mean  0.8425869594139265
train_score  std  0.00011325272724816022
---------------------------------
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
-----------------------------------
fit_time  mean  0.7182605266571045
fit_time  std  0.05156395322007344
score_time  mean  0.01570590337117513
score_time  std  0.0004716009225517741
test_score  mean  0.8688263821003185
test_score  std  0.0016243916891886861
train_score  mean  0.9914212828686236
train_score  std  7



---------------------------------
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
-----------------------------------
fit_time  mean  1.1099553108215332
fit_time  std  0.0013917528212170854
score_time  mean  0.017703453699747723
score_time  std  0.0012482757471740434
test_score  mean  0.8591358245315327
test_score  std  0.0008457620284527822
train_score  mean  0.8786690633291814
train_score  std  0.0005918632026690911
---------------------------------
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')
-----------------------------------
fit_time  mean  0.060829718907674156
fit_time  std  0.004935014238985133
score_time  mean  13.998656431833902
score_time  std



---------------------------------
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
-----------------------------------
fit_time  mean  5.996913512547811
fit_time  std  0.14431630208935697
score_time  mean  0.23264670372009277
score_time  std  0.0035892214909454324
test_score  mean  0.849702709931524
test_score  std  0.0006387146382114112
train_score  mean  0.9915617265536559
train_score  std  0.000129144100673023




---------------------------------
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)
-----------------------------------
fit_time  mean  0.17615342140197754
fit_time  std  0.00635992226009191
score_time  mean  0.023063182830810547
score_time  std  0.0028364829064531773
test_score  mean  0.8544075820144768
test_score  std  0.0007449991939459659
train_score  mean  0.8675155548935862
train_score  std  0.0009199755891669019


Based on an analysis of mean train scores, test scores, their standard deviations, and the history of use for text data, for the purposes of this project I decided to go forward with SVM and Naive Bayes, but in real life I would also do a cross-validated grid search for Logistic Regression, Random Forest, Gradient Boosting, or SGD classifiers. Of course, that would require much more computing resources OR time.

#### 5. Straightforward Implementation of a text classifier (as a benchmark)

In [15]:
# simple straightforward
def clf_simple(classifier, data, labels):

    # split data into train and test sets; use TfidfVectorizer
    trainX, testX, trainY, testY = train_test_split(data, labels, test_size = 0.2, random_state = 43)
    vectorizer = TfidfVectorizer(analyzer='word', stop_words=stopwords_combined, min_df=5, max_df=0.5, ngram_range=(1, 2))
    matrix_train = vectorizer.fit_transform(trainX)    # lowercase=True by default, initially min_df=15, max_df=0.23
    matrix_test = vectorizer.transform(testX)
                   
    # get classifier's name to print results; otherwise, this function needs another argument
    clf_string = str(classifier)
    idx = clf_string.find("(")
    classifier_name = clf_string[:idx]
    
    # fit classifier
    clf = classifier
    clf = clf.fit(matrix_train, trainY)  
        
    # predict and compute metrics    
    predictions=clf.predict(matrix_test)
    score = metrics.f1_score(testY, predictions, average='micro')
    cm    = metrics.confusion_matrix(testY, predictions)
    print('{} F-1 score:   {:0.4f}'.format(classifier_name, score))
    print('Confusion matrix:')
    print(cm)
    print()

#### 6. Text Classifier with Pipeline (as a benchmark)

In [16]:
# pipeline
def clf_pipe(classifier, data, labels):

    # split data into train and test sets; create pipeline
    trainX, testX, trainY, testY = train_test_split(data, labels, test_size = 0.2, random_state = 43)
    clf = Pipeline([('vect', CountVectorizer(analyzer='word', stop_words=stopwords_combined, min_df=5, max_df=0.5, ngram_range=(1, 2))),  # lowercase=True by default
                   ('tfidf', TfidfTransformer()),                                                                # initially min_df=15, max_df=0.23,
                   ('clf', classifier),
                 ])   
       
    # get classifier's name to print results; otherwise, this function needs another argument
    clf_string = str(classifier)
    idx = clf_string.find("(")
    classifier_name = clf_string[:idx]
        
    # fit classifier, predict, and compute metrics
    clf = clf.fit(trainX, trainY)
    predictions=clf.predict(testX)
    score = metrics.f1_score(testY, predictions, average='micro')
    cm    = metrics.confusion_matrix(testY, predictions)
    print('{} F-1 score:   {:0.4f}'.format(classifier_name, score))
    print('Confusion matrix:')
    print(cm)
    print()

#### 7. Text Classifier Using 5-Fold Cross-Validation

In [17]:
# with cross_val_score
def clf_cv(classifier, data, labels):

    # split data into train and test sets; create pipeline    
    trainX, testX, trainY, testY = train_test_split(data, labels, test_size = 0.2, random_state = 43)
    clf = Pipeline([('vect', CountVectorizer(analyzer='word', stop_words=stopwords_combined, min_df=5, max_df=0.5, ngram_range=(1, 2))),  # lowercase=True by default
                   ('tfidf', TfidfTransformer()),                                                                # initially min_df=15, max_df=0.23,
                   ('clf', classifier),
                 ])   
        
    # get classifier's name to print results; otherwise, this function needs another argument
    clf_string = str(classifier)
    idx = clf_string.find("(")
    classifier_name = clf_string[:idx]
        
    # fit classifier, predict, and compute metrics
    clf = clf.fit(trainX, trainY)
    seed = 7
    kfold = KFold(n_splits=5, shuffle=True, random_state=seed)
    scores = cross_val_score(clf, trainX, trainY, cv=kfold, scoring='f1_micro')
    print('Cross-validated Accuracy of {}: {:0.4f} +/- {:0.4f}'.format(classifier_name, scores.mean(), scores.std() * 2))
    predictions = clf.predict(testX)
    cm          = metrics.confusion_matrix(testY, predictions)
    print('Confusion matrix:')
    print(cm)
    print()

#### 8. Text Classifier with Cross-Validated Parameter Grid Search

In [18]:
# GridSearchCV
def clf_GridSearchCV(classifier, data, labels, param_grid):

    # split data into train and test sets; create pipeline    
    trainX, testX, trainY, testY = train_test_split(data, labels, test_size = 0.2, random_state = 43)
    clf = Pipeline([('vect', CountVectorizer(analyzer='word', stop_words=stopwords_combined, min_df=5, max_df=0.5, ngram_range=(1, 2))),  # lowercase=True by default
                   ('tfidf', TfidfTransformer()), 
                   ('clf', classifier),
                 ])   
            
    # get classifier's name to print results; otherwise, this function needs another argument
    clf_string = str(classifier)
    idx = clf_string.find("(")
    classifier_name = clf_string[:idx]
    
    # do 3-fold cross validation for each of the possible combinations of the parameter values above
    grid = GridSearchCV(clf, cv=3, param_grid=param_grid, scoring='f1_micro')
    grid.fit(trainX, trainY)

    # summarize results
    print("Best: %f using %s" % (grid.best_score_, 
        grid.best_params_))
    means = grid.cv_results_['mean_test_score']
    stds = grid.cv_results_['std_test_score']
    params = grid.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))

    # train and predict on test instances using the best configs found in the CV step
        
    #predictions = grid.best_estimator_.predict(testX)                   # this is how to find the best estimator 
    #testX = grid.best_estimator_.named_steps['tfidf'].transform(testX)  # this is how to find indiv. components (same for pipeline)
    predictions=grid.predict(testX)                                      # called on the best estimator by default
    score = metrics.f1_score(testY, predictions, average='micro')
    cm    = metrics.confusion_matrix(testY, predictions)
    print('{} cross-validated F-1 score with grid search: {:0.4f}'.format(classifier_name, score))
    print('Confusion matrix:')
    print(cm)
    print()
    
    # return the best classifier to run it on the full dataset    
    return grid.best_estimator_

#### 9. Running 7 options on the limited data set

In [19]:
%%time
# MultinomialNB()
nb = MultinomialNB()
nb_param_grid = {
        'vect__max_df':[0.25, 0.5,0.75],
        'vect__min_df':[5,15,25,50,100],
        'vect__ngram_range':[(1, 1),(1, 2),(1, 3)],
        'clf__alpha':[0.1,0.25,0.5,0.75,1.0]
    }
clf_simple(nb, data, labels)    # straightforward NB

MultinomialNB F-1 score:   0.8428
Confusion matrix:
[[   1    0  820]
 [   0    2  523]
 [   0    0 7199]]

Wall time: 9.91 s


In [20]:
%%time
clf_pipe(nb, data, labels)    # pipeline NB

MultinomialNB F-1 score:   0.8428
Confusion matrix:
[[   1    0  820]
 [   0    2  523]
 [   0    0 7199]]

Wall time: 9.96 s


In [21]:
%%time
clf_cv(nb, data, labels)    # cross_val_score NB

Cross-validated Accuracy of MultinomialNB: 0.8426 +/- 0.0108
Confusion matrix:
[[   1    0  820]
 [   0    2  523]
 [   0    0 7199]]

Wall time: 49.3 s


In [22]:
%%time
best_NB = clf_GridSearchCV(nb, data, labels, nb_param_grid)    # parameter grid search NB

Best: 0.857419 using {'clf__alpha': 0.1, 'vect__max_df': 0.25, 'vect__min_df': 15, 'vect__ngram_range': (1, 2)}
0.852825 (0.000207) with: {'clf__alpha': 0.1, 'vect__max_df': 0.25, 'vect__min_df': 5, 'vect__ngram_range': (1, 1)}
0.857185 (0.000927) with: {'clf__alpha': 0.1, 'vect__max_df': 0.25, 'vect__min_df': 5, 'vect__ngram_range': (1, 2)}
0.857272 (0.000871) with: {'clf__alpha': 0.1, 'vect__max_df': 0.25, 'vect__min_df': 5, 'vect__ngram_range': (1, 3)}
0.852035 (0.000391) with: {'clf__alpha': 0.1, 'vect__max_df': 0.25, 'vect__min_df': 15, 'vect__ngram_range': (1, 1)}
0.857419 (0.000426) with: {'clf__alpha': 0.1, 'vect__max_df': 0.25, 'vect__min_df': 15, 'vect__ngram_range': (1, 2)}
0.857389 (0.000639) with: {'clf__alpha': 0.1, 'vect__max_df': 0.25, 'vect__min_df': 15, 'vect__ngram_range': (1, 3)}
0.850865 (0.001045) with: {'clf__alpha': 0.1, 'vect__max_df': 0.25, 'vect__min_df': 25, 'vect__ngram_range': (1, 1)}
0.855722 (0.000345) with: {'clf__alpha': 0.1, 'vect__max_df': 0.25, 'vec

MultinomialNB cross-validated F-1 score with grid search: 0.8624
Confusion matrix:
[[  82   24  715]
 [  33  121  371]
 [  25    8 7166]]

Wall time: 1h 56min 20s


In [23]:
%%time
svc = svm.LinearSVC()
svc_param_grid = {
    'vect__max_df':[0.25,0.5,0.75],
    'vect__min_df':[5,15,25,50,100],
    'vect__ngram_range':[(1, 1),(1, 2),(1, 3)],
    'clf__C':[0.1,0.25,0.5,0.75,1.0]
}
clf_pipe(svc, data, labels)            # pipeline SVM

LinearSVC F-1 score:   0.8674
Confusion matrix:
[[ 193   66  562]
 [  94  223  208]
 [ 157   46 6996]]

Wall time: 10.7 s


In [24]:
%%time
clf_cv(svc, data, labels)             # cross_val_score SVM

Cross-validated Accuracy of LinearSVC: 0.8674 +/- 0.0083
Confusion matrix:
[[ 193   66  562]
 [  94  223  208]
 [ 157   46 6996]]

Wall time: 53.3 s


In [25]:
%%time
best_SVM = clf_GridSearchCV(svc, data, labels, svc_param_grid)    # parameter grid search SVM

Best: 0.868654 using {'clf__C': 0.75, 'vect__max_df': 0.5, 'vect__min_df': 5, 'vect__ngram_range': (1, 3)}
0.859145 (0.000846) with: {'clf__C': 0.1, 'vect__max_df': 0.25, 'vect__min_df': 5, 'vect__ngram_range': (1, 1)}
0.857302 (0.000702) with: {'clf__C': 0.1, 'vect__max_df': 0.25, 'vect__min_df': 5, 'vect__ngram_range': (1, 2)}
0.857389 (0.000689) with: {'clf__C': 0.1, 'vect__max_df': 0.25, 'vect__min_df': 5, 'vect__ngram_range': (1, 3)}
0.859701 (0.000639) with: {'clf__C': 0.1, 'vect__max_df': 0.25, 'vect__min_df': 15, 'vect__ngram_range': (1, 1)}
0.860111 (0.000103) with: {'clf__C': 0.1, 'vect__max_df': 0.25, 'vect__min_df': 15, 'vect__ngram_range': (1, 2)}
0.859994 (0.000331) with: {'clf__C': 0.1, 'vect__max_df': 0.25, 'vect__min_df': 15, 'vect__ngram_range': (1, 3)}
0.860198 (0.000113) with: {'clf__C': 0.1, 'vect__max_df': 0.25, 'vect__min_df': 25, 'vect__ngram_range': (1, 1)}
0.860608 (0.000299) with: {'clf__C': 0.1, 'vect__max_df': 0.25, 'vect__min_df': 25, 'vect__ngram_range': 

LinearSVC cross-validated F-1 score with grid search: 0.8703
Confusion matrix:
[[ 186   63  572]
 [  98  212  215]
 [ 117   43 7039]]

Wall time: 1h 58min 37s


#### 10. Running Naive Bayes and SVM with the Best Parameters from Grid Search on the full dataset

In [26]:
# create full dataset and labels
full_data = df_text['reviewText'].values
full_labels = df_text['overall'].values

In [27]:
%%time
# run the two best classifier on it
for best_clf in [best_NB, best_SVM]:
        
    # split into train and test sets
    trainX, testX, trainY, testY = train_test_split(full_data, full_labels, test_size = 0.2, random_state = 43)
    clf = best_clf.fit(trainX, trainY)
    
    # predict and compute metrics
    predictions = clf.predict(testX)
    score = metrics.f1_score(testY, predictions, average='micro')
    cm    = metrics.confusion_matrix(testY, predictions)
    print('The best {} F-1 score on full dataset: {:0.4f}'.format('Naive Bayes' if best_clf==best_NB else 'SVM', score))
    print('Confusion matrix:')
    print(cm)
    print()    

The best Naive Bayes F-1 score on full dataset: 0.8744
Confusion matrix:
[[  3956   1338  13950]
 [  1598   4652   5200]
 [  2153    442 163235]]

The best SVM F-1 score on full dataset: 0.8845
Confusion matrix:
[[  5304   2006  11934]
 [  2069   6118   3263]
 [  2703    726 162401]]

Wall time: 17min 30s


#### 11. Running Best Classifiers on Full Dataset with Additional Data
Let's add the "summary" column and see how things change

In [28]:
# Let's add the "summary" column and see how things change
new_data = df['reviewText'] + ' ' + df['summary'].map(str)
new_data.head()

0    I enjoy vintage books and movies so I enjoyed ...
1    This book is a reissue of an old one; the auth...
2    This was a fairly interesting read.  It had ol...
3    I'd never read any of the Amy Brewster mysteri...
4    If you like period pieces - clothing, lingo, y...
dtype: object

In [29]:
# full_labels did not change
new_data = new_data.values

In [30]:
# sanity check
print(new_data[:5])

["I enjoy vintage books and movies so I enjoyed reading this book.  The plot was unusual.  Don't think killing someone in self-defense but leaving the scene and the body without notifying the police or hitting someone in the jaw to knock them out would wash today.Still it was a good read for me. Nice vintage story"
 "This book is a reissue of an old one; the author was born in 1910. It's of the era of, say, Nero Wolfe. The introduction was quite interesting, explaining who the author was and why he's been forgotten; I'd never heard of him.The language is a little dated at times, like calling a gun a &#34;heater.&#34;  I also made good use of my Fire's dictionary to look up words like &#34;deshabille&#34; and &#34;Canarsie.&#34; Still, it was well worth a look-see. Different..."
 "This was a fairly interesting read.  It had old- style terminology.I was glad to get  to read a story that doesn't have coarse, crasslanguage.  I read for fun and relaxation......I like the free ebooksbecause 

In [31]:
%%time
# run the two best classifier on it
for best_clf in [best_NB, best_SVM]:
        
    # split into train and test sets
    trainX, testX, trainY, testY = train_test_split(new_data, full_labels, test_size = 0.2, random_state = 43)
    clf = best_clf.fit(trainX, trainY)
    
    # predict and compute metrics
    predictions = clf.predict(testX)
    score = metrics.f1_score(testY, predictions, average='micro')
    cm    = metrics.confusion_matrix(testY, predictions)
    print('The best {} F-1 score on full dataset + summaries: {:0.4f}'.format('Naive Bayes' if best_clf==best_NB else 'SVM', score))
    print('Confusion matrix:')
    print(cm)
    print()    

The best Naive Bayes F-1 score on full dataset + summaries: 0.8437
Confusion matrix:
[[     0      2  19242]
 [     0      0  11450]
 [     0     30 165800]]





The best SVM F-1 score on full dataset + summaries: 0.8419
Confusion matrix:
[[    46     11  19187]
 [    33      3  11414]
 [   372     61 165397]]

Wall time: 36min 53s


Not an improvement. Having 'summary' before the 'reveiwText' yields the same results. What about the 'reviewerName'?

In [32]:
# Let's add the "summary" column and see how things change
df['merged2'] = df['reviewerName'].map(str) + ' ' + df['reviewText']    # conversion to unicode is needed as apparently this data is not all unicode
df.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime,merged2
0,B000F83SZQ,"[0, 0]",5,I enjoy vintage books and movies so I enjoyed ...,"05 5, 2014",A1F6404F1VG29J,Avidreader,Nice vintage story,1399248000,Avidreader I enjoy vintage books and movies so...
1,B000F83SZQ,"[2, 2]",4,This book is a reissue of an old one; the auth...,"01 6, 2014",AN0N05A9LIJEQ,critters,Different...,1388966400,critters This book is a reissue of an old one;...
2,B000F83SZQ,"[2, 2]",4,This was a fairly interesting read. It had ol...,"04 4, 2014",A795DMNCJILA6,dot,Oldie,1396569600,dot This was a fairly interesting read. It ha...
3,B000F83SZQ,"[1, 1]",5,I'd never read any of the Amy Brewster mysteri...,"02 19, 2014",A1FV0SX13TWVXQ,"Elaine H. Turley ""Montana Songbird""",I really liked it.,1392768000,"Elaine H. Turley ""Montana Songbird"" I'd never ..."
4,B000F83SZQ,"[0, 1]",4,"If you like period pieces - clothing, lingo, y...","03 19, 2014",A3SPTOKDG7WBLN,Father Dowling Fan,Period Mystery,1395187200,Father Dowling Fan If you like period pieces -...


In [33]:
%%time
# full_labels did not change
new_data = df['merged2'].values#.astype('U')        ## need to convert to Unicode for vectorizers and classifiers to work

Wall time: 0 ns


In [34]:
# sanity check
print(new_data[:5])

["Avidreader I enjoy vintage books and movies so I enjoyed reading this book.  The plot was unusual.  Don't think killing someone in self-defense but leaving the scene and the body without notifying the police or hitting someone in the jaw to knock them out would wash today.Still it was a good read for me."
 "critters This book is a reissue of an old one; the author was born in 1910. It's of the era of, say, Nero Wolfe. The introduction was quite interesting, explaining who the author was and why he's been forgotten; I'd never heard of him.The language is a little dated at times, like calling a gun a &#34;heater.&#34;  I also made good use of my Fire's dictionary to look up words like &#34;deshabille&#34; and &#34;Canarsie.&#34; Still, it was well worth a look-see."
 "dot This was a fairly interesting read.  It had old- style terminology.I was glad to get  to read a story that doesn't have coarse, crasslanguage.  I read for fun and relaxation......I like the free ebooksbecause I can ch

In [35]:
%%time
# run the two best classifier on it
for best_clf in [best_NB, best_SVM]:
        
    # split into train and test sets
    trainX, testX, trainY, testY = train_test_split(new_data, full_labels, test_size = 0.2, random_state = 43)
    clf = best_clf.fit(trainX, trainY)
    
    # predict and compute metrics
    predictions = clf.predict(testX)
    score = metrics.f1_score(testY, predictions, average='micro')
    cm    = metrics.confusion_matrix(testY, predictions)
    print('The best {} F-1 score on full dataset + author names: {:0.4f}'.format('Naive Bayes' if best_clf==best_NB else 'SVM', score))
    print('Confusion matrix:')
    print(cm)
    print()    

The best Naive Bayes F-1 score on full dataset + author names: 0.8436
Confusion matrix:
[[     0      2  19242]
 [     0      0  11450]
 [     3     34 165793]]





The best SVM F-1 score on full dataset + author names: 0.8415
Confusion matrix:
[[    53     16  19175]
 [    32      4  11414]
 [   436     69 165325]]

Wall time: 39min 7s


Not an improvement either

In [36]:
# PLEASE DISREGARD EVERYTHING BELOW. THIS IS FOR MY REFERENCE ONLY
# helper code and additional information
# all possible parameters for scoring='' in cross_val_score()
sorted(metrics.SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'brier_score_loss',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'mutual_info_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'v_measure_score']

https://scikit-learn.org/stable/modules/cross_validation.html  
3.1.1.1. The cross_validate function and multiple metric evaluation  
The cross_validate function differs from cross_val_score in two ways -
* It allows specifying multiple metrics for evaluation.
* It returns a dict containing fit-times, score-times (and optionally training scores as well as fitted estimators) in addition to the test score.  
For single metric evaluation, where the scoring parameter is a string, callable or None, the keys will be - ['test_score', 'fit_time', 'score_time']
And for multiple metric evaluation, the return value is a dict with the following keys - ['test_<scorer1_name>', 'test_<scorer2_name>', 'test_<scorer...>', 'fit_time', 'score_time']  

return_train_score is set to True by default. It adds train score keys for all the scorers. If train scores are not needed, this should be set to False explicitly.  
You may also retain the estimator fitted on each training set by setting return_estimator=True.
The multiple metrics can be specified either as a list, tuple or set of predefined scorer names:

>>>
>>> from sklearn.model_selection import cross_validate
>>> from sklearn.metrics import recall_score
>>> scoring = ['precision_macro', 'recall_macro']
>>> clf = svm.SVC(kernel='linear', C=1, random_state=0)
>>> scores = cross_validate(clf, iris.data, iris.target, scoring=scoring,
...                         cv=5, return_train_score=False)
>>> sorted(scores.keys())
['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro']
>>> scores['test_recall_macro']                       
array([0.96..., 1.  ..., 0.96..., 0.96..., 1.        ])

Or as a dict mapping scorer name to a predefined or custom scoring function:  
>>>
>>> from sklearn.metrics.scorer import make_scorer
>>> scoring = {'prec_macro': 'precision_macro',
...            'rec_micro': make_scorer(recall_score, average='macro')}
>>> scores = cross_validate(clf, iris.data, iris.target, scoring=scoring,
...                         cv=5, return_train_score=True)
>>> sorted(scores.keys())                 
['fit_time', 'score_time', 'test_prec_macro', 'test_rec_micro',
 'train_prec_macro', 'train_rec_micro']
>>> scores['train_rec_micro']                         
array([0.97..., 0.97..., 0.99..., 0.98..., 0.98...])

Here is an example of cross_validate using a single metric:  
>>>
>>> scores = cross_validate(clf, iris.data, iris.target,
...                         scoring='precision_macro', cv=5,
...                         return_estimator=True)
>>> sorted(scores.keys())
['estimator', 'fit_time', 'score_time', 'test_score', 'train_score']