# Part 2. ReviewText + Summary

In [1]:
import pandas as pd
import numpy as np
import time
import nltk
#nltk.download('stopwords')    # this is done just once
import sklearn
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV, KFold
from sklearn import metrics
from platform import python_version
print('Python {}'.format(python_version()))
print('Scikit-learn {}.'.format(sklearn.__version__))

Python 3.7.1
Scikit-learn 0.20.1.


#### 1. Read in json, combine reviewText with summary columns, get labels cols

In [2]:
%%time
path=''
file='kindle_reviews.json'
df = pd.read_json(path_or_buf=path+file, lines=True, encoding='utf-8')    #, orient=None, typ='frame', dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=True, numpy=False, precise_float=False, date_unit=None, encoding=None, chunksize=None, compression='infer')
print('Length of text: {}'.format(len(df)))

Length of text: 982619
Wall time: 14.1 s


In [3]:
df.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,B000F83SZQ,"[0, 0]",5,I enjoy vintage books and movies so I enjoyed ...,"05 5, 2014",A1F6404F1VG29J,Avidreader,Nice vintage story,1399248000
1,B000F83SZQ,"[2, 2]",4,This book is a reissue of an old one; the auth...,"01 6, 2014",AN0N05A9LIJEQ,critters,Different...,1388966400
2,B000F83SZQ,"[2, 2]",4,This was a fairly interesting read. It had ol...,"04 4, 2014",A795DMNCJILA6,dot,Oldie,1396569600
3,B000F83SZQ,"[1, 1]",5,I'd never read any of the Amy Brewster mysteri...,"02 19, 2014",A1FV0SX13TWVXQ,"Elaine H. Turley ""Montana Songbird""",I really liked it.,1392768000
4,B000F83SZQ,"[0, 1]",4,"If you like period pieces - clothing, lingo, y...","03 19, 2014",A3SPTOKDG7WBLN,Father Dowling Fan,Period Mystery,1395187200


In [4]:
# Let's add the "summary" column to reviewText
df['reviewText'] = df['summary'].map(str) + ' ' + df['reviewText']

In [5]:
df_text = df[['reviewText', 'overall']].copy()        # copy only certain columns to another df
df = None                                             # release memory

In [6]:
df_text.head()

Unnamed: 0,reviewText,overall
0,Nice vintage story I enjoy vintage books and m...,5
1,Different... This book is a reissue of an old ...,4
2,Oldie This was a fairly interesting read. It ...,4
3,I really liked it. I'd never read any of the A...,5
4,Period Mystery If you like period pieces - clo...,4


#### 2. Convert 5 ratings to 3 ('neg', 'mixed', and 'pos')

In [7]:
# convert 5 ratings to 3 ('neg', 'mixed', and 'pos')
df_text['overall'] = df_text['overall'].apply(lambda x: 'pos' if x > 3 else 'neg' if x < 3 else 'mixed')

In [8]:
df_text.head()

Unnamed: 0,reviewText,overall
0,Nice vintage story I enjoy vintage books and m...,pos
1,Different... This book is a reissue of an old ...,pos
2,Oldie This was a fairly interesting read. It ...,pos
3,I really liked it. I'd never read any of the A...,pos
4,Period Mystery If you like period pieces - clo...,pos


In [9]:
print('Total length of the dataset is {}'.format(len(df_text)))

Total length of the dataset is 982619


#### 3. Reduce size of data, get reduced-size dataset and labels

In [10]:
# get a random sample from the dataframe whose size is manageable for cross-validation and grid search
# with more computing resources and/or time, this can be done on a larger data set
length = len(df_text)
df_text = df_text.sample(n=length)
df_text_short = df_text.sample(n=int(length/23))
print('Length of the sample is {}'.format(len(df_text_short)))

Length of the sample is 42722


In [11]:
# count of unique labels
df_text_short.overall.value_counts()

pos      36117
mixed     4094
neg       2511
Name: overall, dtype: int64

In [12]:
# this is our train set and labels
data = df_text_short['reviewText'].values
labels = df_text_short['overall'].values

#### 4. Experimenting with different lists of stopwords and selecting the most efficient one

In [13]:
# GENERATING A LIST OF STOPWORDS
# these various stopword lists and the combined joint list were tested on the same classifier (MultinomialNB) with the same
# parameters, and it was found that the lemur list and the combined list, the latter includes the former, were the most
# efficient ones

from sklearn.feature_extraction import stop_words    
from nltk.corpus import stopwords                    
 
print('Sklearn:')
stopwords_sklearn = list(stop_words.ENGLISH_STOP_WORDS)        # 318 words
print(len(stopwords_sklearn))
print(stopwords_sklearn)

print('\nNLTK:')
stopwords_nltk = list(stopwords.words('english'))              # 180 words
print(len(stopwords_nltk))
print(stopwords_nltk)

print('\nLemur')                                               # 430 words
stopwords_lemur = []
with open('lemur_stopwords.txt') as f:
    for line in f:
        line = line.strip()
        stopwords_lemur.append(line)
print(len(stopwords_lemur))
print(stopwords_lemur)

print('\nOther:')                                              # 153 words
stopwords_other = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]
print(len(stopwords_other))
print(stopwords_other)

print('\nCOMBINED:')                                           # 579 words
stopwords_combined = list(set(stopwords_sklearn + stopwords_nltk + stopwords_lemur + stopwords_other))
print(len(stopwords_combined))
print(stopwords_combined)

Sklearn:
318
['he', 'nowhere', 'put', 'between', 'once', 'within', 'him', 'under', 'many', 'every', 'hereby', 'towards', 'thereby', 'herself', 'again', 'amoungst', 'besides', 'been', 'bill', 'very', 'mine', 'give', 'see', 'am', 'name', 'how', 'must', 'before', 'three', 'get', 'seems', 'much', 'further', 'down', 'to', 'without', 'nevertheless', 'thin', 'move', 'enough', 'con', 'even', 'keep', 'most', 'ours', 'yourselves', 'whereby', 'do', 'yourself', 'eight', 'thick', 'detail', 'by', 'such', 'thru', 'ie', 'may', 'should', 'their', 'those', 'done', 'full', 'across', 'are', 'the', 'bottom', 'has', 'yours', 'thereafter', 'hundred', 'behind', 'mill', 'being', 'we', 'had', 'same', 'because', 'nobody', 'more', 'thus', 'upon', 'whole', 'as', 're', 'all', 'in', 'sometime', 'found', 'fifty', 'seemed', 'beside', 'onto', 'itself', 'these', 'a', 'perhaps', 'about', 'everyone', 'ourselves', 'sometimes', 'herein', 'formerly', 'part', 'amongst', 'i', 'for', 'therein', 'forty', 'or', 'find', 'anyway', 

#### 5. Straightforward Implementation of a text classifier (as a benchmark)
Using the same two classifiers - Naive Bayes and SVM. The classification functions are generic, so you can use any other classifiers by just making minor modifications of the code

In [14]:
# simple straightforward
def clf_simple(classifier, data, labels):

    # split data into train and test sets; use TfidfVectorizer
    trainX, testX, trainY, testY = train_test_split(data, labels, test_size = 0.2, random_state = 43)
    vectorizer = TfidfVectorizer(analyzer='word', stop_words=stopwords_combined, min_df=5, max_df=0.5, ngram_range=(1, 2))
    matrix_train = vectorizer.fit_transform(trainX)    # lowercase=True by default, initially min_df=15, max_df=0.23
    matrix_test = vectorizer.transform(testX)
                   
    # get classifier's name to print results; otherwise, this function needs another argument
    clf_string = str(classifier)
    idx = clf_string.find("(")
    classifier_name = clf_string[:idx]
    
    # fit classifier
    clf = classifier
    clf = clf.fit(matrix_train, trainY)  
        
    # predict and compute metrics    
    predictions=clf.predict(matrix_test)
    score = metrics.f1_score(testY, predictions, average='micro')
    cm    = metrics.confusion_matrix(testY, predictions)
    print('{} F-1 score:   {:0.4f}'.format(classifier_name, score))
    print('Confusion matrix:')
    print(cm)
    print()

#### 6. Text Classifier with Pipeline (as a benchmark)

In [15]:
# pipeline
def clf_pipe(classifier, data, labels):

    # split data into train and test sets; create pipeline
    trainX, testX, trainY, testY = train_test_split(data, labels, test_size = 0.2, random_state = 43)
    clf = Pipeline([('vect', CountVectorizer(analyzer='word', stop_words=stopwords_combined, min_df=5, max_df=0.5, ngram_range=(1, 2))),  # lowercase=True by default
                   ('tfidf', TfidfTransformer()),                                                                # initially min_df=15, max_df=0.23,
                   ('clf', classifier),
                 ])   
       
    # get classifier's name to print results; otherwise, this function needs another argument
    clf_string = str(classifier)
    idx = clf_string.find("(")
    classifier_name = clf_string[:idx]
        
    # fit classifier, predict, and compute metrics
    clf = clf.fit(trainX, trainY)
    predictions=clf.predict(testX)
    score = metrics.f1_score(testY, predictions, average='micro')
    cm    = metrics.confusion_matrix(testY, predictions)
    print('{} F-1 score:   {:0.4f}'.format(classifier_name, score))
    print('Confusion matrix:')
    print(cm)
    print()

#### 7. Text Classifier Using 5-Fold Cross-Validation

In [16]:
# with cross_val_score
def clf_cv(classifier, data, labels):

    # split data into train and test sets; create pipeline    
    trainX, testX, trainY, testY = train_test_split(data, labels, test_size = 0.2, random_state = 43)
    clf = Pipeline([('vect', CountVectorizer(analyzer='word', stop_words=stopwords_combined, min_df=5, max_df=0.5, ngram_range=(1, 2))),  # lowercase=True by default
                   ('tfidf', TfidfTransformer()),                                                                # initially min_df=15, max_df=0.23,
                   ('clf', classifier),
                 ])   
        
    # get classifier's name to print results; otherwise, this function needs another argument
    clf_string = str(classifier)
    idx = clf_string.find("(")
    classifier_name = clf_string[:idx]
        
    # fit classifier, predict, and compute metrics
    clf = clf.fit(trainX, trainY)
    seed = 7
    kfold = KFold(n_splits=5, shuffle=True, random_state=seed)
    scores = cross_val_score(clf, trainX, trainY, cv=kfold, scoring='f1_micro')
    print('Cross-validated Accuracy of {}: {:0.4f} +/- {:0.4f}'.format(classifier_name, scores.mean(), scores.std() * 2))
    predictions = clf.predict(testX)
    cm          = metrics.confusion_matrix(testY, predictions)
    print('Confusion matrix:')
    print(cm)
    print()

#### 8. Text Classifier with Cross-Validated Parameter Grid Search

In [17]:
# GridSearchCV
def clf_GridSearchCV(classifier, data, labels, param_grid):

    # split data into train and test sets; create pipeline    
    trainX, testX, trainY, testY = train_test_split(data, labels, test_size = 0.2, random_state = 43)
    clf = Pipeline([('vect', CountVectorizer(analyzer='word', stop_words=stopwords_combined, min_df=5, max_df=0.5, ngram_range=(1, 2))),  # lowercase=True by default
                   ('tfidf', TfidfTransformer()), 
                   ('clf', classifier),
                 ])   
            
    # get classifier's name to print results; otherwise, this function needs another argument
    clf_string = str(classifier)
    idx = clf_string.find("(")
    classifier_name = clf_string[:idx]
    
    # do 3-fold cross validation for each of the possible combinations of the parameter values above
    grid = GridSearchCV(clf, cv=3, param_grid=param_grid, scoring='f1_micro')
    grid.fit(trainX, trainY)

    # summarize results
    print("Best: %f using %s" % (grid.best_score_, 
        grid.best_params_))
    means = grid.cv_results_['mean_test_score']
    stds = grid.cv_results_['std_test_score']
    params = grid.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))

    # train and predict on test instances using the best configs found in the CV step
        
    #predictions = grid.best_estimator_.predict(testX)                   # this is how to find the best estimator 
    #testX = grid.best_estimator_.named_steps['tfidf'].transform(testX)  # this is how to find indiv. components (same for pipeline)
    predictions=grid.predict(testX)                                      # called on the best estimator by default
    score = metrics.f1_score(testY, predictions, average='micro')
    cm    = metrics.confusion_matrix(testY, predictions)
    print('{} cross-validated F-1 score with grid search: {:0.4f}'.format(classifier_name, score))
    print('Confusion matrix:')
    print(cm)
    print()
    
    # return the best classifier to run it on the full dataset    
    return grid.best_estimator_

#### 9. Running 7 options on the limited data set

In [18]:
%%time
# MultinomialNB()
nb = MultinomialNB()
nb_param_grid = {
        'vect__max_df':[0.25, 0.5,0.75],
        'vect__min_df':[5,15,25,50,100],
        'vect__ngram_range':[(1, 1),(1, 2),(1, 3)],
        'clf__alpha':[0.1,0.25,0.5,0.75,1.0]
    }
clf_simple(nb, data, labels)    # straightforward NB

MultinomialNB F-1 score:   0.8455
Confusion matrix:
[[   2    0  827]
 [   1    3  492]
 [   0    0 7220]]

Wall time: 10.4 s


In [19]:
%%time
clf_pipe(nb, data, labels)    # pipeline NB

MultinomialNB F-1 score:   0.8455
Confusion matrix:
[[   2    0  827]
 [   1    3  492]
 [   0    0 7220]]

Wall time: 10.8 s


In [20]:
%%time
clf_cv(nb, data, labels)    # cross_val_score NB

Cross-validated Accuracy of MultinomialNB: 0.8462 +/- 0.0082
Confusion matrix:
[[   2    0  827]
 [   1    3  492]
 [   0    0 7220]]

Wall time: 53 s


In [21]:
%%time
best_NB = clf_GridSearchCV(nb, data, labels, nb_param_grid)    # parameter grid search NB

Best: 0.866840 using {'clf__alpha': 0.1, 'vect__max_df': 0.25, 'vect__min_df': 15, 'vect__ngram_range': (1, 3)}
0.858589 (0.000847) with: {'clf__alpha': 0.1, 'vect__max_df': 0.25, 'vect__min_df': 5, 'vect__ngram_range': (1, 1)}
0.864265 (0.001485) with: {'clf__alpha': 0.1, 'vect__max_df': 0.25, 'vect__min_df': 5, 'vect__ngram_range': (1, 2)}
0.864968 (0.001109) with: {'clf__alpha': 0.1, 'vect__max_df': 0.25, 'vect__min_df': 5, 'vect__ngram_range': (1, 3)}
0.859379 (0.000768) with: {'clf__alpha': 0.1, 'vect__max_df': 0.25, 'vect__min_df': 15, 'vect__ngram_range': (1, 1)}
0.866460 (0.000830) with: {'clf__alpha': 0.1, 'vect__max_df': 0.25, 'vect__min_df': 15, 'vect__ngram_range': (1, 2)}
0.866840 (0.000687) with: {'clf__alpha': 0.1, 'vect__max_df': 0.25, 'vect__min_df': 15, 'vect__ngram_range': (1, 3)}
0.858677 (0.000449) with: {'clf__alpha': 0.1, 'vect__max_df': 0.25, 'vect__min_df': 25, 'vect__ngram_range': (1, 1)}
0.863622 (0.000749) with: {'clf__alpha': 0.1, 'vect__max_df': 0.25, 'vec

MultinomialNB cross-validated F-1 score with grid search: 0.8683
Confusion matrix:
[[ 109   36  684]
 [  47  130  319]
 [  31    8 7181]]

Wall time: 2h 1min 38s


In [22]:
%%time
svc = svm.LinearSVC()
svc_param_grid = {
    'vect__max_df':[0.25,0.5,0.75],
    'vect__min_df':[5,15,25,50,100],
    'vect__ngram_range':[(1, 1),(1, 2),(1, 3)],
    'clf__C':[0.1,0.25,0.5,0.75,1.0]
}
clf_pipe(svc, data, labels)            # pipeline SVM

LinearSVC F-1 score:   0.8728
Confusion matrix:
[[ 201   62  566]
 [ 105  204  187]
 [ 136   31 7053]]

Wall time: 10.6 s


In [23]:
%%time
clf_cv(svc, data, labels)             # cross_val_score SVM

Cross-validated Accuracy of LinearSVC: 0.8795 +/- 0.0113
Confusion matrix:
[[ 201   62  566]
 [ 105  204  187]
 [ 136   31 7053]]

Wall time: 53.6 s


In [24]:
%%time
best_SVM = clf_GridSearchCV(svc, data, labels, svc_param_grid)    # parameter grid search SVM

Best: 0.878895 using {'clf__C': 0.5, 'vect__max_df': 0.75, 'vect__min_df': 5, 'vect__ngram_range': (1, 3)}
0.867718 (0.001328) with: {'clf__C': 0.1, 'vect__max_df': 0.25, 'vect__min_df': 5, 'vect__ngram_range': (1, 1)}
0.866197 (0.001418) with: {'clf__C': 0.1, 'vect__max_df': 0.25, 'vect__min_df': 5, 'vect__ngram_range': (1, 2)}
0.866080 (0.001211) with: {'clf__C': 0.1, 'vect__max_df': 0.25, 'vect__min_df': 5, 'vect__ngram_range': (1, 3)}
0.868625 (0.001090) with: {'clf__C': 0.1, 'vect__max_df': 0.25, 'vect__min_df': 15, 'vect__ngram_range': (1, 1)}
0.870644 (0.001855) with: {'clf__C': 0.1, 'vect__max_df': 0.25, 'vect__min_df': 15, 'vect__ngram_range': (1, 2)}
0.870498 (0.001694) with: {'clf__C': 0.1, 'vect__max_df': 0.25, 'vect__min_df': 15, 'vect__ngram_range': (1, 3)}
0.869415 (0.000987) with: {'clf__C': 0.1, 'vect__max_df': 0.25, 'vect__min_df': 25, 'vect__ngram_range': (1, 1)}
0.871463 (0.002131) with: {'clf__C': 0.1, 'vect__max_df': 0.25, 'vect__min_df': 25, 'vect__ngram_range': 

LinearSVC cross-validated F-1 score with grid search: 0.8765
Confusion matrix:
[[ 186   53  590]
 [  84  192  220]
 [  84   24 7112]]

Wall time: 2h 3min 11s


#### 10. Running Naive Bayes and SVM with the Best Parameters from Grid Search on the full dataset

In [25]:
# create full dataset and labels
full_data = df_text['reviewText'].values
full_labels = df_text['overall'].values

In [26]:
%%time
# run the two best classifier on it
for best_clf in [best_NB, best_SVM]:
        
    # split into train and test sets
    trainX, testX, trainY, testY = train_test_split(full_data, full_labels, test_size = 0.2, random_state = 43)
    clf = best_clf.fit(trainX, trainY)
    
    # predict and compute metrics
    predictions = clf.predict(testX)
    score = metrics.f1_score(testY, predictions, average='micro')
    cm    = metrics.confusion_matrix(testY, predictions)
    print('The best {} F-1 score on full dataset: {:0.4f}'.format('Naive Bayes' if best_clf==best_NB else 'SVM', score))
    print('Confusion matrix:')
    print(cm)
    print()    

The best Naive Bayes F-1 score on full dataset: 0.8810
Confusion matrix:
[[  5254   1553  12456]
 [  1816   5617   4028]
 [  3113    418 162269]]

The best SVM F-1 score on full dataset: 0.8928
Confusion matrix:
[[  5617   2005  11641]
 [  1891   6785   2785]
 [  2244    500 163056]]

Wall time: 22min 5s
