In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, ComplementNB, GaussianNB
from sklearn.metrics import classification_report

In [2]:
def text_process(text):
    text = ' '.join(str(text).split())
    return text

# Iterator to define a product as "awesome" or not based on its overall rating (>4.5 threshold for Sports)
def df_iter(overall):
    if overall > 4.5:
        result = 'awesome'
    else:
        result = 'not'
        
    return result

In [3]:
# Processes the JSON dataset and returns a dataframe wherein products are grouped by their ID (ASIN), 
# & associated with their overall review rating, from which we know their "awesomeness" or not.
def load_process_file():
    df = pd.read_json('data/Sports_and_Outdoors_Reviews_training.json', lines=True)
    
    grouped_df = df.groupby("asin")
    grouped_lists = grouped_df[['summary', 'reviewText']].apply(text_process).reset_index()
    
    mean_df = grouped_df['overall'].mean()
    mean_df = mean_df.reset_index()
    
    final_df = pd.merge(grouped_lists, mean_df, on="asin")
    final_df['class'] = final_df.apply(lambda row: df_iter(row['overall']), axis=1)
    
    return final_df

In [4]:
# Returns X_train, X_test, y_train, y_test
def vectorize(final_df):
    
    # stemming, stop words dictionary, tokenizer
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    stop_words = stopwords.words("english")
    token = RegexpTokenizer(r'[a-zA-Z0-9]+')
    
    # Feature extraction with CountVectorizer. (Tried TFIDF, didn't work as well.)
    # ngrame range shortened from trigrams back to bigrams: improves performance & time-efficiency
    cv = CountVectorizer(preprocessor=stemmer.stem, stop_words=stop_words, ngram_range = (1,2), tokenizer=token.tokenize)
    text_counts = cv.fit_transform(final_df[0])
    
    # Generate & return
    X_train, X_test, y_train, y_test = train_test_split(text_counts, final_df['class'], test_size = 0.25, random_state = 5)
    return X_train, X_test, y_train, y_test

Boosted SVC

In [5]:
# AdaBoost LinearSVC — doesn't always improve performance.
def boost_svc(X_train, X_test, y_train, y_test):

    boost_classifier = AdaBoostClassifier(LinearSVC(loss='squared_hinge'),
                                          algorithm='SAMME',
                                          learning_rate=0.125,
                                          n_estimators=200)
    
    boost_classifier.fit(X_train, y_train)
    
    y_score = boost_classifier.decision_function(X_test)
    y_pred_boost = boost_classifier.predict(X_test)
    f1_score_boost = metrics.f1_score(y_test, y_pred_boost, average='weighted')
    precision_score_boost = metrics.average_precision_score(y_test, y_score, average='weighted', pos_label="not")

    print('Boosted SVC F1: ' + str('{:04.2f}'.format(f1_score_boost*100)) + '%')
    print('Boosted SVC Precision-Recall: ' + str('{:04.2f}'.format(precision_score_boost*100)) + '%')
    
    return boost_classifier

Logistic Regression

In [6]:
# Non-boosted Logistic Regression
def log_reg(X_train, X_test, y_train, y_test):

    logreg = LogisticRegression(penalty='l2',
                                solver='saga',
                                max_iter=10000,
                                class_weight='balanced',
                                tol=1e-2,
                                verbose=True)
    
    logreg.fit(X_train, y_train)

    # Predicting the results, calculating accuracy
    y_pred = logreg.predict(X_test)
    print("Accuracy of logistic regression classifier on test set: {:.2f}".format(logreg.score(X_test, y_test)))
    
    # Compute F-1, precision, recall
    f1_score = metrics.f1_score(y_test, y_pred, average='weighted')
    
    print('Logistic regression F1: ' + str('{:04.2f}'.format(f1_score*100)) + '%')
    print(classification_report(y_test, y_pred))

    y_score = logreg.decision_function(X_test)
    precision_score_lr = metrics.average_precision_score(y_test, y_score, average='weighted', pos_label="not")
    print('LR Precision-Recall: ' + str('{:04.2f}'.format(precision_score_lr*100)) + '%')
    
    return log_reg

Boosted Naive Bayes

In [7]:
# AdaBoost Naive Bayes
def boost_NB(X_train, X_test, y_train, y_test):

    boost_nb = AdaBoostClassifier(base_estimator=ComplementNB(),
                                      n_estimators=1000,
                                      learning_rate=0.1)
    boost_nb.fit(X_train, y_train)
    
    predicted = boost_nb.predict(X_test)
    f1_score = metrics.f1_score(y_test, predicted, average='weighted')
    
    print('F1: ' + str('{:04.2f}'.format(f1_score*100)) + '%')
    
    return boost_nb

In [18]:
def voting_hard(X_train, X_test, y_train, y_test, classifier1, classifier2):

    boostsvc = AdaBoostClassifier(LinearSVC(loss='squared_hinge'),
                                          algorithm='SAMME',
                                          learning_rate=0.125,
                                          n_estimators=200)
    
    logreg = LogisticRegression(penalty='l2',
                                solver='saga',
                                max_iter=10000,
                                class_weight='balanced')
    
    vc_hard = VotingClassifier(estimators=[('svc',classifier1),('lr',classifier2)],
                          voting='hard',
                          n_jobs=1,
                          verbose=True)
    
    vc_hard.fit(X_train,y_train)
    
    y_pred_vc_hard = vc_hard.predict(X_test)
    f1_score_vc_hard = metrics.f1_score(y_test, y_pred_vc_hard, average='weighted')
    
    print('F1 (VC hard): ' + str('{:04.2f}'.format(f1_score_vc_hard*100)) + '%')
    print(metrics.classification_report(y_test, y_pred_vc_hard))
    
    return vc_hard

In [19]:
# classifier1=boostsvc, classifier2=logreg
def voting_soft(X_train, X_test, y_train, y_test, classifier1, classifier2):

    vc_soft = VotingClassifier(estimators=[('svc', classifier1),('lr', classifier2)],
                          voting='soft',
                          n_jobs=1,
                          verbose=True)
    
    vc_soft.fit(X_train, y_train)
    
    y_pred_vc_soft = vc_soft.predict(X_test)
    f1_score_vc_soft = metrics.f1_score(y_test, y_pred_vc_soft, average='weighted')
    
    print("F1 (VC soft): " + str('{:04.2f}'.format(f1_score_vc_soft*100)) + '%')
    print(metrics.classification_report(y_test, y_pred_vc_soft))
    
    return vc_soft

In [10]:
from sklearn.model_selection import GridSearchCV

def grid_search(X_train, X_test, y_train, y_test, vc):

    params = {'svc__n_estimators': [20, 200], 'lr__C': [1.0, 100.0]}

    # 5-fold cross validation
    grid = GridSearchCV(estimator=vc, param_grid=params, cv=5)
    grid = grid.fit(X_train, y_train)
    
    y_pred_grid = grid.predict(X_test)
    f1_score_grid = metrics.f1_score(y_test, y_pred_grid, average='weighted')
    print("F1 (GridSearchCV): " + str('{:04.2f}'.format(f1_score_grid*100)) + '%')
    print(metrics.classification_report(y_test, y_pred_grid))

In [11]:
final_df = load_process_file()

In [12]:
X_train, X_test, y_train, y_test = vectorize(final_df)

In [None]:
# boosted_naive_bayes = boost_NB(X_train, X_test, y_train, y_test)

In [13]:
boosted_svc = boost_svc(X_train, X_test, y_train, y_test)

Boosted SVC F1: 78.54%
Boosted SVC Precision-Recall: 89.88%


In [14]:
logistic_regression = log_reg(X_train, X_test, y_train, y_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 30 epochs took 28 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   28.0s finished


Accuracy of logistic regression classifier on test set: 0.78
Logistic regression F1: 78.22%
              precision    recall  f1-score   support

     awesome       0.73      0.80      0.77      9304
         not       0.83      0.76      0.80     11634

    accuracy                           0.78     20938
   macro avg       0.78      0.78      0.78     20938
weighted avg       0.79      0.78      0.78     20938

LR Precision-Recall: 89.31%


In [22]:
vc = voting_hard(X_train, X_test, y_train, y_test, boosted_svc, logistic_regression)

TypeError: __init__() got an unexpected keyword argument 'verbose'

In [None]:
grid_search(X_train, X_test, y_train, y_test, vc)

Aight looks like it's time to **boost** LogReg

In [None]:
#Bagging with logistic regression
# estimators = [50,100,150,200,250,300]
# samples = [0.1,0.2,0.3,0.4,0.5]
# for e in estimators:
#     for s in samples:
#         print('estimators = {}\nsamples = {}'.format(e,s))
#         bag_classifier = BaggingClassifier(base_estimator=LogisticRegression(penalty='l2',
#                                                                              solver='saga',
#                                                                              max_iter=10000,
#                                                                              tol=1e-2,
#                                                                              class_weight='balanced'),
#                                            max_samples=s,
#                                            n_estimators=e,
#                                            verbose=1,
#                                            n_jobs=-1)
#         bag_classifier.fit(X_train, y_train)
#         y_score = bag_classifier.decision_function(X_test)
#         y_pred_bag = bag_classifier.predict(X_test)
#         f1_score = metrics.f1_score(y_test, y_pred_bag, average='weighted')
#         print('F1: ' + str('{:04.2f}'.format(f1_score*100)) + '%')
#         print(metrics.classification_report(y_test, y_pred_bag))

In [None]:
# f1_score = metrics.f1_score(y_test, y_pred_bag, average='weighted')
# print('F1: ' + str('{:04.2f}'.format(f1_score*100)) + '%')
# print(metrics.classification_report(y_test, y_pred_bag))

In [None]:
#Boosting with logistic regression
# boo = AdaBoostClassifier(base_estimator=LogisticRegression(penalty='l2',
#                                                            solver='saga',
#                                                            max_iter=10000,
#                                                            tol=1e-2,
#                                                            class_weight='balanced'),
#                          learning_rate=0.4,
#                          n_estimators=100,
#                          algorithm='SAMME')
# boo.fit(X_train, y_train)
# y_score = boo.decision_function(X_test)
# y_pred_boo = boo.predict(X_test)

In [None]:
# f1_score = metrics.f1_score(y_test, y_pred_boo, average='weighted')
# print('F1: ' + str('{:04.2f}'.format(f1_score*100)) + '%')
# print(metrics.classification_report(y_test, y_pred_boo))