In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, ComplementNB, GaussianNB
from sklearn.metrics import classification_report

## Dataframe loading, processing, and feature extraction

In [None]:
def text_process(text):
    """Return ``str(text)`` with every run of whitespace collapsed to one space.

    Leading and trailing whitespace is dropped as a side effect of splitting.
    """
    words = str(text).split()
    return ' '.join(words)

# Labels a product as "awesome" or "not" from its mean overall rating.
def df_iter(overall, threshold=4.5):
    """Map a rating to the class label used for training.

    Parameters
    ----------
    overall : float
        Rating to classify (the caller passes the per-product mean rating).
    threshold : float, default 4.5
        Exclusive cutoff: only ratings strictly greater than this are
        'awesome'. 4.5 is the value used for the Sports category; pass a
        different value for other categories.

    Returns
    -------
    str
        'awesome' if ``overall > threshold``, otherwise 'not'. A NaN rating
        compares false and therefore yields 'not'.
    """
    return 'awesome' if overall > threshold else 'not'

In [None]:
def load_process_file(filename):
    """Load a JSON-lines review file and build one labelled row per product.

    Reviews are grouped by product ID (``asin``). For every product the
    ``summary`` and ``reviewText`` values of all its reviews are joined into
    one whitespace-normalised string, the ``overall`` ratings are averaged,
    and the mean is turned into a class label with ``df_iter``.

    Parameters
    ----------
    filename : str or path-like
        Path to a JSON-lines file with at least the columns ``asin``,
        ``summary``, ``reviewText`` and ``overall``.

    Returns
    -------
    pandas.DataFrame
        Columns: ``asin``, ``0`` (the joined review text), ``overall``
        (mean rating) and ``class`` ('awesome' / 'not').
    """
    df = pd.read_json(filename, lines=True)
    grouped_df = df.groupby("asin")

    def _join_reviews(group):
        # Join the raw cell values rather than str(group): the DataFrame repr
        # goes through pandas' display formatting, which elides rows beyond
        # display.max_rows, cuts each cell to display.max_colwidth and mixes
        # index labels / column headers into the text.
        cells = group.fillna('').astype(str).to_numpy().ravel()
        return text_process(' '.join(cells))

    # apply() yields an unnamed Series, so reset_index() names the text
    # column 0 -- the name the vectorizing step reads.
    grouped_lists = grouped_df[['summary', 'reviewText']].apply(_join_reviews).reset_index()

    mean_df = grouped_df['overall'].mean().reset_index()

    final_df = pd.merge(grouped_lists, mean_df, on="asin")
    final_df['class'] = final_df['overall'].map(df_iter)

    return final_df

In [None]:
def vectorize(final_df):
    """Turn the per-product review text into bag-of-n-grams features and split.

    Parameters
    ----------
    final_df : pandas.DataFrame
        Frame with the review text in column ``0`` and the target labels in
        column ``class``.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Result of a 75/25 ``train_test_split`` (``random_state=5``) over the
        count matrix and the ``class`` column.
    """
    # stemming, stop words dictionary, tokenizer
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    stop_words = stopwords.words("english")
    token = RegexpTokenizer(r'[a-zA-Z0-9]+')

    # Many tokens repeat across documents; remember each stem once.
    stem_cache = {}

    def stem_tokens(document):
        # Stem token by token. Passing stemmer.stem as the vectorizer's
        # `preprocessor` hands it the entire document, which Snowball then
        # treats as one single word, so no individual token gets stemmed.
        stems = []
        for word in token.tokenize(document):
            stem = stem_cache.get(word)
            if stem is None:
                stem = stemmer.stem(word)
                stem_cache[word] = stem
            stems.append(stem)
        return stems

    # Feature extraction with CountVectorizer. (Tried TFIDF, didn't work as well.)
    # n-gram range shortened from trigrams back to bigrams: improves performance & time-efficiency.
    # The default preprocessor lowercases each document before it is tokenized.
    cv = CountVectorizer(stop_words=stop_words, ngram_range=(1, 2), tokenizer=stem_tokens)
    text_counts = cv.fit_transform(final_df[0])

    # Generate & return
    X_train, X_test, y_train, y_test = train_test_split(text_counts, final_df['class'], test_size=0.25, random_state=5)
    return X_train, X_test, y_train, y_test

## Classifier: AdaBoost w/ Linear SVM

In [None]:
def boost_svc(X_train, X_test, y_train, y_test):
    """Fit AdaBoost over LinearSVC base learners and report test metrics.

    Prints the weighted F1 score and the average precision (computed from
    the decision scores with "not" as the positive label) on the test split.

    Returns the fitted AdaBoostClassifier.
    """
    base_svm = LinearSVC(loss='squared_hinge')
    model = AdaBoostClassifier(base_svm,
                               algorithm='SAMME',
                               learning_rate=0.125,
                               n_estimators=200)
    model.fit(X_train, y_train)

    # Hard labels feed the F1 score; decision scores feed average precision.
    predictions = model.predict(X_test)
    scores = model.decision_function(X_test)

    weighted_f1 = metrics.f1_score(y_test, predictions, average='weighted')
    avg_precision = metrics.average_precision_score(y_test, scores, average='weighted', pos_label="not")

    print('Boosted SVC F1: ' + format(weighted_f1 * 100, '04.2f') + '%')
    print('Boosted SVC Precision-Recall: ' + format(avg_precision * 100, '04.2f') + '%')

    return model

## Classifier: Logistic Regression

In [None]:
def log_reg(X_train, X_test, y_train, y_test):
    """Fit a class-balanced L2 logistic regression and report test metrics.

    Prints accuracy, weighted F1, the full classification report and the
    average precision (computed from the decision scores with "not" as the
    positive label) on the test split.

    Returns
    -------
    LogisticRegression
        The fitted estimator.
    """
    logreg = LogisticRegression(penalty='l2',
                                solver='liblinear',
                                max_iter=10000,
                                class_weight='balanced',
                                tol=1e-2,
                                verbose=True)

    logreg.fit(X_train, y_train)

    # Predicting the results, calculating accuracy
    y_pred = logreg.predict(X_test)
    print("Accuracy of logistic regression classifier on test set: {:.2f}".format(logreg.score(X_test, y_test)))

    # Compute F-1, precision, recall
    f1_score = metrics.f1_score(y_test, y_pred, average='weighted')

    print('Logistic regression F1: ' + str('{:04.2f}'.format(f1_score*100)) + '%')
    print(classification_report(y_test, y_pred))

    y_score = logreg.decision_function(X_test)
    precision_score_lr = metrics.average_precision_score(y_test, y_score, average='weighted', pos_label="not")
    print('LR Precision-Recall: ' + str('{:04.2f}'.format(precision_score_lr*100)) + '%')

    # Return the fitted model, not `log_reg` (this function object itself).
    return logreg

## Classifier: AdaBoost w/ Naïve Bayes

In [None]:
def boost_NB(X_train, X_test, y_train, y_test):
    """Fit AdaBoost over Complement Naive Bayes learners and print weighted F1.

    Returns
    -------
    AdaBoostClassifier
        The fitted ensemble.
    """
    # The base learner is passed positionally: the keyword was renamed from
    # `base_estimator` to `estimator` across scikit-learn releases, and the
    # positional form is accepted by both.
    boost_nb = AdaBoostClassifier(ComplementNB(),
                                  n_estimators=1000,
                                  learning_rate=0.1)
    boost_nb.fit(X_train, y_train)

    predicted = boost_nb.predict(X_test)
    f1_score = metrics.f1_score(y_test, predicted, average='weighted')

    print('F1: ' + str('{:04.2f}'.format(f1_score*100)) + '%')

    return boost_nb

## Classifier: Voting w/ AdaBoost'd LinearSVC & Logistic Regression as estimators

In [None]:
def voting_hard(X_train, X_test, y_train, y_test):
    """Fit a hard-voting ensemble of boosted LinearSVC and logistic regression.

    Prints the weighted F1 score and the classification report for the test
    split, then returns the fitted VotingClassifier.
    """
    # Member 1: AdaBoost over linear SVMs.
    svc_member = AdaBoostClassifier(LinearSVC(loss='squared_hinge'),
                                    algorithm='SAMME',
                                    learning_rate=0.125,
                                    n_estimators=200)

    # Member 2: class-balanced L2 logistic regression fitted with saga.
    lr_member = LogisticRegression(penalty='l2',
                                   solver='saga',
                                   max_iter=10000,
                                   class_weight='balanced')

    members = [('svc', svc_member), ('lr', lr_member)]
    ensemble = VotingClassifier(estimators=members, voting='hard', n_jobs=1)
    ensemble.fit(X_train, y_train)

    predictions = ensemble.predict(X_test)
    weighted_f1 = metrics.f1_score(y_test, predictions, average='weighted')

    print('F1 (VC hard): ' + format(weighted_f1 * 100, '04.2f') + '%')
    print(metrics.classification_report(y_test, predictions))

    return ensemble

## Get results

In [None]:
# Training reviews in JSON-lines format; the path is relative to the working directory.
filename = 'data/Sports_and_Outdoors_Reviews_training.json'
# One row per product (ASIN): review text, mean overall rating and class label.
final_df = load_process_file(filename)

In [None]:
# Vectorize the per-product text and split features/labels into train and test partitions.
X_train, X_test, y_train, y_test = vectorize(final_df)

In [None]:
# Fit the hard-voting ensemble; this prints its weighted F1 and classification report.
vc = voting_hard(X_train, X_test, y_train, y_test)

# Generate predictions on test dataset