## Final SVM Classifier

## Import Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn import metrics

## Loading file

In [2]:
df = pd.read_json('data/Sports_and_Outdoors_Reviews_training.json', lines=True)

In [3]:
def text_process(text):
    text = ' '.join(str(text).split())
    return text

## group by product and aggregate all the review summary text

In [30]:
grouped_df = df.groupby("asin")
grouped_lists = grouped_df["summary"].apply(text_process).reset_index()
grouped_lists

Unnamed: 0,asin,summary
0,00018C9635D55E22BF157AA13E91226F,2045677 Five Stars 2045678 I recommend that yo...
1,0001DE3A462B5C5D33AF3BC1053FC20C,"2142792 Very portable, fairly durable. 2142793..."
2,00022ACC61318C98DA944B9BABD9E5AB,"434812 Great product, poor shipping. 434813 Fi..."
3,0002C8404EBEDA230E4B66A85CEC5503,417817 Small and Cheap 417818 One Star 417819 ...
4,00034EBDF69991833D05B51EE7B11234,91838 Pretty Good 91839 Escrima stick 91840 No...
...,...,...
83744,FFFCB2EBE4D59DA2E256396B12F59FB1,"1262012 For The Price, Resin Pads All the way...."
83745,FFFEE00C6052E1A688F4639D650AA50D,593487 Good purchase 593488 Great balls to go ...
83746,FFFEE7703FE466554E6B5F9C21F09297,593764 Cheapo zipper not for me 593765 Nice ba...
83747,FFFF67EAA043C2DB092DBC8934077556,1496788 No specs -- no deal 1496789 Good tent ...


## get the mean of all ratings for a given product

In [31]:
mean_df = grouped_df['overall'].mean()
mean_df = mean_df.reset_index()
mean_df

Unnamed: 0,asin,overall
0,00018C9635D55E22BF157AA13E91226F,4.090909
1,0001DE3A462B5C5D33AF3BC1053FC20C,3.909091
2,00022ACC61318C98DA944B9BABD9E5AB,4.698413
3,0002C8404EBEDA230E4B66A85CEC5503,3.400000
4,00034EBDF69991833D05B51EE7B11234,4.214286
...,...,...
83744,FFFCB2EBE4D59DA2E256396B12F59FB1,4.941176
83745,FFFEE00C6052E1A688F4639D650AA50D,4.452381
83746,FFFEE7703FE466554E6B5F9C21F09297,4.310345
83747,FFFF67EAA043C2DB092DBC8934077556,4.600000


## merge product mean ratings + aggregated review summary text

In [32]:
final_df = pd.merge(grouped_lists, mean_df, on="asin")
final_df

Unnamed: 0,asin,summary,overall
0,00018C9635D55E22BF157AA13E91226F,2045677 Five Stars 2045678 I recommend that yo...,4.090909
1,0001DE3A462B5C5D33AF3BC1053FC20C,"2142792 Very portable, fairly durable. 2142793...",3.909091
2,00022ACC61318C98DA944B9BABD9E5AB,"434812 Great product, poor shipping. 434813 Fi...",4.698413
3,0002C8404EBEDA230E4B66A85CEC5503,417817 Small and Cheap 417818 One Star 417819 ...,3.400000
4,00034EBDF69991833D05B51EE7B11234,91838 Pretty Good 91839 Escrima stick 91840 No...,4.214286
...,...,...,...
83744,FFFCB2EBE4D59DA2E256396B12F59FB1,"1262012 For The Price, Resin Pads All the way....",4.941176
83745,FFFEE00C6052E1A688F4639D650AA50D,593487 Good purchase 593488 Great balls to go ...,4.452381
83746,FFFEE7703FE466554E6B5F9C21F09297,593764 Cheapo zipper not for me 593765 Nice ba...,4.310345
83747,FFFF67EAA043C2DB092DBC8934077556,1496788 No specs -- no deal 1496789 Good tent ...,4.600000


## classify as awesome/not

In [33]:
# function to use for lambda to categorize as awesome, not awesome
def df_iter(overall):
    if overall > 4.5:
        result = 'awesome'
    else:
        result = 'not'
        
    return result

In [34]:
final_df['class'] = final_df.apply(lambda row: df_iter(row['overall']), axis=1)
final_df

Unnamed: 0,asin,summary,overall,class
0,00018C9635D55E22BF157AA13E91226F,2045677 Five Stars 2045678 I recommend that yo...,4.090909,not
1,0001DE3A462B5C5D33AF3BC1053FC20C,"2142792 Very portable, fairly durable. 2142793...",3.909091,not
2,00022ACC61318C98DA944B9BABD9E5AB,"434812 Great product, poor shipping. 434813 Fi...",4.698413,awesome
3,0002C8404EBEDA230E4B66A85CEC5503,417817 Small and Cheap 417818 One Star 417819 ...,3.400000,not
4,00034EBDF69991833D05B51EE7B11234,91838 Pretty Good 91839 Escrima stick 91840 No...,4.214286,not
...,...,...,...,...
83744,FFFCB2EBE4D59DA2E256396B12F59FB1,"1262012 For The Price, Resin Pads All the way....",4.941176,awesome
83745,FFFEE00C6052E1A688F4639D650AA50D,593487 Good purchase 593488 Great balls to go ...,4.452381,not
83746,FFFEE7703FE466554E6B5F9C21F09297,593764 Cheapo zipper not for me 593765 Nice ba...,4.310345,not
83747,FFFF67EAA043C2DB092DBC8934077556,1496788 No specs -- no deal 1496789 Good tent ...,4.600000,awesome


In [9]:
# reasonably balanced!
final_df['class'].value_counts()

not        46668
awesome    37081
Name: class, dtype: int64

## Prep + vectorize text, feed into SVC classifier

In [35]:
import nltk
nltk.download('stopwords')

stemmer = SnowballStemmer("english", ignore_stopwords=True)
stop_words = stopwords.words("english")
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(preprocessor=stemmer.stem, stop_words=stop_words, ngram_range = (1,3), tokenizer=token.tokenize)
text_counts = cv.fit_transform(final_df['summary'])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chrischeng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [36]:
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(text_counts, final_df['class'], test_size = 0.25, random_state = 5)

In [37]:
#Support Vector
SVC_classifier = LinearSVC()
SVC_classifier.fit(X_train, y_train)
y_score = SVC_classifier.decision_function(X_test)
y_pred_SVC = SVC_classifier.predict(X_test)

In [13]:
# checking accuracy
f1_score_SVC = metrics.f1_score(y_test, y_pred_SVC, average='weighted')
auc_score_SVC = metrics.roc_auc_score(y_test, y_score, average='weighted', labels=["not","awesome"])
precision_score_SVC = metrics.average_precision_score(y_test, y_score, average='weighted', pos_label="not")

print('SVC F1: ' + str('{:04.2f}'.format(f1_score_SVC*100)) + '%')
print('SVC Precision-Recall: ' + str('{:04.2f}'.format(precision_score_SVC*100)) + '%')
print('SVC AUC: ' + str('{:04.2f}'.format(auc_score_SVC*100)) + '%')

SVC F1: 77.63%
SVC Precision-Recall: 88.83%
SVC AUC: 85.71%


## k-means cross validation

In [26]:
# splitting data for testing and training to see how well model performs

SVC_classifier = LinearSVC()
from sklearn.model_selection import StratifiedShuffleSplit
from statistics import mean
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=5)
# X_train, X_test, Y_train, Y_test = train_test_split(text_counts, df_sorted['overall'], test_size = 0.25, random_state = 5)
count = 1
X = text_counts
y = final_df['class']
f1_scores, auc_scores = list(), list()
for train_idx, test_idx in sss.split(X, y):
    print ("Group" , count)
    count += 1
    X_train, X_test, y_train, y_test = X[train_idx], X[test_idx], y[train_idx], y[test_idx]
    SVC_classifier.fit(X_train, y_train)
    y_score = SVC_classifier.decision_function(X_test)
    y_pred_SVC = SVC_classifier.predict(X_test)
    # checking accuracy
    f1_score_SVC = metrics.f1_score(y_test, y_pred_SVC, average='weighted')
    precision_score_SVC = metrics.average_precision_score(y_test, y_score, average='weighted', pos_label="not")
    
    f1_scores.append(f1_score_SVC)
    auc_scores.append(precision_score_SVC)
    
    print('SVC F1: ' + str('{:04.2f}'.format(f1_score_SVC*100)) + '%')
    print('SVC Precision-Recall AUC: ' + str('{:04.2f}'.format(precision_score_SVC*100)) + '%')
    print()
print('F1 average: ' + str('{:04.2f}'.format(mean(f1_scores)*100)) + '%')
print('Precision-Recall AUC average: ' + str('{:04.2f}'.format(mean(auc_scores)*100)) + '%')



Group 1
SVC F1: 77.58%
SVC Precision-Recall AUC: 88.98%

Group 2
SVC F1: 77.95%
SVC Precision-Recall AUC: 89.22%

Group 3
SVC F1: 77.32%
SVC Precision-Recall AUC: 88.76%

Group 4
SVC F1: 78.21%
SVC Precision-Recall AUC: 89.32%

Group 5
SVC F1: 77.78%
SVC Precision-Recall AUC: 89.13%

F1 average:77.77%
Precision-Recall AUC average:89.08%


# Predictions on Test Dataset

In [40]:
# Read in raw data
test_df = pd.read_json('data/Sports_and_Outdoors_Reviews_test.json', lines=True)

group_df = test_df.groupby("asin")
group_lists = group_df["summary"].apply(text_process).reset_index()

test_counts = cv.transform(group_lists['summary'])

In [41]:
predictions = SVC_classifier.predict(test_counts)
group_lists['predictions'] = predictions
group_lists

Unnamed: 0,asin,summary,predictions
0,00001378E0675643F36F8B5147FDA6D0,158023 Awesome Product 158024 very nice rifle ...,awesome
1,00010FEA8BF06921C276FE4DB4B63AB2,151462 actually fits the Governor and makes ca...,awesome
2,0001DCCBFABAC9E1073A1F87A393E5BA,464277 One of the best options out there for m...,awesome
3,0003494756240B4FC0579229A91398F7,284314 Fun at the range. 284315 Great fun 2843...,not
4,00064AD7C050018B509A034445C74890,177600 Tackiest grip ever 177601 Sure Hands 17...,awesome
...,...,...,...
20933,FFE4079FBEC0377CA942C4C3E3FDB127,330833 Kid's snorkel vest. 330834 Couldn't blo...,not
20934,FFE64F533026658E64520B5F50C3338D,344744 I don't know that my 4 yr old nephew wo...,not
20935,FFE8D7BE871372F0FBD09DB5C962E335,58975 Five Stars 58976 Two Stars 58977 Four St...,not
20936,FFEBF00219AA7DC96F0D3584EF0581CD,525044 Fun Fun Fun ... 525045 5 Star Value - N...,not


In [42]:
output = group_lists[['asin','predictions']].sort_values('asin')
output

Unnamed: 0,asin,predictions
0,00001378E0675643F36F8B5147FDA6D0,awesome
1,00010FEA8BF06921C276FE4DB4B63AB2,awesome
2,0001DCCBFABAC9E1073A1F87A393E5BA,awesome
3,0003494756240B4FC0579229A91398F7,not
4,00064AD7C050018B509A034445C74890,awesome
...,...,...
20933,FFE4079FBEC0377CA942C4C3E3FDB127,not
20934,FFE64F533026658E64520B5F50C3338D,not
20935,FFE8D7BE871372F0FBD09DB5C962E335,not
20936,FFEBF00219AA7DC96F0D3584EF0581CD,not


In [43]:
output.to_csv('predictions.csv', index=False)