## Logistic regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.ensemble import AdaBoostRegressor, BaggingClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

In [25]:
# LOAD EM UP
df = pd.read_json('data/Sports_and_Outdoors_Reviews_training.json', lines=True)

In [46]:
def text_process(text):
    words = str(text).split()
        
    text = ' '.join(word for word in words[1:] if not word.isdigit())
    return text

In [47]:
# this one takes a lil while so be careful
# group by product and aggregate all the review text
from tqdm import tqdm
tqdm.pandas()
grouped_df = df.groupby("asin")
grouped_lists = grouped_df[["summary", "reviewText"]].progress_apply(text_process).reset_index()
grouped_lists

100%|██████████| 83749/83749 [02:12<00:00, 633.18it/s]


Unnamed: 0,asin,0
0,00018C9635D55E22BF157AA13E91226F,\ Five Stars I recommend that you pay little m...
1,0001DE3A462B5C5D33AF3BC1053FC20C,"\ Very portable, fairly durable. Works Pretty ..."
2,00022ACC61318C98DA944B9BABD9E5AB,"\ Great product, poor shipping. Five Stars Fiv..."
3,0002C8404EBEDA230E4B66A85CEC5503,\ Small and Cheap One Star shades Great Value ...
4,00034EBDF69991833D05B51EE7B11234,\ Pretty Good Escrima stick Not for adult spar...
...,...,...
83744,FFFCB2EBE4D59DA2E256396B12F59FB1,"\ For The Price, Resin Pads All the way. excel..."
83745,FFFEE00C6052E1A688F4639D650AA50D,\ Good purchase Great balls to go with plastic...
83746,FFFEE7703FE466554E6B5F9C21F09297,\ Cheapo zipper not for me Nice bag. As advert...
83747,FFFF67EAA043C2DB092DBC8934077556,\ No specs -- no deal Good tent for car campin...


In [48]:
# get the mean of all the ratings for a product
mean_df = grouped_df['overall'].mean()
mean_df = mean_df.reset_index()
mean_df

Unnamed: 0,asin,overall
0,00018C9635D55E22BF157AA13E91226F,4.090909
1,0001DE3A462B5C5D33AF3BC1053FC20C,3.909091
2,00022ACC61318C98DA944B9BABD9E5AB,4.698413
3,0002C8404EBEDA230E4B66A85CEC5503,3.400000
4,00034EBDF69991833D05B51EE7B11234,4.214286
...,...,...
83744,FFFCB2EBE4D59DA2E256396B12F59FB1,4.941176
83745,FFFEE00C6052E1A688F4639D650AA50D,4.452381
83746,FFFEE7703FE466554E6B5F9C21F09297,4.310345
83747,FFFF67EAA043C2DB092DBC8934077556,4.600000


In [50]:
# merge product mean ratings + aggregated review text
final_df = pd.merge(grouped_lists, mean_df, on="asin")
final_df

Unnamed: 0,asin,0,overall
0,00018C9635D55E22BF157AA13E91226F,\ Five Stars I recommend that you pay little m...,4.090909
1,0001DE3A462B5C5D33AF3BC1053FC20C,"\ Very portable, fairly durable. Works Pretty ...",3.909091
2,00022ACC61318C98DA944B9BABD9E5AB,"\ Great product, poor shipping. Five Stars Fiv...",4.698413
3,0002C8404EBEDA230E4B66A85CEC5503,\ Small and Cheap One Star shades Great Value ...,3.400000
4,00034EBDF69991833D05B51EE7B11234,\ Pretty Good Escrima stick Not for adult spar...,4.214286
...,...,...,...
83744,FFFCB2EBE4D59DA2E256396B12F59FB1,"\ For The Price, Resin Pads All the way. excel...",4.941176
83745,FFFEE00C6052E1A688F4639D650AA50D,\ Good purchase Great balls to go with plastic...,4.452381
83746,FFFEE7703FE466554E6B5F9C21F09297,\ Cheapo zipper not for me Nice bag. As advert...,4.310345
83747,FFFF67EAA043C2DB092DBC8934077556,\ No specs -- no deal Good tent for car campin...,4.600000


In [51]:
# function to use for lambda to categorize as awesome, not awesome
def df_iter(overall):
    if overall > 4.5:
        result = 1.0
    else:
        result = 0.0
        
    return result

In [52]:
final_df['class'] = final_df.apply(lambda row: df_iter(row['overall']), axis=1)
final_df

Unnamed: 0,asin,0,overall,class
0,00018C9635D55E22BF157AA13E91226F,\ Five Stars I recommend that you pay little m...,4.090909,0.0
1,0001DE3A462B5C5D33AF3BC1053FC20C,"\ Very portable, fairly durable. Works Pretty ...",3.909091,0.0
2,00022ACC61318C98DA944B9BABD9E5AB,"\ Great product, poor shipping. Five Stars Fiv...",4.698413,1.0
3,0002C8404EBEDA230E4B66A85CEC5503,\ Small and Cheap One Star shades Great Value ...,3.400000,0.0
4,00034EBDF69991833D05B51EE7B11234,\ Pretty Good Escrima stick Not for adult spar...,4.214286,0.0
...,...,...,...,...
83744,FFFCB2EBE4D59DA2E256396B12F59FB1,"\ For The Price, Resin Pads All the way. excel...",4.941176,1.0
83745,FFFEE00C6052E1A688F4639D650AA50D,\ Good purchase Great balls to go with plastic...,4.452381,0.0
83746,FFFEE7703FE466554E6B5F9C21F09297,\ Cheapo zipper not for me Nice bag. As advert...,4.310345,0.0
83747,FFFF67EAA043C2DB092DBC8934077556,\ No specs -- no deal Good tent for car campin...,4.600000,1.0


In [21]:
# reasonably balanced!
final_df['class'].value_counts()

0.0    46668
1.0    37081
Name: class, dtype: int64

In [53]:
stemmer = SnowballStemmer("english", ignore_stopwords=True)
stop_words = stopwords.words("english")
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(preprocessor=stemmer.stem, stop_words=stop_words, ngram_range = (1,2), tokenizer=token.tokenize)
text_counts = cv.fit_transform(final_df[0])

In [56]:
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(text_counts, final_df['class'], test_size = 0.2, random_state = 5)

This is the improved logistic regression right here

In [57]:
# normal logistic regression
logreg = LogisticRegression(penalty='l2', solver='liblinear', max_iter=10000, class_weight='balanced')
logreg.fit(X_train, y_train)

# Predicting the results, calculating accuracy
y_pred = logreg.predict(X_test)
print("Accuracy of logistic regression classifier on test set: {:.2f}".format(logreg.score(X_test, y_test)))
# Compute F-1, precision, recall
f1_score = metrics.f1_score(y_test, y_pred, average='weighted')
print('Logistic regression F1: ' + str('{:04.2f}'.format(f1_score*100)) + '%')
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

Accuracy of logistic regression classifier on test set: 0.78
Logistic regression F1: 78.27%
              precision    recall  f1-score   support

         0.0       0.81      0.79      0.80      9318
         1.0       0.75      0.77      0.76      7432

    accuracy                           0.78     16750
   macro avg       0.78      0.78      0.78     16750
weighted avg       0.78      0.78      0.78     16750



This next one **literally** ran all night so pls don't

In [None]:
# boosted logistic regression
logreg = LogisticRegression(penalty='l2', solver='liblinear', max_iter=1000)
# logreg.fit(X_train, y_train)
boost_logreg = AdaBoostRegressor(logreg, n_estimators=100, random_state=123)
boost_logreg.fit(X_train, y_train)

# Predicting the results, calculating accuracy
y_pred = boost_logreg.predict(X_test)
print("Accuracy of logistic regression classifier on test set: {:.2f}".format(boost_logreg.score(X_test, y_test)))
# Compute F-1, precision, recall
f1_score = metrics.f1_score(y_test, y_pred, average='weighted')
print('Logistic regression F1: ' + str('{:04.2f}'.format(f1_score*100)) + '%')
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))