## Logistic regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.ensemble import AdaBoostRegressor, BaggingClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

In [2]:
# LOAD EM UP
df = pd.read_json('data/Sports_and_Outdoors_Reviews_training.json', lines=True)

In [3]:
def text_process(text):
    text = ' '.join(str(text).split())
    return text

In [4]:
# this one takes a lil while so be careful
# group by product and aggregate all the review text
grouped_df = df.groupby("asin")
grouped_lists = grouped_df[["summary", "reviewText"]].apply(text_process).reset_index()
grouped_lists

Unnamed: 0,asin,0
0,00018C9635D55E22BF157AA13E91226F,summary \ 2045677 Five Stars 2045678 I recomme...
1,0001DE3A462B5C5D33AF3BC1053FC20C,"summary \ 2142792 Very portable, fairly durabl..."
2,00022ACC61318C98DA944B9BABD9E5AB,"summary \ 434812 Great product, poor shipping...."
3,0002C8404EBEDA230E4B66A85CEC5503,summary \ 417817 Small and Cheap 417818 One St...
4,00034EBDF69991833D05B51EE7B11234,summary \ 91838 Pretty Good 91839 Escrima stic...
...,...,...
83744,FFFCB2EBE4D59DA2E256396B12F59FB1,"summary \ 1262012 For The Price, Resin Pads Al..."
83745,FFFEE00C6052E1A688F4639D650AA50D,summary \ 593487 Good purchase 593488 Great ba...
83746,FFFEE7703FE466554E6B5F9C21F09297,summary \ 593764 Cheapo zipper not for me 5937...
83747,FFFF67EAA043C2DB092DBC8934077556,summary \ 1496788 No specs -- no deal 1496789 ...


In [5]:
# get the mean of all the ratings for a product
mean_df = grouped_df['overall'].mean()
mean_df = mean_df.reset_index()
mean_df

Unnamed: 0,asin,overall
0,00018C9635D55E22BF157AA13E91226F,4.090909
1,0001DE3A462B5C5D33AF3BC1053FC20C,3.909091
2,00022ACC61318C98DA944B9BABD9E5AB,4.698413
3,0002C8404EBEDA230E4B66A85CEC5503,3.400000
4,00034EBDF69991833D05B51EE7B11234,4.214286
...,...,...
83744,FFFCB2EBE4D59DA2E256396B12F59FB1,4.941176
83745,FFFEE00C6052E1A688F4639D650AA50D,4.452381
83746,FFFEE7703FE466554E6B5F9C21F09297,4.310345
83747,FFFF67EAA043C2DB092DBC8934077556,4.600000


In [6]:
# merge product mean ratings + aggregated review text
final_df = pd.merge(grouped_lists, mean_df, on="asin")
final_df

Unnamed: 0,asin,0,overall
0,00018C9635D55E22BF157AA13E91226F,summary \ 2045677 Five Stars 2045678 I recomme...,4.090909
1,0001DE3A462B5C5D33AF3BC1053FC20C,"summary \ 2142792 Very portable, fairly durabl...",3.909091
2,00022ACC61318C98DA944B9BABD9E5AB,"summary \ 434812 Great product, poor shipping....",4.698413
3,0002C8404EBEDA230E4B66A85CEC5503,summary \ 417817 Small and Cheap 417818 One St...,3.400000
4,00034EBDF69991833D05B51EE7B11234,summary \ 91838 Pretty Good 91839 Escrima stic...,4.214286
...,...,...,...
83744,FFFCB2EBE4D59DA2E256396B12F59FB1,"summary \ 1262012 For The Price, Resin Pads Al...",4.941176
83745,FFFEE00C6052E1A688F4639D650AA50D,summary \ 593487 Good purchase 593488 Great ba...,4.452381
83746,FFFEE7703FE466554E6B5F9C21F09297,summary \ 593764 Cheapo zipper not for me 5937...,4.310345
83747,FFFF67EAA043C2DB092DBC8934077556,summary \ 1496788 No specs -- no deal 1496789 ...,4.600000


In [7]:
# function to use for lambda to categorize as awesome, not awesome
def df_iter(overall):
    if overall > 4.5:
        result = 1.0
    else:
        result = 0.0
        
    return result

In [8]:
final_df['class'] = final_df.apply(lambda row: df_iter(row['overall']), axis=1)
final_df

Unnamed: 0,asin,0,overall,class
0,00018C9635D55E22BF157AA13E91226F,summary \ 2045677 Five Stars 2045678 I recomme...,4.090909,0.0
1,0001DE3A462B5C5D33AF3BC1053FC20C,"summary \ 2142792 Very portable, fairly durabl...",3.909091,0.0
2,00022ACC61318C98DA944B9BABD9E5AB,"summary \ 434812 Great product, poor shipping....",4.698413,1.0
3,0002C8404EBEDA230E4B66A85CEC5503,summary \ 417817 Small and Cheap 417818 One St...,3.400000,0.0
4,00034EBDF69991833D05B51EE7B11234,summary \ 91838 Pretty Good 91839 Escrima stic...,4.214286,0.0
...,...,...,...,...
83744,FFFCB2EBE4D59DA2E256396B12F59FB1,"summary \ 1262012 For The Price, Resin Pads Al...",4.941176,1.0
83745,FFFEE00C6052E1A688F4639D650AA50D,summary \ 593487 Good purchase 593488 Great ba...,4.452381,0.0
83746,FFFEE7703FE466554E6B5F9C21F09297,summary \ 593764 Cheapo zipper not for me 5937...,4.310345,0.0
83747,FFFF67EAA043C2DB092DBC8934077556,summary \ 1496788 No specs -- no deal 1496789 ...,4.600000,1.0


In [21]:
# reasonably balanced!
final_df['class'].value_counts()

0.0    46668
1.0    37081
Name: class, dtype: int64

In [9]:
stemmer = SnowballStemmer("english", ignore_stopwords=True)
stop_words = stopwords.words("english")
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(preprocessor=stemmer.stem, stop_words=stop_words, ngram_range = (1,2), tokenizer=token.tokenize)
text_counts = cv.fit_transform(final_df[0])

In [10]:
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(text_counts, final_df['class'], test_size = 0.25, random_state = 5)

This is the improved logistic regression right here

In [12]:
# normal logistic regression
logreg = LogisticRegression(penalty='l2', solver='liblinear', max_iter=10000, class_weight='balanced')
logreg.fit(X_train, y_train)

# Predicting the results, calculating accuracy
y_pred = logreg.predict(X_test)
print("Accuracy of logistic regression classifier on test set: {:.2f}".format(logreg.score(X_test, y_test)))
# Compute F-1, precision, recall
f1_score = metrics.f1_score(y_test, y_pred, average='weighted')
print('Logistic regression F1: ' + str('{:04.2f}'.format(f1_score*100)) + '%')
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


convergence after 31 epochs took 16 seconds
Accuracy of logistic regression classifier on test set: 0.78


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   15.9s finished


Accuracy of logistic regression classifier on test set: 0.79
Logistic regression F1: 78.62%
              precision    recall  f1-score   support

         0.0       0.82      0.79      0.80     11634
         1.0       0.75      0.78      0.76      9304

    accuracy                           0.79     20938
   macro avg       0.78      0.79      0.78     20938
weighted avg       0.79      0.79      0.79     20938

LR Precision-Recall: 37.26%


here's the ***boosting***

In [23]:
# boosted logistic regression
logreg = LogisticRegression(penalty='l2',
                            solver='saga',
                            tol=1e-2,
                            max_iter=10000,
                            class_weight='balanced',
                            verbose=True,
                            n_jobs=-1)
# logreg.fit(X_train, y_train)
boost_logreg = AdaBoostClassifier(logreg,
                                  n_estimators=5,
                                  learning_rate=1,
                                  algorithm='SAMME')
boost_logreg.fit(X_train, y_train)

# Predicting the results, calculating accuracy
y_pred = boost_logreg.predict(X_test)
# print("Accuracy of logistic regression classifier on test set: {:.2f}".format(boost_logreg.score(X_test, y_test)))
# Compute F-1, precision, recall
f1_score = metrics.f1_score(y_test, y_pred, average='weighted')
print('Logistic regression F1: ' + str('{:04.2f}'.format(f1_score*100)) + '%')
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


convergence after 90 epochs took 48 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   47.8s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


convergence after 85 epochs took 46 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   45.3s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


convergence after 84 epochs took 44 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   44.2s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


convergence after 84 epochs took 44 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   43.8s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


convergence after 84 epochs took 45 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   44.3s finished


Logistic regression F1: 27.38%
              precision    recall  f1-score   support

     awesome       0.44      1.00      0.62      9304
         not       1.00      0.00      0.00     11634

    accuracy                           0.44     20938
   macro avg       0.72      0.50      0.31     20938
weighted avg       0.75      0.44      0.27     20938



In [24]:
unique, counts = np.unique(y_pred, return_counts=True)
dict(zip(unique, counts))

{'awesome': 20934, 'not': 4}