# Online reviews for women's clothing
Data set from [Kaggle](https://www.kaggle.com/nicapotato/womens-ecommerce-clothing-reviews/home)

In [225]:
import time
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

import warnings
warnings.filterwarnings("ignore")

## Load data and set-up dataframe

In [30]:
df = pd.read_csv('womens_clothing_reviews.csv')
print("Rows: {}\nColumns: {}".format(df.shape[0], df.shape[1]))
df.head()

Rows: 23486
Columns: 11


Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [31]:
df = df[["Review Text", "Rating"]]
df.columns = ["review_text", "rating"]
df.head()

Unnamed: 0,review_text,rating
0,Absolutely wonderful - silky and sexy and comf...,4
1,Love this dress! it's sooo pretty. i happene...,5
2,I had such high hopes for this dress and reall...,3
3,"I love, love, love this jumpsuit. it's fun, fl...",5
4,This shirt is very flattering to all due to th...,5


In [32]:
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22641 entries, 0 to 23485
Data columns (total 2 columns):
review_text    22641 non-null object
rating         22641 non-null int64
dtypes: int64(1), object(1)
memory usage: 530.6+ KB


In [33]:
df["pos_rating"] = df["rating"].apply(lambda x: 1 if x >= 4 else 0)
df = df[["review_text", "pos_rating"]]
df.head()

Unnamed: 0,review_text,pos_rating
0,Absolutely wonderful - silky and sexy and comf...,1
1,Love this dress! it's sooo pretty. i happene...,1
2,I had such high hopes for this dress and reall...,0
3,"I love, love, love this jumpsuit. it's fun, fl...",1
4,This shirt is very flattering to all due to th...,1


In [73]:
X = df["review_text"]
y = df["pos_rating"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

In [120]:
df["pos_rating"].sum() / len(df["pos_rating"])

0.7706373393401351

## Preprocess text

In [144]:
def preprocess_text(text):
    # Remove punctuation
    no_punc = [char for char in text if char not in string.punctuation]
    no_punc = "".join(no_punc).strip()
    return no_punc.split()

### Bag o' words

In [179]:
def run_bow_model(model, X_train, X_test, y_train, y_test, cv=3):
    bow_pipe = Pipeline([("bow", CountVectorizer(analyzer=preprocess_text)), ("clf", model)])
    bow_pipe.fit(X_train, y_train)
    scores = cross_val_score(bow_pipe, X_train, y_train, cv=cv, n_jobs=-1)
    print("CV accuracy: {:.3f} +/- {:.3f}".format(np.mean(scores), np.std(scores)))
    print("Test accuracy: {:.3f}".format(bow_pipe.score(X_test, y_test)))
    print("Training scores:", scores)

In [180]:
run_bow_model(BernoulliNB(), X_train, X_test, y_train, y_test, cv=5)

CV accuracy: 0.859 +/- 0.004
Test accuracy: 0.863
Training scores: [0.85867508 0.86561514 0.85615142 0.85547491 0.85736825]


In [181]:
run_bow_model(KNeighborsClassifier(), X_train, X_test, y_train, y_test, cv=5)

CV accuracy: 0.786 +/- 0.003
Test accuracy: 0.784
Training scores: [0.78044164 0.78548896 0.78864353 0.78826128 0.7866835 ]


In [182]:
run_bow_model(LogisticRegression(), X_train, X_test, y_train, y_test, cv=5)

CV accuracy: 0.870 +/- 0.002
Test accuracy: 0.870
Training scores: [0.86876972 0.87255521 0.86656151 0.87062165 0.87251499]


In [183]:
run_bow_model(RandomForestClassifier(), X_train, X_test, y_train, y_test, cv=5)

CV accuracy: 0.818 +/- 0.005
Test accuracy: 0.820
Training scores: [0.81671924 0.82712934 0.81419558 0.81760808 0.81350584]


In [184]:
run_bow_model(SVC(max_iter=50), X_train, X_test, y_train, y_test, cv=5)

CV accuracy: 0.644 +/- 0.008
Test accuracy: 0.638
Training scores: [0.63659306 0.65804416 0.63911672 0.65036289 0.63805617]


In [185]:
run_bow_model(GradientBoostingClassifier(), X_train, X_test, y_train, y_test, cv=5)

CV accuracy: 0.837 +/- 0.005
Test accuracy: 0.842
Training scores: [0.83722397 0.8444795  0.83501577 0.8299148  0.83811928]


### TF-IDF

In [216]:
def run_tfidf_model(model, X_train, X_test, y_train, y_test, cv=3):
    tfidf_pipe = Pipeline([("tfidf", TfidfVectorizer(tokenizer=preprocess_text)),("clf", model)])
    tfidf_pipe.fit(X_train, y_train)
    scores = cross_val_score(tfidf_pipe, X_train, y_train, cv=cv, n_jobs=-1)
    print("CV accuracy: {:.3f} +/- {:.3f}".format(np.mean(scores), np.std(scores)))
    print("Test accuracy: {:.3f}".format(tfidf_pipe.score(X_test, y_test)))
    print("Training scores:", scores)

In [217]:
run_tfidf_model(BernoulliNB(), X_train, X_test, y_train, y_test, cv=5)

CV accuracy: 0.858 +/- 0.004
Test accuracy: 0.863
Training scores: [0.86025237 0.86498423 0.85394322 0.85295046 0.85610603]


In [218]:
run_tfidf_model(KNeighborsClassifier(), X_train, X_test, y_train, y_test, cv=5)

CV accuracy: 0.821 +/- 0.006
Test accuracy: 0.825
Training scores: [0.82239748 0.82239748 0.81861199 0.81003471 0.82928369]


In [219]:
run_tfidf_model(LogisticRegression(), X_train, X_test, y_train, y_test, cv=5)

CV accuracy: 0.878 +/- 0.005
Test accuracy: 0.874
Training scores: [0.87476341 0.88328076 0.87444795 0.87156832 0.88482171]


In [220]:
run_tfidf_model(RandomForestClassifier(), X_train, X_test, y_train, y_test, cv=5)

CV accuracy: 0.819 +/- 0.004
Test accuracy: 0.818
Training scores: [0.81861199 0.82397476 0.82208202 0.81255917 0.81792364]


In [221]:
run_tfidf_model(SVC(max_iter=50), X_train, X_test, y_train, y_test, cv=5)

CV accuracy: 0.752 +/- 0.013
Test accuracy: 0.791
Training scores: [0.74416404 0.75425868 0.77223975 0.73398548 0.7551278 ]


In [222]:
run_tfidf_model(GradientBoostingClassifier(), X_train, X_test, y_train, y_test, cv=5)

CV accuracy: 0.840 +/- 0.005
Test accuracy: 0.841
Training scores: [0.8384858  0.84700315 0.83564669 0.8343326  0.84316819]


### Model tuning

In [228]:
param_grid = [ {"clf__penalty": ["l1", "l2"], "clf__C": [1.0, 10.0, 100.0]}]
tfidf_pipe = Pipeline([("tfidf", TfidfVectorizer(tokenizer=preprocess_text)),("clf", LogisticRegression())])
gs_lgr_tfidf = GridSearchCV(tfidf_pipe, param_grid, scoring="accuracy", cv=5, verbose=1, n_jobs=-1)

start_time = time.time()
gs_lgr_tfidf.fit(X_train, y_train)
elpased_time = time.time() - start_time

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   22.0s finished


In [230]:
print("----- {:.6f} seconds -----".format(elpased_time))
print("Best parameter set: {}".format(gs_lgr_tfidf.best_params_))
print("CV accuracy: {:.3f}".format(gs_lgr_tfidf.best_score_))

clf = gs_lgr_tfidf.best_estimator_
print("Test accuracy: {:.3f}".format(clf.score(X_test, y_test)))

----- 23.419274 seconds -----
Best parameter set: {'clf__C': 10.0, 'clf__penalty': 'l2'}
CV accuracy: 0.879


In [234]:
param_grid = [ {"clf__n_estimators": [500, 1000]}]
tfidf_pipe = Pipeline([("tfidf", TfidfVectorizer(tokenizer=preprocess_text)),
                       ("clf", GradientBoostingClassifier())])
gs_lgr_tfidf = GridSearchCV(tfidf_pipe, param_grid, scoring="accuracy", cv=5, verbose=1, n_jobs=-1)

start_time = time.time()
gs_lgr_tfidf.fit(X_train, y_train)
elpased_time = time.time() - start_time

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:  4.6min remaining:  3.0min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  6.0min finished


In [235]:
print("----- {:.6f} seconds -----".format(elpased_time))
print("Best parameter set: {}".format(gs_lgr_tfidf.best_params_))
print("CV accuracy: {:.3f}".format(gs_lgr_tfidf.best_score_))
clf = gs_lgr_tfidf.best_estimator_
print("Test accuracy: {:.3f}".format(clf.score(X_test, y_test)))

----- 579.290218 seconds -----
Best parameter set: {'clf__n_estimators': 1000}
CV accuracy: 0.866
Test accuracy: 0.870
