In [79]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
from sklearn.metrics import accuracy_score, f1_score,roc_auc_score,accuracy_score, confusion_matrix, roc_curve
from sklearn.model_selection import StratifiedKFold
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
import helper_functions as hf
from sklearn.linear_model import LogisticRegressionCV
# Fixed seed passed as `random_state` to the LogisticRegressionCV models below.
seed = 123

In [2]:
# Load the train and test frames from local pickle files (presumably a split prepared earlier).
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
data_train = pd.read_pickle('train_set.pkl')
data_test = pd.read_pickle('test_set.pkl')

In [3]:
# Peek at the first rows to check the available columns (review text, labels, metadata).
data_train.head()

Unnamed: 0_level_0,clothing_ID,Age,Title,Text,RAW_Text,review_len,Positive_fb_count,Division,Department,Class,Positively_rated,Reco
original_ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
19245,829,34,flirty and fun,i am 5 5 145 lbs a cup depending on the st...,"I am 5'5"", 145 lbs, a cup. depending on the st...",504,6,General Petite,Tops,Blouses,1,1
22446,768,41,,so thin and see thru it s not even wearable in...,So thin and see thru it's not even wearable in...,188,4,Initmates,Intimate,Lounge,0,0
3813,792,29,cute comfy,just bought these today and they are super sof...,Just bought these today and they are super sof...,286,0,Initmates,Intimate,Sleep,1,1
22549,850,52,sweet top,lightweight silk with sweet details ruffles ...,Lightweight silk with sweet details (ruffles) ...,111,0,General,Tops,Blouses,1,1
3417,1056,24,cute capris fit great,i bought the size 27 in these i m usually a ...,I bought the size 27 in these - i'm usually a ...,239,0,General Petite,Bottoms,Pants,1,1


In [41]:
# Features: the raw (uncleaned) review text; target: the binary Positively_rated flag.
X = np.array(data_train["RAW_Text"])
y = np.array(data_train["Positively_rated"])

In [66]:
# Baseline TF-IDF vectorizer with default settings.
# Parameter to tune later: ngram_range.
# min_df is deliberately left untuned: the reviews tend to be short, so repeated
# words should be rare apart from stopwords.
vect = TfidfVectorizer()

Let's try it with default parameters

In [67]:
# Learn the vocabulary / IDF weights from the raw reviews and build the TF-IDF matrix.
# NOTE(review): the vectorizer is fit on all of X before the helper scores the model;
# if the helper cross-validates, IDF weights have seen the validation folds -- confirm this is acceptable.
X_vect = vect.fit_transform(X)

In [68]:
# L2-regularised logistic regression; the built-in 5-fold CV picks C from the grid.
model = LogisticRegressionCV(
    Cs=[1, 5, 10],
    cv=5,
    penalty='l2',
    max_iter=500,
    random_state=seed,
    multi_class='ovr',
)
# Fit and score on the default (unigram) TF-IDF features via the project helper.
fitted_model, results = hf.model_fit_train_score_skf(model, X_vect, y)

In [69]:
# Report the mean accuracy, F1 and AUC stored in `results` by the helper
# (presumably averaged over its stratified folds).
print(f"Accuracy: {results['Accuracy_mean']:.3f} \nF1 score: {results['F1_mean']:.3f} \nAUC score: {results['AUC_mean']:.3f}")

Accuracy: 0.879 
F1 score: 0.924 
AUC score: 0.931


In [70]:
# Regularisation strength C selected by LogisticRegressionCV from the `Cs` grid.
fitted_model.C_

array([5])

In [71]:
# Inspect which terms push the prediction hardest towards each class.
# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()`; prefer the new accessor and fall back to the old
# one so the cell runs on both old and new installs.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = np.asarray(vect.get_feature_names_out())
else:
    feature_names = np.array(vect.get_feature_names())

# Sort the coefficients of the estimator returned by the helper, rather than
# relying on `model` having been fitted in place as a side effect.
sorted_coef_index = fitted_model.coef_[0].argsort()

# Find the 10 smallest and 10 largest coefficients
# The 10 largest coefficients are being indexed using [:-11:-1]
# so the list returned is in order of largest to smallest
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['disappointed' 'wanted' 'cheap' 'huge' 'unflattering' 'bad' 'returned'
 'returning' 'however' 'poor']

Largest Coefs: 
['perfect' 'love' 'comfortable' 'compliments' 'perfectly' 'great'
 'sometimes' 'little' 'glad' 'beautifully']


Great, now let's explore whether removing stopwords or lemmatization has an impact on the model

In [49]:
# Load spaCy's small English pipeline; `nlp` is used by the lemmatization helper below.
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()
# spaCy's built-in English stop-word set as a list (not referenced again in the visible cells).
stopwords = list(spacy.lang.en.stop_words.STOP_WORDS)

In [55]:
def lemma_and_stop(text):
    """Lemmatize every document in `text` using the module-level spaCy pipeline `nlp`.

    Parameters
    ----------
    text : iterable
        Documents forwarded to ``nlp.pipe``.

    Returns
    -------
    tuple of (list, list)
        The first list holds, per document, the space-joined lemmas of all tokens.
        The second list holds the same, skipping tokens flagged ``is_stop``.
        A document whose ``is_parsed`` flag is falsy contributes ``None`` to both
        lists, so both outputs keep one entry per input document.
    """
    all_lemmas = []
    content_lemmas = []

    for parsed in nlp.pipe(text):
        if not parsed.is_parsed:
            # Keep the outputs aligned with the input rows when the parse is missing.
            # NOTE(review): a downstream vectorizer may not accept None -- confirm.
            all_lemmas.append(None)
            content_lemmas.append(None)
            continue
        all_lemmas.append(" ".join(tok.lemma_ for tok in parsed))
        content_lemmas.append(" ".join(tok.lemma_ for tok in parsed if not tok.is_stop))
    return all_lemmas, content_lemmas

In [56]:
# Lemmatize the raw training reviews: first result keeps all tokens, second drops stop words.
lemma_X,lem_and_stop_X = lemma_and_stop(data_train.RAW_Text)

In [57]:
# TF-IDF on the lemmatized text.
# NOTE(review): this rebinds `vect`, the name of the vectorizer fitted on the raw text;
# cells that read `vect` afterwards see this vocabulary unless the raw-text cell is re-run.
vect = TfidfVectorizer()
X_vect_lemma = vect.fit_transform(lemma_X)

In [58]:
# Same L2 logistic regression, with a slightly wider C grid, on the lemmatized features.
model = LogisticRegressionCV(
    Cs=[0.5, 1, 5, 10],
    cv=5,
    penalty='l2',
    max_iter=500,
    random_state=seed,
    multi_class='ovr',
)
fitted_model, results = hf.model_fit_train_score_skf(model, X_vect_lemma, y)

In [59]:
# Mean accuracy, F1 and AUC for the lemmatized-text model, as stored in `results` by the helper.
print(f"Accuracy: {results['Accuracy_mean']:.3f} \nF1 score: {results['F1_mean']:.3f} \nAUC score: {results['AUC_mean']:.3f}")

Accuracy: 0.877 
F1 score: 0.922 
AUC score: 0.925


In [60]:
# Regularisation strength C selected from the `Cs` grid for the lemmatized-text model.
fitted_model.C_

array([5.])

In [61]:
# TF-IDF on the lemmatized text with stop words removed.
vect = TfidfVectorizer()
X_vect_lemma_stop = vect.fit_transform(lem_and_stop_X)

# Same L2 logistic regression and C grid as for the lemma-only features.
model = LogisticRegressionCV(
    Cs=[0.5, 1, 5, 10],
    cv=5,
    penalty='l2',
    max_iter=500,
    random_state=seed,
    multi_class='ovr',
)

fitted_model, results = hf.model_fit_train_score_skf(model, X_vect_lemma_stop, y)

In [62]:
# Mean accuracy, F1 and AUC for the lemma + stop-word-removal model, as stored in `results`.
print(f"Accuracy: {results['Accuracy_mean']:.3f} \nF1 score: {results['F1_mean']:.3f} \nAUC score: {results['AUC_mean']:.3f}")

Accuracy: 0.868 
F1 score: 0.917 
AUC score: 0.912


Same conclusion as for the BOW model: lemmatization and stop-word removal do not improve the model's performance

Next, we will explore fitting Multinomial Naive Bayes (MNB) and a support vector classifier (SVC)

In [64]:
# Comparison model: Multinomial Naive Bayes with default settings, scored with the
# same helper on `X_vect` (the TF-IDF matrix built from the raw text).
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
fitted_model,results = hf.model_fit_train_score_skf(clf,X_vect, y)

In [65]:
# Mean accuracy, F1 and AUC for the Multinomial Naive Bayes model, as stored in `results`.
print(f"Accuracy: {results['Accuracy_mean']:.3f} \nF1 score: {results['F1_mean']:.3f} \nAUC score: {results['AUC_mean']:.3f}")

Accuracy: 0.783 
F1 score: 0.877 
AUC score: 0.909


In [74]:
# TF-IDF over unigrams and bigrams (ngram_range is the parameter singled out for tuning).
vect_2 = TfidfVectorizer(ngram_range=(1, 2))

In [75]:
# Fit the unigram+bigram vectorizer on the raw reviews and build its TF-IDF matrix.
X_vect2 = vect_2.fit_transform(X)

In [76]:
# L2 logistic regression on the unigram+bigram features; the C grid also tries 0.1.
model = LogisticRegressionCV(
    Cs=[0.1, 1, 5, 10],
    cv=5,
    penalty='l2',
    max_iter=500,
    random_state=seed,
    multi_class='ovr',
)
fitted_model, results = hf.model_fit_train_score_skf(model, X_vect2, y)

In [77]:
# Mean accuracy, F1 and AUC for the unigram+bigram model, as stored in `results`.
print(f"Accuracy: {results['Accuracy_mean']:.3f} \nF1 score: {results['F1_mean']:.3f} \nAUC score: {results['AUC_mean']:.3f}")

Accuracy: 0.889 
F1 score: 0.930 
AUC score: 0.941


In [81]:
# Inspect which unigrams/bigrams push the prediction hardest towards each class.
# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()`; prefer the new accessor and fall back to the old
# one so the cell runs on both old and new installs.
if hasattr(vect_2, 'get_feature_names_out'):
    feature_names2 = np.asarray(vect_2.get_feature_names_out())
else:
    feature_names2 = np.array(vect_2.get_feature_names())

# Sort the coefficients of the estimator returned by the helper, rather than
# relying on `model` having been fitted in place as a side effect.
sorted_coef_index = fitted_model.coef_[0].argsort()

# Find the 10 smallest and 10 largest coefficients
# The 10 largest coefficients are being indexed using [:-11:-1]
# so the list returned is in order of largest to smallest
print('Smallest Coefs:\n{}\n'.format(feature_names2[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names2[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['disappointed' 'to love' 'cheap' 'huge' 'not' 'not flattering'
 'unflattering' 'returned' 'was' 'however']

Largest Coefs: 
['perfect' 'love' 'comfortable' 'great' 'not too' 'love this' 'perfectly'
 'fits' 'little' 'compliments']


In [82]:
# Display the full results mapping from the last helper call: prediction and
# probability arrays (apparently one per fold) plus the mean metrics.
# NOTE(review): this prints large arrays; consider showing only the *_mean entries.
results

defaultdict(None,
            {'predictions': [array([1, 0, 1, ..., 0, 0, 0], dtype=int64),
              array([0, 1, 1, ..., 1, 0, 1], dtype=int64),
              array([1, 1, 1, ..., 0, 1, 0], dtype=int64),
              array([1, 1, 1, ..., 1, 0, 0], dtype=int64),
              array([1, 1, 1, ..., 1, 1, 0], dtype=int64)],
             'predict_proba': [array([0.80219119, 0.01449929, 0.93160834, ..., 0.01588034, 0.06728294,
                     0.09835457]),
              array([0.40936969, 0.98746757, 0.98231115, ..., 0.63226826, 0.22601763,
                     0.81526476]),
              array([0.99710456, 0.99040941, 0.97165812, ..., 0.28169199, 0.98061749,
                     0.17454011]),
              array([0.99711825, 0.98876105, 0.99753555, ..., 0.50540558, 0.0312755 ,
                     0.23795048]),
              array([0.99519003, 0.99778335, 0.97877786, ..., 0.72942468, 0.87494089,
                     0.46656751])],
             'Accuracy_mean': 0.8888100336932595