In [1]:
import numpy as np
import pandas as pd
from collections import Counter

In [2]:
df = pd.read_csv('data.csv')

## Exploration

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23486 entries, 0 to 23485
Data columns (total 11 columns):
Unnamed: 0                 23486 non-null int64
Clothing ID                23486 non-null int64
Age                        23486 non-null int64
Title                      19676 non-null object
Review Text                22641 non-null object
Rating                     23486 non-null int64
Recommended IND            23486 non-null int64
Positive Feedback Count    23486 non-null int64
Division Name              23472 non-null object
Department Name            23472 non-null object
Class Name                 23472 non-null object
dtypes: int64(6), object(5)
memory usage: 2.0+ MB


In [5]:
df = df.dropna(axis=0, subset=['Review Text'])

In [6]:
df['Recommended IND'].value_counts()

1    18540
0     4101
Name: Recommended IND, dtype: int64

In [7]:
# we want a balanced dataset, hence we will undersample the positive review
df_positive = df[df['Recommended IND'] == 1].sample(4101)

In [8]:
df_negative = df[df['Recommended IND'] == 0]

In [9]:
df_2 = pd.concat([df_positive, df_negative])

In [10]:
positive_counts = Counter()
negative_counts = Counter()
total_counts = Counter()

In [11]:
for idx, row in df_2.iterrows():
    for word in row['Review Text'].split(' '):
        if row['Recommended IND'] == 1:
            positive_counts[word] += 1
            total_counts[word] += 1
        else:
            negative_counts[word] += 1
            total_counts[word] += 1

In [12]:
## common positive words?
positive_counts.most_common()

[('the', 12804),
 ('and', 8944),
 ('i', 8891),
 ('a', 7914),
 ('it', 6505),
 ('is', 5551),
 ('to', 4365),
 ('this', 3682),
 ('in', 3605),
 ('but', 2777),
 ('with', 2496),
 ('for', 2474),
 ('of', 2289),
 ('on', 2285),
 ('so', 2047),
 ('my', 1932),
 ('was', 1764),
 ('that', 1641),
 ('I', 1548),
 ('not', 1526),
 ('dress', 1438),
 ('love', 1436),
 ('very', 1384),
 ('have', 1380),
 ('are', 1367),
 ('size', 1275),
 ('be', 1222),
 ('wear', 1184),
 ("it's", 1182),
 ('as', 1178),
 ('like', 1149),
 ('or', 1118),
 ('am', 1058),
 ('fit', 1050),
 ('just', 927),
 ("i'm", 925),
 ('you', 884),
 ('great', 883),
 ('they', 874),
 ('top', 836),
 ('too', 836),
 ('would', 809),
 ('This', 804),
 ('at', 796),
 ('little', 738),
 ('up', 722),
 ('more', 697),
 ('will', 676),
 ('really', 665),
 ('me', 663),
 ('ordered', 650),
 ('fabric', 641),
 ('can', 639),
 ('if', 628),
 ('color', 606),
 ('because', 597),
 ('an', 584),
 ('one', 577),
 ('fits', 566),
 ('than', 562),
 ('bought', 554),
 ('bit', 547),
 ('look', 538

In [13]:
negative_counts.most_common()

[('the', 15661),
 ('i', 8752),
 ('and', 8111),
 ('it', 7496),
 ('a', 6802),
 ('is', 4797),
 ('to', 4546),
 ('was', 4137),
 ('this', 4036),
 ('in', 3701),
 ('but', 3498),
 ('on', 2885),
 ('of', 2851),
 ('not', 2367),
 ('for', 2318),
 ('so', 2020),
 ('my', 1799),
 ('I', 1792),
 ('that', 1672),
 ('like', 1671),
 ('have', 1551),
 ('very', 1531),
 ('dress', 1470),
 ('with', 1400),
 ('too', 1369),
 ('would', 1304),
 ('be', 1258),
 ('just', 1224),
 ('top', 1155),
 ('as', 1153),
 ('are', 1107),
 ('fit', 1079),
 ('at', 1046),
 ('fabric', 986),
 ('size', 947),
 ("it's", 922),
 ('look', 886),
 ('am', 874),
 ('had', 868),
 ('if', 861),
 ('love', 855),
 ('you', 838),
 ('really', 835),
 ('me', 807),
 ('they', 795),
 ('ordered', 793),
 ("i'm", 761),
 ('when', 720),
 ('wear', 675),
 ('it.', 675),
 ('an', 668),
 ('This', 659),
 ('much', 653),
 ('or', 633),
 ('because', 625),
 ('more', 608),
 ('small', 597),
 ('were', 597),
 ('looks', 594),
 ('than', 593),
 ('even', 585),
 ('me.', 582),
 ('material', 57

In [14]:
## ratios of words for their appearance in positive / in negative

pos_neg_ratios = Counter()

for term,count in list(total_counts.most_common()):
    if(count > 100):
        pos_neg_ratio = positive_counts[term] / float(negative_counts[term]+1)
        pos_neg_ratios[term] = pos_neg_ratio

In [15]:
pos_neg_ratios.most_common()

[('dressed', 21.333333333333332),
 ('perfectly.', 10.285714285714286),
 ('compliments', 9.461538461538462),
 ('perfect.', 7.0),
 ('glad', 5.909090909090909),
 ('Love', 5.868421052631579),
 ('it!', 5.15),
 ('comfortable.', 4.709677419354839),
 ('jeans.', 4.615384615384615),
 ('easy', 4.354838709677419),
 ('happy', 4.15625),
 ('fits', 3.6993464052287583),
 ('casual', 3.5319148936170213),
 ('fun', 3.4782608695652173),
 ('comfortable', 3.4622641509433962),
 ('perfect', 3.461038961038961),
 ('dress!', 3.3448275862068964),
 ('navy', 3.076923076923077),
 ('summer', 3.0163934426229506),
 ('wait', 2.975609756097561),
 ('worn', 2.9027777777777777),
 ('leggings', 2.8823529411764706),
 ('perfectly', 2.8333333333333335),
 ('comfortable,', 2.8125),
 ('favorite', 2.757575757575758),
 ('jeans', 2.6923076923076925),
 ('unique', 2.675),
 ('comfy', 2.6315789473684212),
 ('can', 2.6296296296296298),
 ('fall', 2.593220338983051),
 ('slightly', 2.5416666666666665),
 ('skinny', 2.5384615384615383),
 ('many',

In [16]:
text_length = Counter()
for idx, row in df_2.iterrows():
    text_length[idx] = len(row['Review Text'].split())

In [17]:
text_length.most_common()[-10:-1]

[(21640, 5),
 (22126, 5),
 (4386, 4),
 (23027, 4),
 (11214, 4),
 (16863, 4),
 (15265, 3),
 (8605, 3),
 (9121, 3)]

In [18]:
text_length.most_common()

[(16880, 115),
 (18708, 113),
 (6697, 111),
 (9193, 111),
 (13331, 110),
 (5249, 110),
 (21804, 110),
 (7025, 110),
 (17330, 110),
 (6951, 109),
 (1653, 109),
 (17426, 109),
 (4840, 109),
 (9235, 109),
 (20448, 109),
 (12466, 109),
 (10091, 109),
 (12376, 109),
 (13631, 109),
 (13646, 109),
 (16807, 109),
 (18619, 109),
 (6917, 108),
 (4978, 108),
 (16950, 108),
 (6779, 108),
 (7208, 108),
 (4397, 108),
 (15718, 108),
 (17678, 108),
 (18856, 108),
 (19958, 108),
 (2518, 107),
 (1252, 107),
 (19787, 107),
 (13327, 107),
 (19367, 107),
 (16497, 107),
 (8669, 107),
 (6228, 107),
 (1260, 107),
 (19373, 107),
 (17931, 107),
 (11415, 107),
 (21487, 107),
 (2819, 107),
 (23396, 107),
 (20692, 107),
 (19507, 107),
 (4297, 107),
 (4890, 107),
 (12725, 107),
 (12881, 107),
 (13993, 107),
 (16787, 107),
 (18117, 107),
 (18276, 107),
 (18818, 107),
 (19551, 107),
 (23145, 107),
 (23304, 107),
 (7547, 106),
 (6245, 106),
 (2865, 106),
 (15204, 106),
 (9657, 106),
 (18576, 106),
 (12995, 106),
 (196

## Preprocessing

We will clean punctuations from all the review text

In [19]:
from string import punctuation

In [20]:
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [21]:
def remove_punctuation(txt):
    for p in punctuation:
        txt = txt.replace(p, '')
    
    # also remove \n and \r
    txt = txt.replace('\n', '')
    txt = txt.replace('\r', '')
    return txt

In [22]:
df_2['Review Text'] = df_2['Review Text'].apply(remove_punctuation)

In [28]:
import nltk
from nltk.corpus import stopwords

In [30]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/grandia/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [31]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [45]:
def remove_stopwords(txt):
    cleaned_text = []
    stops = stopwords.words('english')
    # 'I' seems to be missing
    stops.append('I')
    for word in txt.split():
        if word not in stops:
            cleaned_text.append(word)
    
    text = ' '.join(cleaned_text)
        
    return text

In [46]:
df_2['Review Text'] = df_2['Review Text'].apply(remove_stopwords)

### Word Encoding

In [47]:
## build vocab dictionary
combined_text = df_2['Review Text'].str.cat(sep=' ')
words = combined_text.split()

In [48]:
counts = Counter(words)

In [69]:
len(counts)

12524

In [49]:
# sort the vocab based on counts, so that most common word has lower int representation
vocab = sorted(counts, key=counts.get, reverse=True)

In [50]:
# start the coding with 1, since we will use 0 for padding
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

In [51]:
df_3 = df_2.copy()

In [52]:
# transform each review text to int representation
df_3['Review Text Int'] = df_3['Review Text'].apply(lambda x: [vocab_to_int[word] for word in x.split()])

In [53]:
def pad_with_zeros(x):
    return_list = [0] * 113
    count_left_zero = 113 - len(x)
    for idx, word_int in enumerate(x):
        return_list[idx + count_left_zero] = word_int
    
    return return_list

In [54]:
# left pad reviews that are less than 113 words with 0s
df_3['Review Text Int'] = df_3['Review Text Int'].apply(pad_with_zeros)

In [55]:
df_3['Review Text Int'] = df_3['Review Text Int'].apply(lambda x: np.array(x))

## Train and Test Set

In [56]:
from sklearn.model_selection import train_test_split

In [57]:
# convert from array of arrays to 2D array
X = np.stack(df_3['Review Text Int'].as_matrix())
y = df_3['Recommended IND']

  


In [58]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

## Neural Network

In [59]:
from keras.models import Sequential
from keras.layers import LSTM, Embedding, Dense

Using TensorFlow backend.


In [60]:
lstm_size = 113
embed_size = 400

In [61]:
model = Sequential()
model.add(Embedding(len(counts)+1, embed_size, input_length=113))
model.add(LSTM(lstm_size))
model.add(Dense(1, activation='sigmoid'))

W0926 09:09:21.468464 4582794688 deprecation_wrapper.py:119] From /Users/grandia/anaconda3/envs/pysyft/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0926 09:09:21.544189 4582794688 deprecation_wrapper.py:119] From /Users/grandia/anaconda3/envs/pysyft/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0926 09:09:21.548192 4582794688 deprecation_wrapper.py:119] From /Users/grandia/anaconda3/envs/pysyft/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.



In [62]:
model.compile(loss='binary_crossentropy', optimizer='adam')

W0926 09:09:22.034910 4582794688 deprecation_wrapper.py:119] From /Users/grandia/anaconda3/envs/pysyft/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0926 09:09:22.090688 4582794688 deprecation_wrapper.py:119] From /Users/grandia/anaconda3/envs/pysyft/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3376: The name tf.log is deprecated. Please use tf.math.log instead.

W0926 09:09:22.110435 4582794688 deprecation.py:323] From /Users/grandia/anaconda3/envs/pysyft/lib/python3.7/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [63]:
model.fit(X_train, y_train,
          batch_size=128,
          epochs=3)

W0926 09:09:26.104203 4582794688 deprecation_wrapper.py:119] From /Users/grandia/anaconda3/envs/pysyft/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.



Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x13add9fd0>

In [64]:
y_pred = model.predict(X_test)

In [65]:
y_pred = y_pred >= 0.5

In [66]:
from sklearn.metrics import accuracy_score

In [67]:
accuracy_score(y_test, y_pred)

0.8427787934186471

In [68]:
model.save('lstm.h5')

In [70]:
import pickle

In [71]:
pickle.dump(vocab_to_int, open('vocab_to_int.pickle', 'wb'))

In [72]:
vocab_loaded = pickle.load(open('vocab_to_int.pickle', 'rb'))

In [75]:
review = "i love this dress sooooo much"

In [89]:
review.split()

['i', 'love', 'this', 'dress', 'sooooo', 'much']

In [77]:
# remove word not in vocab
new_review = [word for word in review.split() if word in vocab_to_int]

In [82]:
integered = [vocab_to_int[word] for word in new_review]

In [83]:
padded = pad_with_zeros(integered)

In [87]:
reshaped = np.array(padded).reshape(1,-1)

In [88]:
model.predict(reshaped)

array([[0.48872632]], dtype=float32)

In [90]:
from keras.models import load_model

In [91]:
model2 = load_model('lstm.h5')

In [93]:
res = model.predict(reshaped)

In [99]:
res[0][0]

0.48872632