In [1]:
import datetime

import pandas as pd
import spacy
import re
import string

from spacy.tokens import Token
from tqdm import tqdm

# from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from textblob import TextBlob
from textblob import Word

import html

In [2]:
df = pd.read_csv("..//data//Womens Clothing E-Commerce Reviews.csv")
# Removed index from dataframe
df=df.drop(columns=["Unnamed: 0"])

In [3]:
df['Review Text'].head(35).tail(5).tolist()

['Beautifully made pants and on trend with the flared crop. so much cuter in person. love these!',
 "I never would have given these pants a second look online, in person they are much cuter! the stripes are brighter and the fit more flattering. the crop has a cute flare which is right on trend. this brand has always run small for me, i am 5'8 about 140lbs and carry some chubbiness in the belly. i paired it with a collarless loose navy blazer",
 'These pants are even better in person. the only downside is that they need to be dry cleaned.',
 'I ordered this 3 months ago, and it finally came off back order. a huge disappointment. the fit wasn&#39;t so much the issue for me. the quality of the wool is subpar. someone else mentioned a &quot;felted wool&quot;...i guess, is that what you call it?  it does literally feel like felt! super thin, itchy, doesn&#39;t drape very well, and feels cheap (made in china). i got it on sale, but still not worth what i paid. definitely going back.',
 'This

In [74]:
df["Review Text"]=df["Review Text"].apply(str).apply(html.unescape)

In [94]:
df['Review Text'].head(35).tail(5).tolist()

['Beautifully made pants and on trend with the flared crop. so much cuter in person. love these!',
 "I never would have given these pants a second look online, in person they are much cuter! the stripes are brighter and the fit more flattering. the crop has a cute flare which is right on trend. this brand has always run small for me, i am 5'8 about 140lbs and carry some chubbiness in the belly. i paired it with a collarless loose navy blazer",
 'These pants are even better in person. the only downside is that they need to be dry cleaned.',
 'I ordered this 3 months ago, and it finally came off back order. a huge disappointment. the fit wasn\'t so much the issue for me. the quality of the wool is subpar. someone else mentioned a "felted wool"...i guess, is that what you call it?  it does literally feel like felt! super thin, itchy, doesn\'t drape very well, and feels cheap (made in china). i got it on sale, but still not worth what i paid. definitely going back.',
 'This is such a neat 

In [75]:
def renames(feature, values):
    Dict_cols={}
    for value in values:
        Dict_cols[value]=feature+value
    return Dict_cols

In [98]:
pd.isnull(df['Review Text']).value_counts()

False    23486
Name: Review Text, dtype: int64

In [99]:
pd.isnull(df['Title']).value_counts()

False    19676
True      3810
Name: Title, dtype: int64

In [78]:
column_name='Division Name'
dummies= pd.get_dummies(df[column_name])
Encoded_Data=pd.concat([df,dummies], axis=1,ignore_index=False)
Encoded_Data=Encoded_Data.rename(index=str, columns=renames(column_name+'_',dummies.columns.tolist()))

column_name='Department Name'
dummies= pd.get_dummies(Encoded_Data[column_name])
Encoded_Data=pd.concat([Encoded_Data,dummies], axis=1,ignore_index=False)
Encoded_Data=Encoded_Data.rename(index=str, columns=renames(column_name+'_',dummies.columns.tolist()))

column_name='Class Name'
dummies= pd.get_dummies(Encoded_Data[column_name])
Encoded_Data=pd.concat([Encoded_Data,dummies], axis=1,ignore_index=False)
Encoded_Data=Encoded_Data.rename(index=str, columns=renames(column_name+'_',dummies.columns.tolist()))

In [79]:
Encoded_Data.columns

Index(['Clothing ID', 'Age', 'Title', 'Review Text', 'Rating',
       'Recommended IND', 'Positive Feedback Count', 'Division Name',
       'Department Name', 'Class Name', 'Division Name_General',
       'Division Name_General Petite', 'Division Name_Initmates',
       'Department Name_Bottoms', 'Department Name_Dresses',
       'Department Name_Intimate', 'Department Name_Jackets',
       'Department Name_Tops', 'Department Name_Trend', 'Class Name_Blouses',
       'Class Name_Casual bottoms', 'Class Name_Chemises',
       'Class Name_Dresses', 'Class Name_Fine gauge', 'Class Name_Intimates',
       'Class Name_Jackets', 'Class Name_Jeans', 'Class Name_Knits',
       'Class Name_Layering', 'Class Name_Legwear', 'Class Name_Lounge',
       'Class Name_Outerwear', 'Class Name_Pants', 'Class Name_Shorts',
       'Class Name_Skirts', 'Class Name_Sleep', 'Class Name_Sweaters',
       'Class Name_Swim', 'Class Name_Trend'],
      dtype='object')

In [80]:
nlp = spacy.load("en_core_web_sm")

def remove_spell_errors(doc):
    bdoc = TextBlob(str(doc))
    ## Correcting the words
    return str(bdoc.correct())

def stop(doc):
    #return [token for token in doc if not token.is_punct and not token.text.isalnum() and not token.is_digit and not token.is_stop]
    #return [token for token in doc if token.text.isalnum() and not token.is_digit and not token.is_stop]
    return [token for token in doc if not token.is_digit and not token.is_stop and ( token.text.isalpha() or not token.text.isalnum())]

def lemmatize(doc):
    return [token.lemma_.lower() if token.lemma_ != "-PRON-" else token.text.lower() for token in doc]

def remove_line_breaks(doc):
    return [token.replace("\n", " ").replace("\r", " ") for token in doc]

nlp.add_pipe(stop)
nlp.add_pipe(lemmatize)
nlp.add_pipe(remove_line_breaks)

In [81]:
print(str(datetime.datetime.now())+" : Started preprocessing")
#docs = Encoded_Data["Review Text"].apply(remove_spell_errors).to_list()
docs = Encoded_Data["Review Text"].apply(str).to_list()
processed_docs = []

with tqdm(total=len(docs)) as bar:
    for doc in nlp.pipe(docs):
            line = " ".join(doc)
            ## Removing the punctuation
            line=line.translate(str.maketrans('','',string.punctuation))
            ## Removing numbers 
            line=" ".join(list(filter(lambda w : not w.isdigit(), line.split())))
            processed_docs.append(line)
            bar.update(1)

df["processed_Review_text"] = processed_docs
print(str(datetime.datetime.now())+" : Preprocessing completed")

2019-08-15 13:37:13.771218 : Started preprocessing


100%|██████████| 23486/23486 [04:12<00:00, 92.94it/s]  


2019-08-15 13:41:26.584252 : Preprocessing completed


In [82]:
TextBlob("mom").words[0].spellcheck()

[('mon', 0.5094339622641509),
 ('mob', 0.24528301886792453),
 ('mot', 0.0660377358490566),
 ('mop', 0.04716981132075472),
 ('tom', 0.03773584905660377),
 ('com', 0.03773584905660377),
 ('nom', 0.018867924528301886),
 ('om', 0.009433962264150943),
 ('mo', 0.009433962264150943),
 ('mmm', 0.009433962264150943),
 ('mm', 0.009433962264150943)]

In [83]:
#df.to_csv("..//data//(Spelling Correction)Womens Clothing E-Commerce Reviews PreProcessed.csv", index=False)

In [100]:
df["Review Text"].tolist()[630:640]

["This suit fits like a glove! i've had five kids and am currently nursing my last baby so i was looking for a suit to wear to the beach this summer when i'm just not feeling a bikini. this is it! it's comfortable and not binding easy to nursing and swim in. overall worth the extra splurge. i'm tall too so it was plenty long on my torso.",
 'Cute top! the bright red crochet shoulders and back add the perfect pop of color to the light blue pinstriped front. it\'s a little loose and boxy around the bust area but overall i think it\'s a flattering shape. i got the regular s and don\'t find it too short. for reference i\'m 5\'2", 134 lb, 34b. i often wear petite sizes but due to other reviews mentioning that the top is short i ordered the regular size and am happy with the length and fit.',
 'This top looked super cute online but when it came it was super short and boxy. looked like a tent on me. this one went back the same day it came!',
 "Comfy easy dress. i bought the black version. fab

In [103]:
df["processed_Review_text"].tolist()[630:940]

['suit fit like glove kid currently nurse baby look suit wear beach summer feel bikini comfortable bind easy nursing swim overall worth extra splurge tall plenty long torso',
 'cute bright red crochet shoulder add perfect pop color light blue pinstriped little loose boxy bust area overall think flattering shape get regular s find short reference lb wear petite size review mention short order regular size happy length fit',
 'look super cute online come super short boxy look like tent go day come',
 'comfy easy dress buy black version fabric soft wear house hesitate run need buy medium roomy length tall small fit short want',
 'uncertain size get look like asser fit right leg hit weird point calf cute short quote husband nope feel say feel great head retailer near sigh quest continue',
 'love dress day night time detail cute work racerback bra run big normally small dress arm hole big size small generally big like length weight fabric purchase black color',
 'reservation give petite mid

In [115]:
df[df["processed_Review_text"].str.contains('3season')]["Review Text"].tolist()

["A 3-season top that compliments jeans, dress pants, shorts & skirts. light-weight and double layered (so should you go without a bra the girls have protection). emphasizes nice shoulders, length just covers any belly flaws. i'm a 6-8 and could have gone small or medium here.",
 "I have only worn this 3 times with no signs of pilling, yet. it is good for cool spring days/evenings. for me, it'll be the cooler 3-seasons sweater. i wear a size large and it fits properly. i have worn lightweight long sleeve knit tees under this with ease in the sleeves. i think a slim sleeved shirt/blouse would work equally well. while i like the idea of the pockets, i think they'll stretch out easily beyond a tissue or quick hand warming. i removed 1 star due to the high price and i receive",
 "This is a 3-season item that can function as a cardigan or a jacket and will look equally good with jeans or with work slacks. the seaming gives it a bit of structure. the asymmetrical zipper gives it a little twi

In [118]:
df.to_csv("..//data//Womens Clothing E-Commerce Reviews PreProcessed.csv", index=False)

In [86]:
vectorizer = TfidfVectorizer()
review_vectors = vectorizer.fit_transform(df["processed_Review_text"])
features_df = pd.DataFrame(review_vectors.toarray(), columns = vectorizer.get_feature_names())

In [102]:
vectorizer.get_feature_names()

['00p0p',
 '00p0r',
 '0p',
 '0regular',
 '0xs',
 '100s',
 '10m',
 '10m34c',
 '10medium',
 '10the',
 '10yr',
 '112lbs',
 '115lbs',
 '117lb',
 '11year',
 '120it',
 '120lb',
 '120lbs',
 '125as',
 '125lb',
 '12l',
 '12medium',
 '130lbs34c56',
 '135athletic',
 '136s',
 '13rd',
 '140ish',
 '1416xl',
 '145lbs',
 '14large',
 '14xl33',
 '1636h',
 '16xl',
 '16xl36h',
 '18month',
 '1perfect',
 '1season',
 '1the',
 '20off',
 '20s30s',
 '20something',
 '234c',
 '23rds',
 '24p00p',
 '26inch',
 '26petite',
 '28dd30d32c',
 '28dddd30ddd',
 '29size',
 '2inch',
 '2small',
 '2tone',
 '2xs',
 '2year',
 '305it',
 '30d32c',
 '30s40',
 '30someth',
 '31size',
 '3234a',
 '3234ddd',
 '32a',
 '32b',
 '32b24',
 '32b26',
 '32c24',
 '32c2633',
 '32c27',
 '32d',
 '32d109lbs',
 '32d26',
 '32d31',
 '32dâ',
 '332537ish',
 '33b26',
 '33year',
 '3435a28',
 '34a',
 '34a27',
 '34a28',
 '34a2829',
 '34b',
 '34b27',
 '34b2837',
 '34b29',
 '34b32c',
 '34b34c',
 '34c2636',
 '34c28',
 '34c2838',
 '34c6',
 '34d',
 '34d27',
 '34dd

In [87]:
Prep_data=pd.concat([Encoded_Data,features_df], axis=1,ignore_index=False)
Prep_data=Prep_data.drop(columns=['Clothing ID','Title','Division Name', 'Department Name', 'Class Name'])
#Prep_data=Prep_data.drop(columns=Prep_data.columns.tolist()[33:176]) 

In [88]:
Prep_data.columns.tolist()[1:176]

['Review Text',
 'Rating',
 'Recommended IND',
 'Positive Feedback Count',
 'Division Name_General',
 'Division Name_General Petite',
 'Division Name_Initmates',
 'Department Name_Bottoms',
 'Department Name_Dresses',
 'Department Name_Intimate',
 'Department Name_Jackets',
 'Department Name_Tops',
 'Department Name_Trend',
 'Class Name_Blouses',
 'Class Name_Casual bottoms',
 'Class Name_Chemises',
 'Class Name_Dresses',
 'Class Name_Fine gauge',
 'Class Name_Intimates',
 'Class Name_Jackets',
 'Class Name_Jeans',
 'Class Name_Knits',
 'Class Name_Layering',
 'Class Name_Legwear',
 'Class Name_Lounge',
 'Class Name_Outerwear',
 'Class Name_Pants',
 'Class Name_Shorts',
 'Class Name_Skirts',
 'Class Name_Sleep',
 'Class Name_Sweaters',
 'Class Name_Swim',
 'Class Name_Trend',
 '00p0p',
 '00p0r',
 '0p',
 '0regular',
 '0xs',
 '100s',
 '10m',
 '10m34c',
 '10medium',
 '10the',
 '10yr',
 '112lbs',
 '115lbs',
 '117lb',
 '11year',
 '120it',
 '120lb',
 '120lbs',
 '125as',
 '125lb',
 '12l',
 '1

In [None]:
Prep_data.to_csv("..//data//Womens Clothing E-Commerce Reviews PreProcessed.csv", index=False)