In [1]:
import pandas as pd
import spacy
import re

from spacy.tokens import Token
from tqdm import tqdm

# from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the raw reviews and discard the "Unnamed: 0" column, which is only the
# row index that was written out with the CSV.
df = pd.read_csv("..//data//Womens Clothing E-Commerce Reviews.csv").drop(
    columns=["Unnamed: 0"]
)

In [3]:
def renames(feature, values):
    """Build a rename mapping that prefixes every entry of ``values``.

    Parameters
    ----------
    feature : str
        Prefix to put in front of each value.
    values : iterable of str
        Names to be prefixed.

    Returns
    -------
    dict
        ``{value: feature + value}`` for each value, in iteration order.
    """
    return {name: feature + name for name in values}

In [16]:
# How many reviews have no body text (True = missing)?
df['Review Text'].isna().value_counts()

False    22641
True       845
Name: Review Text, dtype: int64

In [17]:
# How many reviews have no title (True = missing)?
df['Title'].isna().value_counts()

False    19676
True      3810
Name: Title, dtype: int64

In [4]:
# One-hot encode the three categorical columns and append the indicator
# columns to the original frame. Each indicator column is named
# "<source column>_<category>" (e.g. "Division Name_General").
#
# The row index is deliberately left untouched: converting the index labels
# to strings (as `rename(index=str, ...)` would) makes this frame misalign
# with any integer-indexed frame in a later label-based `pd.concat(axis=1)`.
Encoded_Data = df
for column_name in ['Division Name', 'Department Name', 'Class Name']:
    # `prefix` names the indicator columns directly, so no follow-up rename
    # over the whole frame is needed.
    dummies = pd.get_dummies(df[column_name], prefix=column_name)
    Encoded_Data = pd.concat([Encoded_Data, dummies], axis=1, ignore_index=False)

In [5]:
# Inspect the column set: the original columns followed by the prefixed
# one-hot indicator columns.
Encoded_Data.columns

Index(['Clothing ID', 'Age', 'Title', 'Review Text', 'Rating',
       'Recommended IND', 'Positive Feedback Count', 'Division Name',
       'Department Name', 'Class Name', 'Division Name_General',
       'Division Name_General Petite', 'Division Name_Initmates',
       'Department Name_Bottoms', 'Department Name_Dresses',
       'Department Name_Intimate', 'Department Name_Jackets',
       'Department Name_Tops', 'Department Name_Trend', 'Class Name_Blouses',
       'Class Name_Casual bottoms', 'Class Name_Chemises',
       'Class Name_Dresses', 'Class Name_Fine gauge', 'Class Name_Intimates',
       'Class Name_Jackets', 'Class Name_Jeans', 'Class Name_Knits',
       'Class Name_Layering', 'Class Name_Legwear', 'Class Name_Lounge',
       'Class Name_Outerwear', 'Class Name_Pants', 'Class Name_Shorts',
       'Class Name_Skirts', 'Class Name_Sleep', 'Class Name_Sweaters',
       'Class Name_Swim', 'Class Name_Trend'],
      dtype='object')

In [6]:
# Load the "en_core_web_sm" spaCy model; the custom components defined below
# are added to this pipeline.
nlp = spacy.load("en_core_web_sm")

def stop(doc):
    """Filter ``doc`` down to its informative word tokens.

    A token is kept when its text is purely alphanumeric
    (``token.text.isalnum()``), it is not flagged ``is_digit`` and it is not
    flagged ``is_stop``.

    Returns a plain list of the surviving tokens, in their original order.
    """
    kept = []
    for token in doc:
        if not token.text.isalnum():
            continue
        if token.is_digit or token.is_stop:
            continue
        kept.append(token)
    return kept

def lemmatize(doc):
    """Replace every token in ``doc`` with its lower-cased lemma.

    Tokens whose lemma is the placeholder ``"-PRON-"`` fall back to their
    lower-cased surface text instead.

    Returns a list of strings, one per input token.
    """
    def normalise(token):
        if token.lemma_ == "-PRON-":
            return token.text.lower()
        return token.lemma_.lower()

    return list(map(normalise, doc))

def remove_line_breaks(doc):
    """Replace newline and carriage-return characters with spaces.

    ``doc`` is an iterable of strings; each ``"\\n"`` and ``"\\r"`` in every
    string is replaced by a single space. Returns a new list of strings.
    """
    cleaned = []
    for text in doc:
        without_newlines = text.replace("\n", " ")
        cleaned.append(without_newlines.replace("\r", " "))
    return cleaned

# Register the custom components on the pipeline. Order matters: `stop`
# yields a list of tokens, `lemmatize` turns those tokens into strings, and
# `remove_line_breaks` operates on those strings.
for component in (stop, lemmatize, remove_line_breaks):
    nlp.add_pipe(component)

In [7]:
# Missing reviews are NaN. `str(NaN)` would turn each of them into the literal
# text "nan", which then survives tokenisation and shows up as a bogus word in
# the vocabulary. Treat missing reviews as empty text instead.
docs = Encoded_Data["Review Text"].fillna("").astype(str).tolist()

# After the custom pipeline components, every item yielded by `nlp.pipe` is a
# list of cleaned lemma strings; join each back into one space-separated line.
processed_docs = [
    " ".join(doc) for doc in tqdm(nlp.pipe(docs), total=len(docs))
]

df["processed_Review_text"] = processed_docs

100%|██████████| 23486/23486 [04:40<00:00, 83.63it/s]  


In [8]:
# Peek at ten raw reviews (rows 30-39) for comparison with the processed text.
df["Review Text"].iloc[30:40].tolist()

['Beautifully made pants and on trend with the flared crop. so much cuter in person. love these!',
 "I never would have given these pants a second look online, in person they are much cuter! the stripes are brighter and the fit more flattering. the crop has a cute flare which is right on trend. this brand has always run small for me, i am 5'8 about 140lbs and carry some chubbiness in the belly. i paired it with a collarless loose navy blazer",
 'These pants are even better in person. the only downside is that they need to be dry cleaned.',
 'I ordered this 3 months ago, and it finally came off back order. a huge disappointment. the fit wasn&#39;t so much the issue for me. the quality of the wool is subpar. someone else mentioned a &quot;felted wool&quot;...i guess, is that what you call it?  it does literally feel like felt! super thin, itchy, doesn&#39;t drape very well, and feels cheap (made in china). i got it on sale, but still not worth what i paid. definitely going back.',
 'This

In [9]:
# The same ten rows (30-39) after stop-word removal and lemmatisation.
df["processed_Review_text"].iloc[30:40].tolist()

['beautifully pant trend flared crop cuter person love',
 'give pant second look online person cut stripe bright fit flattering crop cute flare right trend brand run small 140lbs carry chubbiness belly pair collarless loose navy blazer',
 'pant well person downside need dry clean',
 'order month ago finally come order huge disappointment fit issue quality wool subpar mention guess literally feel like feel super thin itchy drape feel cheap china get sale worth pay definitely go',
 'neat dress color great fabric super soft tall long length add bonus definitely need underneath gap go pair funky tank necklace boot super cute',
 'give second look try store whim love love',
 'comfortable skirt span season easily exciting design good work skirt pair top',
 'order small size medium mom size gorgeous beautifully drape weight warmth need houston fall winter look polished snap unsnapped age appropriate mom look amazing skinny jean legging order gray true photo',
 'super cute comfy pull sizing acc

In [10]:
# Fit TF-IDF on the cleaned review text: one row per review, one column per
# vocabulary term.
vectorizer = TfidfVectorizer()
review_vectors = vectorizer.fit_transform(df["processed_Review_text"])

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of `get_feature_names_out()`. Use the new method when it exists
# and fall back to the old one on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# NOTE: `.toarray()` materialises the full dense (reviews x vocabulary) matrix.
features_df = pd.DataFrame(review_vectors.toarray(), columns=feature_names)

In [21]:
# Join the encoded metadata and the TF-IDF features row by row.
#
# `pd.concat(axis=1)` aligns on index *labels*. If the two frames' indexes do
# not hold identical labels (for example string labels on one side and integer
# labels on the other), the result is an outer join: twice as many rows, each
# half padded with NaN. Both frames have one row per review in the same order,
# so reset them to a fresh positional index before concatenating.
assert len(Encoded_Data) == len(features_df), "row counts must match to join by position"
Prep_data = pd.concat(
    [Encoded_Data.reset_index(drop=True), features_df.reset_index(drop=True)],
    axis=1,
    ignore_index=False,
)

# Drop the identifier, the free-text columns and the raw categorical columns;
# their information is carried by the one-hot and TF-IDF columns.
Prep_data = Prep_data.drop(
    columns=['Clothing ID', 'Title', 'Review Text', 'Division Name', 'Department Name', 'Class Name']
)

In [32]:
# List the column names at positions 33-317.
Prep_data.columns[33:318].tolist()

['00p',
 '03dd',
 '0dd',
 '0p',
 '0petite',
 '0r',
 '0verall',
 '0xs',
 '10',
 '100lbs',
 '102lbs',
 '103lbs',
 '104lbs',
 '105lbs',
 '106lbs',
 '107lbs',
 '107pound',
 '108lbs',
 '109lbs',
 '10l',
 '10lbs',
 '10mths',
 '10p',
 '10th',
 '10x',
 '110lbs',
 '111lbs',
 '112lbs',
 '112llbs',
 '113lbs',
 '114lbs',
 '115ish',
 '115lbs',
 '115llb',
 '116bs',
 '116ibs',
 '116lbs',
 '117bl',
 '117lbs',
 '118lbs',
 '11and',
 '11inches',
 '120lbs',
 '122lbs',
 '123lbs',
 '123lbssize',
 '124lbs',
 '125b',
 '125ibs',
 '125ish',
 '125lbs',
 '126lbs',
 '127lbs',
 '128b',
 '128lbs',
 '129lbs',
 '12p',
 '12th',
 '130b',
 '130ibs',
 '130l',
 '130lbs',
 '130pounds',
 '132lbs',
 '133lbs',
 '134b',
 '134lbs',
 '135lbs',
 '136lbs',
 '137lbs',
 '138lbs',
 '13th',
 '14',
 '140b',
 '140ish',
 '140lbs',
 '142lbs',
 '144lbs',
 '145lbs',
 '146lbs',
 '148lbs',
 '149lbs',
 '14p',
 '150lbs',
 '154lbs',
 '155lbs',
 '16',
 '160lbs',
 '165lbs',
 '170lbs',
 '172lbs',
 '175lbs',
 '180lbs',
 '183lbs',
 '18th',
 '190lbs',
