In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from textblob import TextBlob

import text_processor as tp

In [2]:
%matplotlib inline

In [3]:
# Load the raw reviews CSV into a DataFrame.
data_set = pd.read_csv(
    filepath_or_buffer='datasets/womens-ecommerce-clothing-reviews/reviews.csv'
)

In [4]:
# Preview the first five rows of the raw data.
data_set.head(n=5)

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [5]:
# Keep only the review text and the recommendation flag.
reviews_df = data_set.loc[:, ['Review Text', 'Recommended IND']]

In [6]:
# Preview the two selected columns.
reviews_df.head(n=5)

Unnamed: 0,Review Text,Recommended IND
0,Absolutely wonderful - silky and sexy and comf...,1
1,Love this dress! it's sooo pretty. i happene...,1
2,I had such high hopes for this dress and reall...,0
3,"I love, love, love this jumpsuit. it's fun, fl...",1
4,This shirt is very flattering to all due to th...,1


In [7]:
# Number of missing values in each column.
reviews_df.isnull().sum(axis=0)

Review Text        845
Recommended IND      0
dtype: int64

In [8]:
# Non-null entries per column before dropping missing rows.
reviews_df.count(axis=0)

Review Text        22641
Recommended IND    23486
dtype: int64

In [9]:
# Discard every row that has a missing value in any column.
reviews_df = reviews_df.dropna(axis=0, how='any')

In [10]:
# Non-null entries per column after dropping missing rows.
reviews_df.count(axis=0)

Review Text        22641
Recommended IND    22641
dtype: int64

In [11]:
# How many rows repeat an earlier (text, flag) pair.
reviews_df.duplicated(keep='first').sum()

7

In [12]:
# Keep the first occurrence of each row and drop the repeats.
reviews_df = reviews_df.drop_duplicates(keep='first')

In [13]:
# Non-null entries per column after de-duplication.
reviews_df.count(axis=0)

Review Text        22634
Recommended IND    22634
dtype: int64

In [14]:
# Shorten the label column name to 'Recommended'.
reviews_df = reviews_df.rename({'Recommended IND': 'Recommended'}, axis='columns')

In [15]:
# Distinct values present in the label column.
reviews_df.loc[:, 'Recommended'].unique()

array([1, 0])

In [16]:
# Share of each label value, to check class balance.
reviews_df['Recommended'].value_counts(normalize=True, sort=True, ascending=False)

1    0.818857
0    0.181143
Name: Recommended, dtype: float64

In [17]:
def find_polarity(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Parameters
    ----------
    text : str
        Review text to score.

    Returns
    -------
    float
        ``TextBlob(text).sentiment.polarity``, or NaN when TextBlob raises
        ``TypeError`` for the given input.
    """
    try:
        return TextBlob(text).sentiment.polarity
    except TypeError:
        # The previous version used `pass` here and then returned a name that
        # was never bound, turning the handled case into UnboundLocalError.
        # NaN keeps the resulting column numeric and marks the value missing.
        return float("nan")

In [18]:
# Score every review with find_polarity and attach the result as 'Polarity'.
# `assign` builds a new frame instead of writing a column in place:
# `reviews_df` here is the product of dropna()/drop_duplicates(), and in-place
# column assignment on such a derived frame can emit SettingWithCopyWarning
# (pandas-version dependent). Rebinding also keeps the cell safe to re-run.
reviews_df = reviews_df.assign(
    Polarity=reviews_df['Review Text'].apply(find_polarity)
)

In [19]:
# Preview the frame with the new Polarity column.
reviews_df.head(n=5)

Unnamed: 0,Review Text,Recommended,Polarity
0,Absolutely wonderful - silky and sexy and comf...,1,0.633333
1,Love this dress! it's sooo pretty. i happene...,1,0.339583
2,I had such high hopes for this dress and reall...,0,0.073675
3,"I love, love, love this jumpsuit. it's fun, fl...",1,0.55
4,This shirt is very flattering to all due to th...,1,0.512891


In [20]:
# Remove the Polarity column again; later cells work from the text only.
reviews_df = reviews_df.drop(columns=['Polarity'])

In [22]:
# Work on an independent deep copy so reviews_df keeps the untouched text.
reviews_df_copy = reviews_df.copy(deep=True)

In [23]:
# Preview the working copy.
reviews_df_copy.head(n=5)

Unnamed: 0,Review Text,Recommended
0,Absolutely wonderful - silky and sexy and comf...,1
1,Love this dress! it's sooo pretty. i happene...,1
2,I had such high hopes for this dress and reall...,0
3,"I love, love, love this jumpsuit. it's fun, fl...",1
4,This shirt is very flattering to all due to th...,1


In [24]:
# Start the cleaned text as a lower-cased version of the original review.
reviews_df_copy['cleaned_text'] = reviews_df_copy.loc[:, 'Review Text'].str.lower()

In [25]:
# Compare original and lower-cased text side by side.
reviews_df_copy.head(n=5)

Unnamed: 0,Review Text,Recommended,cleaned_text
0,Absolutely wonderful - silky and sexy and comf...,1,absolutely wonderful - silky and sexy and comf...
1,Love this dress! it's sooo pretty. i happene...,1,love this dress! it's sooo pretty. i happene...
2,I had such high hopes for this dress and reall...,0,i had such high hopes for this dress and reall...
3,"I love, love, love this jumpsuit. it's fun, fl...",1,"i love, love, love this jumpsuit. it's fun, fl..."
4,This shirt is very flattering to all due to th...,1,this shirt is very flattering to all due to th...


In [27]:
# Run tp.expand_contractions over every cleaned_text value.
reviews_df_copy['cleaned_text'] = reviews_df_copy['cleaned_text'].map(tp.expand_contractions)

In [28]:
# Run tp.remove_special_chars (with remove_digits=True) over every value.
reviews_df_copy['cleaned_text'] = reviews_df_copy['cleaned_text'].apply(
    lambda review: tp.remove_special_chars(review, remove_digits=True)
)

In [30]:
# Run tp.lemmatize_text over every cleaned_text value.
reviews_df_copy['cleaned_text'] = reviews_df_copy['cleaned_text'].map(tp.lemmatize_text)

In [31]:
# Inspect the first four rows after lemmatization.
reviews_df_copy.head(n=4)

Unnamed: 0,Review Text,Recommended,cleaned_text
0,Absolutely wonderful - silky and sexy and comf...,1,absolutely wonderful silky and sexy and comf...
1,Love this dress! it's sooo pretty. i happene...,1,love this dress it be sooo pretty i happen to ...
2,I had such high hopes for this dress and reall...,0,i have such high hope for this dress and reall...
3,"I love, love, love this jumpsuit. it's fun, fl...",1,i love love love this jumpsuit it be fun flirt...


In [32]:
# Run tp.remove_stopwords over every cleaned_text value.
reviews_df_copy['cleaned_text'] = reviews_df_copy['cleaned_text'].map(tp.remove_stopwords)

In [33]:
# Preview the fully cleaned text next to the original review.
reviews_df_copy.head(n=5)

Unnamed: 0,Review Text,Recommended,cleaned_text
0,Absolutely wonderful - silky and sexy and comf...,1,absolutely wonderful silky sexy comfortable
1,Love this dress! it's sooo pretty. i happene...,1,love dress sooo pretty happen find store glad ...
2,I had such high hopes for this dress and reall...,0,high hope dress really want work initially ord...
3,"I love, love, love this jumpsuit. it's fun, fl...",1,love love love jumpsuit fun flirty fabulous ev...
4,This shirt is very flattering to all due to th...,1,shirt flattering due adjustable front tie perf...


In [35]:
# Original text of the review labelled 1, fetched with a single .loc[row, col]
# lookup instead of chained indexing (which first builds a whole-row Series).
reviews_df.loc[1, 'Review Text']

'Love this dress!  it\'s sooo pretty.  i happened to find it in a store, and i\'m glad i did bc i never would have ordered it online bc it\'s petite.  i bought a petite and am 5\'8".  i love the length on me- hits just a little below the knee.  would definitely be a true midi on someone who is truly petite.'

In [36]:
# Cleaned text of the review labelled 1, fetched with a single .loc[row, col]
# lookup instead of chained indexing (which first builds a whole-row Series).
reviews_df_copy.loc[1, 'cleaned_text']

'love dress sooo pretty happen find store glad bc never would order online bc petite buy petite love length hit little knee would definitely true midi someone truly petite'