# Text Preprocessing

In [3]:
import datetime

import pandas as pd
import spacy
import re
import string

from spacy.tokens import Token
from tqdm import tqdm

from textblob import TextBlob
from textblob import Word

import html

In [4]:
# Load the raw reviews and discard the "Unnamed: 0" column, which only holds the saved index.
raw_path = "..//data//Womens Clothing E-Commerce Reviews.csv"
df = pd.read_csv(raw_path).drop(columns=["Unnamed: 0"])

## Sample reviews

In [5]:
# Last five of the first 35 reviews, shown as a plain list of raw strings.
df["Review Text"].iloc[:35].iloc[-5:].tolist()

['Beautifully made pants and on trend with the flared crop. so much cuter in person. love these!',
 "I never would have given these pants a second look online, in person they are much cuter! the stripes are brighter and the fit more flattering. the crop has a cute flare which is right on trend. this brand has always run small for me, i am 5'8 about 140lbs and carry some chubbiness in the belly. i paired it with a collarless loose navy blazer",
 'These pants are even better in person. the only downside is that they need to be dry cleaned.',
 'I ordered this 3 months ago, and it finally came off back order. a huge disappointment. the fit wasn&#39;t so much the issue for me. the quality of the wool is subpar. someone else mentioned a &quot;felted wool&quot;...i guess, is that what you call it?  it does literally feel like felt! super thin, itchy, doesn&#39;t drape very well, and feels cheap (made in china). i got it on sale, but still not worth what i paid. definitely going back.',
 'This

## Reformatting the HTML escape values

In [6]:
# Decode HTML entities (e.g. &#39; and &quot;) in every review.
# Missing reviews are filled with "" first: calling str() on NaN would otherwise
# produce the literal text "nan", which later steps would treat as a real word.
df["Review Text"] = df["Review Text"].fillna("").astype(str).map(html.unescape)

## Sample reviews (After Reformatting)

In [7]:
# Last five of the first 35 reviews again, now with the HTML entities decoded.
df["Review Text"].iloc[:35].iloc[-5:].tolist()

['Beautifully made pants and on trend with the flared crop. so much cuter in person. love these!',
 "I never would have given these pants a second look online, in person they are much cuter! the stripes are brighter and the fit more flattering. the crop has a cute flare which is right on trend. this brand has always run small for me, i am 5'8 about 140lbs and carry some chubbiness in the belly. i paired it with a collarless loose navy blazer",
 'These pants are even better in person. the only downside is that they need to be dry cleaned.',
 'I ordered this 3 months ago, and it finally came off back order. a huge disappointment. the fit wasn\'t so much the issue for me. the quality of the wool is subpar. someone else mentioned a "felted wool"...i guess, is that what you call it?  it does literally feel like felt! super thin, itchy, doesn\'t drape very well, and feels cheap (made in china). i got it on sale, but still not worth what i paid. definitely going back.',
 'This is such a neat 

## Text preprocessing

- Remove digits, stop words, punctuation, and alphanumeric words (tokens that mix letters and digits)
- Lemmatize the remaining words
- Remove line breaks from the reviews

In [8]:
# The notebook only reads token text, stop-word/digit flags and lemmas, so the
# dependency parser and named-entity recogniser are disabled to avoid running
# them over every review.
# NOTE(review): assumes lemmas do not depend on the parser/NER in the installed
# spaCy version (true for the 2.x models this notebook targets) — confirm.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def remove_spell_errors(doc):
    """Return ``doc`` as a string after running TextBlob's spelling correction on it."""
    corrected = TextBlob(str(doc)).correct()
    return str(corrected)

def stop(doc):
    """Filter the tokens of ``doc`` and return the survivors as a list.

    A token is dropped when it is a digit, when it is a stop word, or when its
    text mixes letters and digits (alphanumeric but not purely alphabetic).
    Purely alphabetic tokens and tokens containing any non-alphanumeric
    character (e.g. punctuation) are kept.
    """
    kept = []
    for token in doc:
        if token.is_digit or token.is_stop:
            continue
        text = token.text
        if text.isalpha() or not text.isalnum():
            kept.append(token)
    return kept

def lemmatize(doc):
    """Return the lower-cased lemma of every token in ``doc`` as a list of strings.

    Tokens whose lemma is the placeholder "-PRON-" contribute their own
    lower-cased text instead of the placeholder.
    """
    lemmas = []
    for token in doc:
        base = token.text if token.lemma_ == "-PRON-" else token.lemma_
        lemmas.append(base.lower())
    return lemmas

def remove_line_breaks(doc):
    """Return the strings of ``doc`` with each newline and carriage return replaced by a space."""
    cleaned = []
    for token in doc:
        for line_break in ("\n", "\r"):
            token = token.replace(line_break, " ")
        cleaned.append(token)
    return cleaned

# Register the custom steps in the order they must run: `stop` filters the
# tokens, `lemmatize` turns the surviving tokens into lower-cased strings, and
# `remove_line_breaks` cleans those strings.
# NOTE(review): passing bare functions to add_pipe looks like the spaCy 2.x API — confirm the installed version.
for _component in (stop, lemmatize, remove_line_breaks):
    nlp.add_pipe(_component)

In [9]:
# Build the punctuation-stripping table once rather than once per review.
punctuation_table = str.maketrans('', '', string.punctuation)

print(f"{datetime.datetime.now()} : Started preprocessing")
# Spell correction (remove_spell_errors) is deliberately not applied here.
docs = df["Review Text"].apply(str).to_list()
processed_docs = []

# Each item yielded by the pipeline is the list of strings produced by the
# custom components; tqdm wraps the iterator so the bar advances by itself.
for tokens in tqdm(nlp.pipe(docs), total=len(docs)):
    # Join the tokens and strip ASCII punctuation from the joined text.
    line = " ".join(tokens).translate(punctuation_table)
    # Drop words that are purely numeric once the punctuation is gone (e.g. "5'8" -> "58").
    words = [word for word in line.split() if not word.isdigit()]
    processed_docs.append(" ".join(words))

df["processed_Review_text"] = processed_docs
print(f"{datetime.datetime.now()} : Preprocessing completed")

2019-09-06 11:11:21.021307 : Started preprocessing


100%|████████████████████████████████████| 23486/23486 [05:14<00:00, 74.77it/s]


2019-09-06 11:16:35.224067 : Preprocessing completed


## Experimented with Spell correction 

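We tried to correct misspelled words automatically. However, most reviews contain casual words that the spell checker "corrects" wrongly (for example, its top suggestion for "mom" below is "mon"), so we did not apply spell correction in the final preprocessing.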

In [10]:
# Candidate corrections, with confidence scores, that TextBlob proposes for an ordinary word.
Word("mom").spellcheck()

[('mon', 0.5094339622641509),
 ('mob', 0.24528301886792453),
 ('mot', 0.0660377358490566),
 ('mop', 0.04716981132075472),
 ('tom', 0.03773584905660377),
 ('com', 0.03773584905660377),
 ('nom', 0.018867924528301886),
 ('om', 0.009433962264150943),
 ('mo', 0.009433962264150943),
 ('mmm', 0.009433962264150943),
 ('mm', 0.009433962264150943)]

## Output after preprocessing

In [16]:
# Original reviews at positions 630-634, sliced before converting to a list.
df["Review Text"].iloc[630:635].tolist()

["This suit fits like a glove! i've had five kids and am currently nursing my last baby so i was looking for a suit to wear to the beach this summer when i'm just not feeling a bikini. this is it! it's comfortable and not binding easy to nursing and swim in. overall worth the extra splurge. i'm tall too so it was plenty long on my torso.",
 'Cute top! the bright red crochet shoulders and back add the perfect pop of color to the light blue pinstriped front. it\'s a little loose and boxy around the bust area but overall i think it\'s a flattering shape. i got the regular s and don\'t find it too short. for reference i\'m 5\'2", 134 lb, 34b. i often wear petite sizes but due to other reviews mentioning that the top is short i ordered the regular size and am happy with the length and fit.',
 'This top looked super cute online but when it came it was super short and boxy. looked like a tent on me. this one went back the same day it came!',
 "Comfy easy dress. i bought the black version. fab

In [15]:
# Processed text for the reviews at positions 630-634, sliced before converting to a list.
df["processed_Review_text"].iloc[630:635].tolist()

['suit fit like glove kid currently nurse baby look suit wear beach summer feel bikini comfortable bind easy nursing swim overall worth extra splurge tall plenty long torso',
 'cute bright red crochet shoulder add perfect pop color light blue pinstriped little loose boxy bust area overall think flattering shape get regular s find short reference lb wear petite size review mention short order regular size happy length fit',
 'look super cute online come super short boxy look like tent go day come',
 'comfy easy dress buy black version fabric soft wear house hesitate run need buy medium roomy length tall small fit short want',
 'uncertain size get look like asser fit right leg hit weird point calf cute short quote husband nope feel say feel great head retailer near sigh quest continue']

In [13]:
# Write the frame, including the processed_Review_text column, to disk without the row index.
output_path = "..//data//Womens Clothing E-Commerce Reviews PreProcessed.csv"
df.to_csv(output_path, index=False)