In [1]:
# Turn off warnings 
import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')



In [298]:
## Cleaning supplies! 

from nltk.corpus import stopwords
import string
from nltk.util import ngrams
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

import spacy

from textblob import TextBlob

In [3]:
## loading data
df = pd.read_csv('data/reviews.csv', index_col='Unnamed: 0')

# Remove nan text
df.dropna(subset=['Review Text'], inplace= True)

In [4]:
df.shape

(22641, 10)

In [105]:
df.corr()

Unnamed: 0,Clothing ID,Age,Rating,Recommended IND,Positive Feedback Count
Clothing ID,1.0,0.017688,-0.018454,-0.014874,0.044902
Age,0.017688,1.0,0.029962,0.034208,0.04085
Rating,-0.018454,0.029962,1.0,0.792568,-0.060984
Recommended IND,-0.014874,0.034208,0.792568,1.0,-0.065923
Positive Feedback Count,0.044902,0.04085,-0.060984,-0.065923,1.0


In [104]:
df['Review Text'].iloc[150]

'This top is so much better in person. i do not agree with some of the other reviews about the fabric being scratchy. it is not and i have sensitive skin. i love this top and have got lots of compliments.'

In [243]:
# Corpus
corpus = df[['Review Text']].values.tolist()
corpus = [''.join(element) for element in corpus]

In [244]:
corpus[1]

'Love this dress!  it\'s sooo pretty.  i happened to find it in a store, and i\'m glad i did bc i never would have ordered it online bc it\'s petite.  i bought a petite and am 5\'8".  i love the length on me- hits just a little below the knee.  would definitely be a true midi on someone who is truly petite.'

In [245]:
#corpus is a list of strings
type(corpus)

list

In [204]:
my_stopwords = ['i','me','my','myself','we','our','ours','ourselves','you',"you're","you've","you'll","you'd",'your',\
                'yours','yourself','yourselves','he','him','his','himself','she',"she's",'her','hers','herself',\
                'it',"it's",'its','itself','they','them','their','theirs','themselves','what','which','who','whom',\
                'this','that',"that'll",'these','those','am','is','are','was','were','be','been','being','have',\
                'has','had','having','do','does','did','doing' 'a','an','the','and','but','if','or','because','as',\
                'until','while','of','at','by','for','with','about','against','between','into','during','before',\
                'after','to','from','in','out','on','off','over','under','again','further','then','once',\
                'here','there','when','where','why','how','all','any','both','each','few','more','most','other',\
                'some','such','nor','only','own','same','so','than','too','very','s','t','can','will','just','don',\
                "don't",'should',"should've",'now','d','ll','m','o','re','ve','y','ain','aren',"aren't",'couldn',\
                "couldn't",'could','a',"'s", "'m",\
                'didn',"didn't",'doesn',"doesn't",'hadn',"hadn't",'hasn',"hasn't",'haven', "haven't",'isn',"isn't",\
                'ma','mightn',"mightn't",'mustn',"mustn't",'needn',"needn't",'shan',"shan't",'shouldn',"shouldn't",\
                'wasn',"wasn't",'weren',"weren't",'won',"won't",'wouldn',"wouldn't", 'would']



In [109]:
pt = string.punctuation + "''"
pt

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\'\''

In [34]:
contractions = {
"ain't": "am not / are not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"isn't": "is not",
"mayn't": "may not",
"mightn't": "might not",
"mightn't've": "might not have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"wasn't": "was not",
"weren't": "were not",
"won't": "will not",
"won't've": "will not have",
"wouldn't": "would not",
"wouldn't've": "would not have",
}

In [35]:
text="What's the best way to ensure this?"
for word in text.split():
    if word.lower() in contractions:
        text = text.replace(word, contractions[word.lower()])
print(text)

What's the best way to ensure this?


In [248]:
def clean_nots(docs):
    not_docs=[]
    for doc in docs:
        for word in doc.split():
            if word in contractions:
                doc = doc.replace(word,contractions[word])
        not_docs.append(doc)
    return not_docs


In [249]:
corpus[7]

"I ordered this in carbon for store pick up, and had a ton of stuff (as always) to try on and used this top to pair (skirts and pants). everything went with it. the color is really nice charcoal with shimmer, and went well with pencil skirts, flare pants, etc. my only compaint is it is a bit big, sleeves are long and it doesn't go in petite. also a bit loose for me, but no xxs... so i kept it and wil ldecide later since the light color is already sold out in hte smallest size..."

In [252]:
corpus_w_nots = clean_nots(corpus)

In [256]:
corpus_w_nots[0:2]

['Absolutely wonderful - silky and sexy and comfortable',
 'Love this dress!  it\'s sooo pretty.  i happened to find it in a store, and i\'m glad i did bc i never would have ordered it online bc it\'s petite.  i bought a petite and am 5\'8".  i love the length on me- hits just a little below the knee.  would definitely be a true midi on someone who is truly petite.']

In [177]:
def clean_tokens(docs):
    ''' returns a list of lists: the clean tokens of docs'''
    docs_tokens=[]
    for doc in docs:
        clean_tokens = []
        tokens = word_tokenize(doc)
        for token in tokens:
            if token.lower() not in my_stopwords and token not in pt:
                clean_tokens.append(token.lower())
        docs_tokens.append(clean_tokens)    
    return docs_tokens

In [257]:
c_tokens = clean_docs(corpus_w_nots)


In [260]:
#c_tokens is a list of token lists for each document
c_tokens[0]

['absolutely', 'wonderful', 'silky', 'sexy', 'comfortable']

In [264]:
allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])(" ".join(c_tokens[0]))
#" ".join(c_tokens[0])
[token.lemma_ for token in nlp if token.pos_ in allowed_postags]

['absolutely', 'wonderful', 'silky', 'sexy', 'comfortable']

In [282]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])(" ".join(c_tokens[0]))
type(nlp)

spacy.tokens.doc.Doc

In [289]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    tokens_out = []
    for lst in texts:
        doc = nlp(" ".join(lst)) 
        tokens_out.append([token.lemma_ for token in doc if token.lemma != 'not' and token.pos_ in allowed_postags])
    return tokens_out

In [292]:
#lemmatizing
tokens_lemmatized= lemmatization(c_tokens)

In [297]:
ngrams(c_tokens[0:1], 2)

<zip at 0x7f883fc6d080>

In [None]:
ngram_object = TextBlob(corpus)