In [2]:
import numpy as np
import pandas as pd
import re
import nltk
import spacy
import string
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
pd.options.mode.chained_assignment = None

In [3]:
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    text = str(text)
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

In [4]:
FILENAME = "intimates_sentiment_traits.csv"
COLNAME ="Review Text"
NEWCOLNAME = COLNAME + "_"

In [5]:
intimates = pd.read_csv("../output/" + FILENAME)

In [6]:
intimates.head()

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,sentiment_overall,emotional_traits,behavioral_traits,concepts
0,767,33,,Absolutely wonderful silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates,100.0,,,"['silky', 'fantastic', 'sexy', 'comfortable']"
1,767,44,Runs big,Bought the black xs to go under the larkspur m...,5,1,0,Initmates,Intimate,Intimates,-0.8,,,"['dress', 'midi', 'skirt', 'larkspur', 'statis..."
2,368,33,,I am pregnant and i thought this would be a gr...,2,0,3,Initmates,Intimate,Intimates,-11.69,,Sedentariness,"['keep', 'store', 'bra', 'soft', 'sleep', 'sha..."
3,368,52,Soft and comfortable,I ve been looking for bralettes that provide s...,4,1,1,Initmates,Intimate,Intimates,-25.6,,,"['bra', 'casual', 'night', 'downside', 'itchy'..."
4,368,36,Itchy tags,3 tags sewn in 2 small about 1 long and 1...,1,0,0,Initmates,Intimate,Intimates,-0.1,,,"['intimate', 'thread', 'item', 'cup', 'flimsy'..."


In [7]:
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))

STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    
    text = str(text)
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ambar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ambar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
def stem_words(text):
    
    text = str(text)
    return " ".join([stemmer.stem(word) for word in text.split()])

In [9]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    text = str(text)
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

In [10]:
intimates[COLNAME + "_"] = intimates[COLNAME].apply(lambda text: remove_punctuation(text))

In [11]:
intimates[COLNAME + "_"] = intimates[COLNAME + "_"].apply(lambda text: remove_stopwords(text))

In [12]:
intimates[COLNAME + "_"]

0            Absolutely wonderful silky sexy comfortable
1      Bought black xs go larkspur midi dress bother ...
2      I pregnant thought would great sleep bra soft ...
3      I looking bralettes provide support binding ti...
4      3 tags sewn 2 small 1 long 1 huge 2 x 3 itchy ...
                             ...                        
149                                       Overall pretty
150    I love robe wait wear getting ready upcoming w...
151    I high hopes things seem ride stay put love ad...
152    I bought pair standard fair sexy comfortable g...
153    A great long line bra beautiful well made howe...
Name: Review Text_, Length: 154, dtype: object

In [13]:
intimates.columns

Index(['Clothing ID', 'Age', 'Title', 'Review Text', 'Rating',
       'Recommended IND', 'Positive Feedback Count', 'Division Name',
       'Department Name', 'Class Name', 'sentiment_overall',
       'emotional_traits', 'behavioral_traits', 'concepts', 'Review Text_'],
      dtype='object')

In [30]:
positive = intimates[intimates.sentiment_overall >= 0.5]

In [31]:
positive_words = ' '.join(positive[COLNAME + "_"])

In [32]:
negative = intimates[intimates.sentiment_overall < 0.5]

In [33]:
negative_words = ' '.join(negative[COLNAME + "_"])

In [34]:
positive_words



In [35]:
negative_words

'Bought black xs go larkspur midi dress bother lining skirt portion grrrrrrrrrrr stats 34a 28 29 36 xs fit smoothly around chest flowy around lower half would say running big straps pretty could easily nightwear 5 6 came knees I pregnant thought would great sleep bra soft fits okay zero support shape would buy b cup smaller get away without support would seen store would passed however lazy return wearing comfortable redeeming quality would recommend larger chested ladies though I looking bralettes provide support binding tight night time casual wear light weight bra could little supportive pretty color nice lines downside retailer tag back itchy need remove 3 tags sewn 2 small 1 long 1 huge 2 x 3 itchy cut thread left behind plasticy even itchy make intimates item itchy tags comfortable also love bralettes wear time including work b cup however one thin flimsy gives support even b cup would lounging bralette itchy If hips bigger size 6 us fully close robe model must wearing larger two

In [36]:
df = pd.DataFrame([positive_words,negative_words])

In [42]:
df.columns =['text']

In [43]:
df

Unnamed: 0,text
0,Absolutely wonderful silky sexy comfortable Or...
1,Bought black xs go larkspur midi dress bother ...


In [39]:
tfidf_vectorizer = TfidfVectorizer()

In [44]:
t0 = time()
results = tfidf_vectorizer.fit_transform(df['text'])
print("done in %0.3fs." % (time() - t0))

done in 0.015s.


In [57]:
results.toarray()

array([[0.00748066, 0.00748066, 0.00532255, ..., 0.00748066, 0.        ,
        0.        ],
       [0.        , 0.        , 0.01731546, ..., 0.        , 0.02433627,
        0.02433627]])

In [58]:
tf_feature_names = tfidf_vectorizer.get_feature_names()

In [59]:
tf_feature_names

['10',
 '100',
 '110',
 '114',
 '115',
 '115lbs',
 '120',
 '123',
 '125lbs',
 '138',
 '150lbs',
 '28',
 '29',
 '30',
 '30dd',
 '32',
 '32a',
 '32b',
 '32c',
 '32d',
 '32dd',
 '34',
 '34a',
 '34b',
 '34c',
 '34d',
 '34dd',
 '35',
 '36',
 '36b',
 '36c',
 '36d',
 '36dd',
 '8yo',
 'able',
 'absolute',
 'absolutely',
 'accidentally',
 'accommodate',
 'across',
 'action',
 'actually',
 'adding',
 'adds',
 'adjustable',
 'adjusting',
 'adorable',
 'adore',
 'agree',
 'airy',
 'aka',
 'allowing',
 'almost',
 'alone',
 'along',
 'also',
 'although',
 'always',
 'amazing',
 'amazingly',
 'amount',
 'announce',
 'annoying',
 'another',
 'anymore',
 'anything',
 'anyways',
 'apart',
 'appalled',
 'appears',
 'applied',
 'appreciate',
 'arms',
 'around',
 'article',
 'as',
 'aside',
 'assoc',
 'athletic',
 'attractive',
 'available',
 'aware',
 'away',
 'awesome',
 'awkward',
 'awkwardly',
 'babies',
 'bachelorette',
 'back',
 'bad',
 'baggy',
 'band',
 'barely',
 'based',
 'basic',
 'beautiful',
 

In [67]:
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df

In [68]:
def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    row = np.squeeze(Xtr[row_id].toarray())
    return top_tfidf_feats(row, features, top_n)

In [69]:
top_feats_in_doc(results,tf_feature_names,0,10)

Unnamed: 0,feature,tfidf
0,bra,0.31403
1,love,0.218224
2,size,0.218224
3,wear,0.207579
4,comfortable,0.191612
5,small,0.186289
6,fit,0.180967
7,perfect,0.179536
8,fits,0.164999
9,like,0.154354


In [70]:
top_feats_in_doc(results,tf_feature_names,1,10)

Unnamed: 0,feature,tfidf
0,pretty,0.207785
1,small,0.19047
2,size,0.19047
3,would,0.19047
4,bra,0.173155
5,wear,0.173155
6,,0.170354
7,fit,0.155839
8,however,0.138524
9,even,0.138524
