## Data Preparation Experiments

In [43]:
import os
import json
import pandas as pd

import warnings
warnings.filterwarnings('ignore') 

In [44]:
# Reading a file.
df = None
with open("../data/products/059449771X", "r") as jsonfile:
    df = pd.read_json(jsonfile, orient=str)
df

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,image
0,5,14.0,True,"01 3, 2015",A2CHXQ8OHEPUEB,059449771X,Robbin,Great seller! Product arrived as promised and ...,Great product and fantastic seller!,1420243200,
1,1,,True,"01 8, 2017",A3SHI9VBY9T8S3,059449771X,Milena,Horrible too expensive and does not fit,One Star,1483833600,
2,1,,True,"01 4, 2017",A25K77UY3HQACA,059449771X,Vicki Perry,"No, I received the wrong charger and had to re...",One Star,1483488000,
3,1,,True,"01 3, 2017",A1K58Z05KXDUZX,059449771X,Melissa Fagan,Tried on 2 different nooks and didn't work. Co...,One Star,1483401600,
4,5,,True,"12 24, 2016",A2QBUZGWKA4RHJ,059449771X,Amazon Customer,This is the only replacement that is truly lab...,"The real deal, not a knockoff!",1482537600,
...,...,...,...,...,...,...,...,...,...,...,...
116,1,,True,"03 28, 2015",AU8777WMB30QN,059449771X,Janella Graham,Horrible never worked at all and a huge dissap...,One Star,1427500800,
117,5,,False,"03 15, 2015",ADAF8T30HV8BZ,059449771X,Samantha Rodriguez,This charger is the best charger to use for th...,Five Stars,1426377600,
118,1,3.0,False,"02 8, 2015",A5KX4RL4ZDG6I,059449771X,Kindle Customer,Be prepared to buy hundreds of these cords bec...,Scam waste of money,1423353600,
119,1,,True,"02 8, 2015",A1ZD7SHJJ61JP4,059449771X,Denise A. Niederriter,Takes 2 days to charge. Definitely not origin...,misrepresentation of product,1423353600,


In [45]:
# Only keep the relevant columns
df = df[["asin", "reviewText", "summary", "overall"]]
df

Unnamed: 0,asin,reviewText,summary,overall
0,059449771X,Great seller! Product arrived as promised and ...,Great product and fantastic seller!,5
1,059449771X,Horrible too expensive and does not fit,One Star,1
2,059449771X,"No, I received the wrong charger and had to re...",One Star,1
3,059449771X,Tried on 2 different nooks and didn't work. Co...,One Star,1
4,059449771X,This is the only replacement that is truly lab...,"The real deal, not a knockoff!",5
...,...,...,...,...
116,059449771X,Horrible never worked at all and a huge dissap...,One Star,1
117,059449771X,This charger is the best charger to use for th...,Five Stars,5
118,059449771X,Be prepared to buy hundreds of these cords bec...,Scam waste of money,1
119,059449771X,Takes 2 days to charge. Definitely not origin...,misrepresentation of product,1


### Convert to lowercase

In [46]:
df['reviewText'] = df['reviewText'].str.lower()
df['summary'] = df['summary'].str.lower()

### Perform contraction expansion

In [47]:
import contractions

df['reviewText'] = df['reviewText'].apply(lambda x: contractions.fix(x))
df['summary'] = df['summary'].apply(lambda x: contractions.fix(x))

### Remove extra white space characters.

In [48]:
df['reviewText'] = df['reviewText'].str.replace('\s+', ' ')
df['summary'] = df['summary'].str.replace('\s+', ' ')

In [49]:
df

Unnamed: 0,asin,reviewText,summary,overall
0,059449771X,great seller! product arrived as promised and ...,great product and fantastic seller!,5
1,059449771X,horrible too expensive and does not fit,one star,1
2,059449771X,"no, i received the wrong charger and had to re...",one star,1
3,059449771X,tried on 2 different nooks and did not work. c...,one star,1
4,059449771X,this is the only replacement that is truly lab...,"the real deal, not a knockoff!",5
...,...,...,...,...
116,059449771X,horrible never worked at all and a huge dissap...,one star,1
117,059449771X,this charger is the best charger to use for th...,five stars,5
118,059449771X,be prepared to buy hundreds of these cords bec...,scam waste of money,1
119,059449771X,takes 2 days to charge. definitely not origina...,misrepresentation of product,1


## Exploring the Emotion Lexicon

In [15]:
# Loading the word level emotion lexicon.
emotion_lexicon_df = pd.read_csv("../data/NRC-Emotion-Lexicon/NRC-Emotion-Lexicon-v0.92/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt", delimiter="\t", header=None, names=["word", "emotion", "association"])
emotion_lexicon_df

Unnamed: 0,word,emotion,association
0,aback,anger,0
1,aback,anticipation,0
2,aback,disgust,0
3,aback,fear,0
4,aback,joy,0
...,...,...,...
141535,zoom,negative,0
141536,zoom,positive,0
141537,zoom,sadness,0
141538,zoom,surprise,0


In [50]:
grouped_df = emotion_lexicon_df.groupby("word")
for name, group in grouped_df:
    print(name)
    print(group)

aback
    word       emotion  association
0  aback         anger            0
1  aback  anticipation            0
2  aback       disgust            0
3  aback          fear            0
4  aback           joy            0
5  aback      negative            0
6  aback      positive            0
7  aback       sadness            0
8  aback      surprise            0
9  aback         trust            0
abacus
      word       emotion  association
10  abacus         anger            0
11  abacus  anticipation            0
12  abacus       disgust            0
13  abacus          fear            0
14  abacus           joy            0
15  abacus      negative            0
16  abacus      positive            0
17  abacus       sadness            0
18  abacus      surprise            0
19  abacus         trust            1
abandon
       word       emotion  association
20  abandon         anger            0
21  abandon  anticipation            0
22  abandon       disgust            0
23  aband

KeyboardInterrupt: 