### Libraries

In [140]:
import pandas as pd
import numpy as np
from collections import Counter
import time
import sys
import re






import warnings
warnings.filterwarnings('ignore')

In [7]:
df = pd.read_csv('Data/amazon_reviews_us_Electronics_v1_00.tsv.gz', sep='\t', error_bad_lines=False)

b'Skipping line 9076: expected 15 fields, saw 22\nSkipping line 19256: expected 15 fields, saw 22\nSkipping line 24313: expected 15 fields, saw 22\nSkipping line 47211: expected 15 fields, saw 22\nSkipping line 54295: expected 15 fields, saw 22\nSkipping line 56641: expected 15 fields, saw 22\nSkipping line 63067: expected 15 fields, saw 22\n'
b'Skipping line 93796: expected 15 fields, saw 22\n'
b'Skipping line 132806: expected 15 fields, saw 22\nSkipping line 164631: expected 15 fields, saw 22\nSkipping line 167019: expected 15 fields, saw 22\nSkipping line 167212: expected 15 fields, saw 22\n'
b'Skipping line 198103: expected 15 fields, saw 22\nSkipping line 199191: expected 15 fields, saw 22\nSkipping line 202841: expected 15 fields, saw 22\nSkipping line 218228: expected 15 fields, saw 22\nSkipping line 235900: expected 15 fields, saw 22\n'
b'Skipping line 277761: expected 15 fields, saw 22\nSkipping line 304582: expected 15 fields, saw 22\nSkipping line 312029: expected 15 fields,

In [8]:
df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,41409413,R2MTG1GCZLR2DK,B00428R89M,112201306,yoomall 5M Antenna WIFI RP-SMA Female to Male ...,Electronics,5,0,0,N,Y,Five Stars,As described.,2015-08-31
1,US,49668221,R2HBOEM8LE9928,B000068O48,734576678,"Hosa GPM-103 3.5mm TRS to 1/4"" TRS Adaptor",Electronics,5,0,0,N,Y,It works as advertising.,It works as advertising.,2015-08-31
2,US,12338275,R1P4RW1R9FDPEE,B000GGKOG8,614448099,Channel Master Titan 2 Antenna Preamplifier,Electronics,5,1,1,N,Y,Five Stars,Works pissa,2015-08-31
3,US,38487968,R1EBPM82ENI67M,B000NU4OTA,72265257,LIMTECH Wall charger + USB Hotsync & Charging ...,Electronics,1,0,0,N,Y,One Star,Did not work at all.,2015-08-31
4,US,23732619,R372S58V6D11AT,B00JOQIO6S,308169188,Skullcandy Air Raid Portable Bluetooth Speaker,Electronics,5,1,1,N,Y,Overall pleased with the item,Works well. Bass is somewhat lacking but is pr...,2015-08-31


DATA COLUMNS:

marketplace       - 2 letter country code of the marketplace where the review was written.

customer_id       - Random identifier that can be used to aggregate reviews written by a single author.

review_id         - The unique ID of the review.

product_id        - The unique Product ID the review pertains to. In the multilingual dataset the reviews
                    for the same product in different countries can be grouped by the same product_id.
                    
product_parent    - Random identifier that can be used to aggregate reviews for the same product.

product_title     - Title of the product.

product_category  - Broad product category that can be used to group reviews 
                    (also used to group the dataset into coherent parts).
                    
star_rating       - The 1-5 star rating of the review.

helpful_votes     - Number of helpful votes.

total_votes       - Number of total votes the review received.

vine              - Review was written as part of the Vine program.

verified_purchase - The review is on a verified purchase.

review_headline   - The title of the review.

review_body       - The review text.

review_date       - The date the review was written.


### Data Cleaning

In [23]:
#shape of the dataframe (3 million rows and 15 columns)
df.shape

(3090877, 15)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3091024 entries, 0 to 3091023
Data columns (total 15 columns):
marketplace          object
customer_id          int64
review_id            object
product_id           object
product_parent       int64
product_title        object
product_category     object
star_rating          int64
helpful_votes        int64
total_votes          int64
vine                 object
verified_purchase    object
review_headline      object
review_body          object
review_date          object
dtypes: int64(5), object(10)
memory usage: 353.7+ MB


In [16]:
#to check if there is any null value in our dataframe
df.isna().sum()

marketplace           0
customer_id           0
review_id             0
product_id            0
product_parent        0
product_title         4
product_category      0
star_rating           0
helpful_votes         0
total_votes           0
vine                  0
verified_purchase     0
review_headline      31
review_body          88
review_date          24
dtype: int64

In [17]:
#drop null values 
df.dropna(inplace=True)

In [19]:
#reviews of people who bought the product and have an opinion about it matters the most
df.verified_purchase.value_counts()

Y    2597511
N     493366
Name: verified_purchase, dtype: int64

In [20]:
#drop those reviews who didn't buy the product
df1 = df[df.verified_purchase != 'N']

In [21]:
#now our datafarmae only has contains reviews of people that bought the product
df1.verified_purchase.value_counts()

Y    2597511
Name: verified_purchase, dtype: int64

In [22]:
#check the class imbalance
df1.star_rating.value_counts()

5    1540908
4     443627
1     276682
3     195640
2     140654
Name: star_rating, dtype: int64

In [24]:
#show the name of the columns
df1.columns

Index(['marketplace', 'customer_id', 'review_id', 'product_id',
       'product_parent', 'product_title', 'product_category', 'star_rating',
       'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
       'review_headline', 'review_body', 'review_date'],
      dtype='object')

In [27]:
# Create a new dataframe and drop all the columns that we are not going to use them
df2 = df1.drop(columns=['marketplace','customer_id','review_id','product_id','product_parent','product_category',
                       'helpful_votes','total_votes','vine','review_date','verified_purchase'])

In [28]:
df2.head()

Unnamed: 0,product_title,star_rating,review_headline,review_body
0,yoomall 5M Antenna WIFI RP-SMA Female to Male ...,5,Five Stars,As described.
1,"Hosa GPM-103 3.5mm TRS to 1/4"" TRS Adaptor",5,It works as advertising.,It works as advertising.
2,Channel Master Titan 2 Antenna Preamplifier,5,Five Stars,Works pissa
3,LIMTECH Wall charger + USB Hotsync & Charging ...,1,One Star,Did not work at all.
4,Skullcandy Air Raid Portable Bluetooth Speaker,5,Overall pleased with the item,Works well. Bass is somewhat lacking but is pr...


In [38]:
#create DFs of each class
df_class1 = df2[df2.star_rating == 1]
df_class2 = df2[df2.star_rating == 2]
df_class3 = df2[df2.star_rating == 3]
df_class4 = df2[df2.star_rating == 4]
df_class5 = df2[df2.star_rating == 5]

In [39]:
#sample the data of star rating of 5 and 4, because more than 70% of the reviews belong to label 5
df_class5 = df_class5.sample(n=200000, random_state=52)
df_class4 = df_class4.sample(n=200000, random_state=52)

In [41]:
#concatinate the dataframes
df3 = pd.concat([df_class1, df_class2,df_class3,df_class4,df_class5], ignore_index=True)
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1012976 entries, 0 to 1012975
Data columns (total 4 columns):
product_title      1012976 non-null object
star_rating        1012976 non-null int64
review_headline    1012976 non-null object
review_body        1012976 non-null object
dtypes: int64(1), object(3)
memory usage: 30.9+ MB


In [42]:
#check the class imbalance
df3.star_rating.value_counts()

1    276682
5    200000
4    200000
3    195640
2    140654
Name: star_rating, dtype: int64

In [79]:
#shuffle the dataframe,  so all labels wouldn't be in order.
df4 = df3.sample(frac=1).reset_index(drop=True)
df4.head()

Unnamed: 0,product_title,star_rating,review_headline,review_body
0,Logitech 915-000162 Harmony 700 Rechargeable R...,1,doesnt select the correct input on tv,"When changing &#34;activities,&#34; it does no..."
1,JLab JBuds Hi-Fi Noise-Reducing Ear Buds (Purple),1,A real sack of crap,These are the worst earphones I've ever owned....
2,Panasonic ErgoFit In-Ear Earbud Headphone,4,High quality sound,"Nice sound quality, stays in ears while in the..."
3,Soundsoul(TM) Noise-isolating Sport In-ear Hea...,1,One Star Is Too Generous,I threw this pair of headphones into the trash...
4,Mpow Edge Wireless Bluetooth 4.0 Headset Headp...,1,It's a nice looking headset and comfortable to...,"Unfortunately, this headset didn't work well a..."


In [80]:
#feature engineering a column that shows the feeling of the review (emotion)
def emotion(row):
    if row.star_rating == 1 or row.star_rating == 2:
        y = 'Negative'
    elif row.star_rating == 4 or row.star_rating == 5:
        y = 'Positive'
    else:
        y = "Neutral"
    return y       

In [81]:
#create the new column on the fly
df4['emotion'] = df4.apply(lambda row: emotion(row), axis=1)

In [83]:
df4.head()

Unnamed: 0,product_title,star_rating,review_headline,review_body,emotion
0,Logitech 915-000162 Harmony 700 Rechargeable R...,1,doesnt select the correct input on tv,"When changing &#34;activities,&#34; it does no...",Negative
1,JLab JBuds Hi-Fi Noise-Reducing Ear Buds (Purple),1,A real sack of crap,These are the worst earphones I've ever owned....,Negative
2,Panasonic ErgoFit In-Ear Earbud Headphone,4,High quality sound,"Nice sound quality, stays in ears while in the...",Positive
3,Soundsoul(TM) Noise-isolating Sport In-ear Hea...,1,One Star Is Too Generous,I threw this pair of headphones into the trash...,Negative
4,Mpow Edge Wireless Bluetooth 4.0 Headset Headp...,1,It's a nice looking headset and comfortable to...,"Unfortunately, this headset didn't work well a...",Negative


#### Text Cleaning

In [167]:
#define a function to remove special characters, make all letters lower, and remove additional spaces
def text_cleaner(sentence):
    sentence = sentence.lower()
    cleaner = re.compile('[^a-z]')
    cleantext = re.sub(cleaner, ' ', sentence)
    cleaner1 = re.compile('\s+')
    cleantext2 = re.sub(cleaner1, ' ', cleantext)
    return cleantext2

In [168]:
df4['clean_review'] = df4['review_body'].map(lambda s:text_cleaner(s))

In [282]:
df4.tail(5)

Unnamed: 0,product_title,star_rating,review_headline,review_body,emotion,clean_review
1012971,AECO Bluetooth4.0 Headphones Supports NFC Blue...,3,"Sound good, the rest is bad",Sound is good enough. I can kick the volume al...,Neutral,sound is good enough i can kick the volume all...
1012972,Cables To Go 3.5 mm Male/Female Stereo Audio E...,2,The shortage wasn't even at the connection poi...,They are made very cheap and only one side wou...,Negative,they are made very cheap and only one side wou...
1012973,Generic Replacement for Sony XL-5100 Replaceme...,3,Not as bright,I purchased this lamp and thought it would be ...,Neutral,i purchased this lamp and thought it would be ...
1012974,Electroline EDA2100 Bi-Directional Signal Boos...,1,Does not do what is listed on this product. ...,Does not do what is listed on this product. It...,Negative,does not do what is listed on this product it ...
1012975,Yamaha RX-V473 Receiver,3,Been working well for sometime but lately I've...,Been working well for sometime but lately I've...,Neutral,been working well for sometime but lately i ve...


### EDA

In [172]:
#an example of a review
df4.clean_review[1]

'these are the worst earphones i ve ever owned i was leaving town and ordered them via prime shipping so i d have them before i left i wish i had time to listen to earbuds before i had gotten onto the airplane the one thing these little crappies don t do is block noise unless you can consider the distorted sound these emit as noise reducing these are horrible buds '

In [86]:
df4.emotion[1]

'Negative'

In [217]:
def pretty_print_review_label(i):
    print(df4.emotion[i] + " \t : \t " + df4.review_body[i][:80] + "...")

In [218]:
# How could I take this dataset and validate my theory that words are predictive of labels.
print("Labels \t\t : \t Review\n")
pretty_print_review_label(100)
pretty_print_review_label(1000)
pretty_print_review_label(10000)
pretty_print_review_label(100000)
pretty_print_review_label(1000000)

Labels 		 : 	 Review

Neutral 	 : 	 This had pry tools and screwdrivers but it didn't have the second screwdriver ne...
Neutral 	 : 	 Works as intended, but even with the angles other plugs pretty much always get i...
Negative 	 : 	 this thing keeps coming undone end slots too soft for keeping the cord in place ...
Positive 	 : 	 Great speakers, sound is amazing and speakers are elegant and stylish; I would h...
Positive 	 : 	 Works more slowly, but charges well. All my things are powered and the charge se...


In [220]:
#Count acts a lot like a dictionary, but they actually don't create the key, so they are much faster
positive_counts = Counter()
negative_counts = Counter()
neutral_counts = Counter()
total_counts = Counter()

In [221]:
for i in range(len(df4)):
    if (df4.emotion[i] == 'Positive'):
        for word in df4.clean_review[i].split(" "): #to count every positive word in a positive review
            positive_counts[word] += 1
            total_counts[word] += 1
    elif (df4.emotion[i] == 'Negative'):
        for word in df4.clean_review[i].split(" "):
            negative_counts[word] += 1
            total_counts[word] += 1
    else:
        for word in df4.clean_review[i].split(" "):
            neutral_counts[word] += 1
            total_counts[word] += 1

In [222]:
#Total number of words
len(total_counts)

142478

In [223]:
positive_counts.most_common()[:20]

[('the', 1236308),
 ('i', 812546),
 ('to', 650347),
 ('and', 642331),
 ('a', 582366),
 ('it', 566263),
 ('is', 378519),
 ('for', 353372),
 ('', 331200),
 ('of', 304571),
 ('this', 285533),
 ('my', 285289),
 ('in', 250959),
 ('br', 249388),
 ('that', 238965),
 ('with', 235158),
 ('on', 201634),
 ('you', 198851),
 ('but', 195287),
 ('have', 188305)]

In [224]:
negative_counts.most_common()[:20]

[('the', 1381085),
 ('i', 964436),
 ('to', 714286),
 ('it', 712934),
 ('and', 639897),
 ('a', 587627),
 ('of', 357742),
 ('this', 350962),
 ('', 350124),
 ('not', 343817),
 ('is', 334288),
 ('for', 312655),
 ('in', 268851),
 ('my', 266477),
 ('that', 258386),
 ('br', 252823),
 ('was', 250122),
 ('t', 237871),
 ('on', 231259),
 ('but', 222247)]

In [281]:
neutral_counts.most_common()[:20]

[('the', 805432),
 ('i', 482921),
 ('to', 398216),
 ('it', 371018),
 ('a', 328701),
 ('and', 321091),
 ('is', 229674),
 ('for', 194491),
 ('of', 186378),
 ('', 165350),
 ('but', 163665),
 ('br', 160799),
 ('that', 153539),
 ('in', 149266),
 ('not', 148056),
 ('this', 147773),
 ('my', 142679),
 ('on', 124198),
 ('with', 123947),
 ('you', 123755)]

* we are not really intersted in the most frequent positive words, we are interested in words that are most frequently positive versus negative.

* we want to come up with some sort of ratio that is more comparative between these two lists

In [258]:
#because most of the most common words in positive and negative lists are almost the same, making a ratio would solve
#the issue
pos_neg_ratios = Counter()

for term,cnt in list(total_counts.most_common()):
    if(cnt>60):
        pos_neg_ratio = positive_counts[term] / float(negative_counts[term]+1)
        pos_neg_ratios[term] = pos_neg_ratio

In [260]:
#Make it even better
for word, ratio in pos_neg_ratios.most_common():
    if(ratio > 1):#It means ratios greater than 10 because of the log base
        pos_neg_ratios[word] = np.log(ratio)
    else:
        pos_neg_ratios[word] = -np.log((1/(ratio+0.01)))

In [261]:
#words frequently seen in a review with a 'Positive' label
pos_neg_ratios.most_common()[:20]

[('exelent', 3.349904087274605),
 ('excelente', 3.247787168362621),
 ('buen', 3.1780538303479458),
 ('excelent', 2.900422093749666),
 ('exellent', 2.847812143477369),
 ('invaluable', 2.803360380906535),
 ('magni', 2.691243082785829),
 ('lifesaver', 2.4471663218051534),
 ('qualm', 2.3978952727983707),
 ('loves', 2.3837571062647336),
 ('pleasantly', 2.316423327659231),
 ('perfecto', 2.268683541318364),
 ('blends', 2.216200478795225),
 ('exceeded', 2.1983133090714158),
 ('fills', 2.1936339092054906),
 ('simplifies', 2.178532444324067),
 ('quibbles', 2.164963715117998),
 ('modi', 2.159484249353372),
 ('beauties', 2.159484249353372),
 ('xpa', 2.128231705849268)]

In [262]:
#words frequently seen in a review with a 'Negative' label
list(reversed(pos_neg_ratios.most_common()))[:20]

[('fraud', -3.393678549248128),
 ('reimburse', -3.2691858350074026),
 ('dishonest', -3.2336909106533414),
 ('returnable', -3.138833117194664),
 ('paperweight', -3.100509188378212),
 ('wast', -3.0878759509871565),
 ('refunds', -3.073273794016478),
 ('reimbursement', -3.0470255679415414),
 ('emptor', -3.045048892065669),
 ('refund', -3.0113748606411432),
 ('misrepresentation', -2.995732273553991),
 ('fraudulent', -2.9284800465448235),
 ('restocking', -2.8908103026094523),
 ('recourse', -2.8796601023012367),
 ('waste', -2.8619168303665585),
 ('junk', -2.8289888915147876),
 ('insult', -2.753272358021503),
 ('restock', -2.724857319418591),
 ('heeded', -2.724857319418591),
 ('reimbursed', -2.719607412358571)]

### Design a simple NN

In [244]:
#Create input/output data
#Count all the words that happende in our review
vocab = set(total_counts.keys())
vocab_size = len(vocab)
print(vocab_size)

142478


In [247]:
#put the vocab_size into a fixed length vector where each place in the vector belongs to one word of our vocabulary
#vector is empty and we edit it as we go forward
#I did this for memory efficiency, so we dont have to create a vector from scrath every time when we want to use it
#we also don't want to pregenerate data from the entire dataset, because that would be a matrix of 
# vocab_size * len(df4) = 144326794528 !!!!
layer_0 = np.zeros((1, vocab_size))
layer_0

array([[0., 0., 0., ..., 0., 0., 0.]])

In [248]:
word2index = {}

for i,word in enumerate(vocab):
    word2index[word] = i

In [263]:
def update_input_layer(review):
    
    global layer_0
    
    #clear out previous state, reset the layer to be all 0s
    layer_0 *= 0
    for word in review.split(" "):#iterate through each word
        layer_0[0][word2index[word]] += 1  # allocate a position in the vector by incrementing
        
update_input_layer(df4.clean_review[1])

In [279]:
layer_0[0]

array([1., 0., 0., ..., 0., 0., 0.])

In [274]:
#Create a function to get the label 
def get_target_for_label(label):
    if (label == 'Positive'):
        return 1
    elif (label == 'Negative'):
        return -1
    else:
        return 0

In [270]:
df4.emotion[1]

'Negative'

In [271]:
get_target_for_label(df4.emotion[0])

-1

* Create our neural network
* 3 layer neural network
* no non-linearity in hidden layer
* use our functions to create the training data
* create a "pre_process_data" function to create vocabulary for our data generating functions
* modify "train" to train over the entire corpus

In [None]:
#create a network by using counts of words in our review, to predict the sentiment of our review
class SentimentNetwork:
    def __init__(self, reviews, labels, hidden_nodes=10, learning_rate=0.1):
        #set our random number generator
        np.random.seed(52)
        
        self.pre_process_data(reviews, labels)
        
        self.init_network(len(self.review_vocab), hidden_nodes, 1, learning_rate)
        
    def pre_process_data(self, reviews, labels):
        
        review_vocab = set()
        for review in reviews:
            for word in review.split(" "):
                review_vocab.add(word)
        self.review_vocab = list(review_vocab)
        
        label_vocab = set()
        for label in labels:
            label_vocab.add(label)
        
        self.label_vocab = list(label_vocab)
        
        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)
        
        self.word2index = {}
        for i, word in enumerate(self.review_vocab):
            self.word2index[word] = i
            
        self.label2index = {}
        for i, label in enumerate(self.label_vocab):
            self.label2index[label] = i
            
    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        #set number of nodes in input, hidden and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes
        
        #initialize weights
        self.weights_0_1 = np.zeros((self.input_nodes, self.hidden_nodes))
        
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5,
                                           (self.hidden_nodes, self.output_nodes))
        self.learning_rate = learning_rate
        
        self.layer_0 = np.zeros((1, input))
        
    def update_input_layer(self, review):
        
        #clear out previous state, reset the layer to be all 0s
        self.layer_0 *= 0
        for word in review.split(" "):
            if(word in self.word2index.keys()):
                self.layer_0[0][self.word2index[word]] +=1
                
    def get_target_for_label(self, label):
        if (label == 'Positive'):
            return 2
        elif (label == 'Neutral'):
            return 1
        else:
            return 0
        
    def softmax(self, x):
    """Compute softmax values for each sets of scores in x."""
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum()

    def softmax_output_2_derivative(self, output):
        
        mat = np.diag(output)
        
        for i in range(len(mat)):
            for j in range(len(mat)):
                if i == j:
                    mat[i][j] = output[i] * (1 - output[i])
                else:
                    mat[i][j] = -output[i] * output[j]
        return mat
        
    def train(self, training_reviews, training_labels):
        
        assert(len(training_reviews) == len(training_labels))
        
        #how the neural net is doing during the training process
        correct_so_far = 0
        
        start = time.time()
        
        for i in range(len(training_reviews)):
            
            review = training_reviews[i]
            label = training_labels[i]
            
            #implement the forward propagation here
            #forward pass
            
            #input layer
            self.update_input_layer(review)
            
            #Hidden layer
            layer_1 = self.layer_0.dot(self.weights_0_1)
            
            #Output layer
            layer_2 = self.softmax(layer_1.dot(self.weights_1_2))
            
            #implement the backward propagation here
            #Backward pass
            
            #Output error
            layer_2_error = layer_2 - self.get_target_for_label(label)
            layer_2_delta = layer_2_error * self.softmax_output_2_derivative(layer_2)
            
            #backpropagated error
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T) #errors propagated to the hidden layer
            layer_1_delta = layer_1_error #hidden layer gradients - no nonlinearity so it's the same as the error
            
            #update the weights
            self.weights_1_2 -= layer_1.T.dot(layer_2_delta) * self.learning_rate #Update hidden-to-output weights
            self.weights_0_1 -= self.layer_0.T.dot(layer_1_delta) * self.learning_rate #update input-to-hidden weights
            
            if(np.abs(layer_2_error) < 0.5):
                correct_so_far += 1