In [1]:
import pandas as pd
import numpy as np

In [14]:
df = pd.read_csv(r"Data/amazon_baby.csv", index_col = 0)
df = df.head(100)

In [15]:
import json
with open(r'Data\important_words.json', 'r') as f: # Reads the list of most frequent words
    important_words = json.load(f)
important_words = [str(s) for s in important_words]

In [16]:
df = df[df['rating'] != 3] # ensuring neutral rating are ignored
df = df.dropna()
df['sentiment'] = df['rating'].apply(lambda rating : +1 if rating >3 else 0)
df.head()

Unnamed: 0,name,review,rating,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5.0,1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5.0,1
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5.0,1
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5.0,1
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5.0,1


In [17]:
df['review_clean'] = df['review'].str.replace('[^\w\s]','')  # Remove punctutation
df['review_clean'] = df['review_clean'].str.replace('\d+', '') # Remvove numerics
df['bias'] = 1  # Add Bias : Important for gradient descent

In [18]:
for word in important_words: # Create word count matrix : input feature matrix
    df[word] = df['review_clean'].apply(lambda s: str(s).split().count(word))

In [19]:
df.head(2)

Unnamed: 0,name,review,rating,sentiment,review_clean,bias,baby,one,great,love,...,seems,picture,completely,wish,buying,babies,won,tub,almost,either
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5.0,1,it came early and was not disappointed i love ...,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5.0,1,Very soft and comfortable and warmer than it l...,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
df['sentiment'].value_counts()

1    77
0    11
Name: sentiment, dtype: int64

In [21]:
# Input & Output Array
features = df.iloc[:,5:].values
output = df['sentiment'].values.reshape(-1, 1)

$$
P(y_i = +1 | \mathbf{x}_i,\mathbf{w}) = \frac{1}{1 + \exp(-\mathbf{w}^T h(\mathbf{x}_i))},
$$
$$ grad = \sum h(\mathbf{x}_i)[\mathbf{1}[y_i = +1] - \sigma^{w^T h(\mathbf{x}_i)}]$$
$$\ell\ell(\mathbf{w}) = \sum_{i=1}^N \Big( (\mathbf{1}[y_i = +1] - 1)\mathbf{w}^T h(\mathbf{x}_i) - \ln\left(1 + \exp(-\mathbf{w}^T h(\mathbf{x}_i))\right) \Big) $$


In [22]:
# Generate Functions
def get_predictions(features, coefficients):
    ''' Prediction is the sigmoid of WX '''
    pred = np.dot(features, coefficients)
    probability = 1/(1 + np.exp(-pred)).reshape(-1, 1)
    return probability

def get_gradient(error, features):
    ''' Gradient is product of error and features : Refer Notes '''
    grad = np.dot(features.T, error)
    return grad

def get_log_likelihood(features, output, coefficients):
    ''' Log Likelihood is as per above formula '''
    indicator = (output == +1)
    pred = np.dot(features, coefficients)
    first_term = (indicator - 1)*pred
    second_term = np.log(1 + np.exp(-pred))
    log_ll = np.sum(first_term - second_term)
    return log_ll

In [23]:
coefficients = np.zeros(shape = (194,1))
for i in np.arange(100):
    prob = get_predictions(features, coefficients)
    out_indicator = (output == +1)
    error = out_indicator - prob
    grad = get_gradient(error, features)
    coefficients = coefficients + 0.05*grad
    log_ll = get_log_likelihood(features, output, coefficients)
    if i % 10 == 0:
        print(log_ll)
    # Note likelihood is increasing over iterations

-32.74043284455757
-11.6020905287393
-7.9278556908260125
-6.319591942841592
-5.4295664372845085
-4.861606414256798
-4.462520761091753
-4.161975826136507
-3.923616368337765
-3.726946656003595


In [25]:
predictions = np.where(prob > 0.5, 1, 0)
error = output - predictions
accuracy = list(error).count(0)/len(error)
print(round(accuracy,3))

0.989


In [26]:
# Higher weights corresponds to +ve reviews/sentiments & vice-versa
df_coeff = pd.DataFrame([df.iloc[:,5:].columns, coefficients]).T
df_coeff.columns = ['word', 'weights']
df_coeff.sort_values(by = 'weights').head(10)

Unnamed: 0,word,weights
30,used,[-2.389722090125048]
113,waste,[-1.7941920990974933]
39,got,[-1.634211065006681]
97,money,[-1.5482772986116038]
123,received,[-1.3395838760732113]
141,part,[-1.295653252187493]
15,really,[-1.2954786498123965]
29,back,[-1.1950824446778452]
109,going,[-1.097310025522115]
192,almost,[-1.0846801411323013]
