In [1]:
import numpy as np
import pandas as pd
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.externals import joblib
import math

In [2]:
products = pd.read_csv('amazon_baby_subset.csv')

In [3]:
products.head(3)

Unnamed: 0,name,review,rating,sentiment
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1


In [4]:
products['name'].head(10)

0    Stop Pacifier Sucking without tears with Thumb...
1      Nature's Lullabies Second Year Sticker Calendar
2      Nature's Lullabies Second Year Sticker Calendar
3                          Lamaze Peekaboo, I Love You
4    SoftPlay Peek-A-Boo Where's Elmo A Children's ...
5                            Our Baby Girl Memory Book
6    Hunnt&reg; Falling Flowers and Birds Kids Nurs...
7    Blessed By Pope Benedict XVI Divine Mercy Full...
8    Cloth Diaper Pins Stainless Steel Traditional ...
9    Cloth Diaper Pins Stainless Steel Traditional ...
Name: name, dtype: object

In [5]:
print(products.sentiment.value_counts())

 1    26579
-1    26493
Name: sentiment, dtype: int64


## Cleaning

In [6]:
def remove_punctuation(text):
    translator = text.maketrans('','',string.punctuation)
    return text.translate(translator)

In [7]:
products.review.fillna('',inplace=True)

In [8]:
products['review_clean'] = products['review'].apply(remove_punctuation)

In [9]:
products.iloc[0]['review_clean']

'All of my kids have cried nonstop when I tried to ween them off their pacifier until I found Thumbuddy To Loves Binky Fairy Puppet  It is an easy way to work with your kids to allow them to understand where their pacifier is going and help them part from itThis is a must buy book and a great gift for expecting parents  You will save them soo many headachesThanks for this book  You all rock'

In [10]:
important_words = pd.read_json('important_words.json')[0]
important_words = list(important_words)

In [11]:
for word in important_words:
    products[word] = products['review_clean'].apply(lambda s : s.split().count(word))

In [12]:
products['contains_perfect'] = np.where(products['perfect']>=1,1,0)

In [13]:
print(products.contains_perfect.sum(),'reviews contain the word perfect')

2955 reviews contain the word perfect


## Convert DataFrame to multi-dimensional array 

In [14]:
def get_numpy_data(df, features, label):
    df['constant'] = 1
    features = ['constant'] + features
    features_frame = df[features]
    feature_matrix = features_frame.as_matrix()
    label_sarray = df[label]
    label_array = label_sarray.as_matrix()
    return(feature_matrix, label_array)

In [15]:
feature_matrix, sentiment = get_numpy_data(products,important_words,'sentiment')

In [16]:
print('There are',feature_matrix.shape[1],'features in feature_matrix')

There are 194 features in feature_matrix


## Estimating conditional probability with link function

In [17]:
'''
produces probablistic estimate for P(y_i = +1 | x_i, w).
estimate ranges between 0 and 1.
'''

def predict_probability(feature_matrix, coefficients):
    # Take dot product of feature_matrix and coefficients  
    # YOUR CODE HERE
    score = np.dot(feature_matrix, coefficients)

    # Compute P(y_i = +1 | x_i, w) using the link function
    # YOUR CODE HERE
    predictions = np.exp(score)
    predictions = predictions+1
    predictions = 1/predictions
    
    # return predictions
    return score

## Compute derivative of log likelihood with respect to a single coefficient

In [18]:
def feature_derivative(errors, feature):     
    # Compute the dot product of errors and feature
    derivative = np.dot(errors,feature)
        # Return the derivative
    return derivative

In [19]:
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    indicator = (sentiment==+1)
    scores = np.dot(feature_matrix, coefficients)
    lp = np.sum((indicator-1)*scores - np.log(1. + np.exp(-scores)))
    return lp

## Taking Gradient Steps

In [27]:
from math import sqrt
def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    coefficients = np.array(initial_coefficients) # make sure it's a numpy array
    for itr in range(max_iter):
        # Predict P(y_i = +1|x_1,w) using your predict_probability() function
        # YOUR CODE HERE
        predictions = predict_probability(feature_matrix, coefficients)

        # Compute indicator value for (y_i = +1)
        indicator = (sentiment==+1)

        # Compute the errors as indicator - predictions
        errors = indicator - predictions

        for j in range(len(coefficients)): # loop over each coefficient
            # Recall that feature_matrix[:,j] is the feature column associated with coefficients[j]
            # compute the derivative for coefficients[j]. Save it in a variable called derivative
            # YOUR CODE HERE
            derivative = feature_derivative(errors,feature_matrix[:,j])
            
            # add the step size times the derivative to the current coefficient
            # YOUR CODE HERE
            coefficients[j] = step_size*derivative

#         Checking whether log likelihood is increasing
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
        or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
            print('iteration %*d: log likelihood of observed labels = %.8f' % \
                (int(np.ceil(np.log10(max_iter))), itr, lp))
    return coefficients

In [49]:
from math import sqrt
def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    coefficients = np.array(initial_coefficients) # make sure it's a numpy array
    for itr in range(max_iter):
        # Predict P(y_i = +1|x_1,w) using your predict_probability() function
        # YOUR CODE HERE
        predictions = predict_probability(feature_matrix, coefficients)

        # Compute indicator value for (y_i = +1)
        indicator = (sentiment==+1)

        # Compute the errors as indicator - predictions
        errors = indicator - predictions

        for j in range(len(coefficients)): # loop over each coefficient
            # Recall that feature_matrix[:,j] is the feature column associated with coefficients[j]
            # compute the derivative for coefficients[j]. Save it in a variable called derivative
            # YOUR CODE HERE
            derivative = feature_derivative(errors,feature_matrix[:,j])

            # add the step size times the derivative to the current coefficient
            # YOUR CODE HERE
            coefficients[j] = step_size*derivative

        # Checking whether log likelihood is increasing
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
        or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
            print('iteration %*d: log likelihood of observed labels = %.8f' % \
                (int(np.ceil(np.log10(max_iter))), itr, lp))
    return coefficients

In [50]:
coefficients=logistic_regression(feature_matrix, sentiment, np.zeros(194), 1e-7, 301)

iteration   0: log likelihood of observed labels = -36784.76305157
iteration   1: log likelihood of observed labels = -36784.65680893
iteration   2: log likelihood of observed labels = -36784.65949602
iteration   3: log likelihood of observed labels = -36784.65943366
iteration   4: log likelihood of observed labels = -36784.65943510
iteration   5: log likelihood of observed labels = -36784.65943506
iteration   6: log likelihood of observed labels = -36784.65943506
iteration   7: log likelihood of observed labels = -36784.65943506
iteration   8: log likelihood of observed labels = -36784.65943506
iteration   9: log likelihood of observed labels = -36784.65943506
iteration  10: log likelihood of observed labels = -36784.65943506
iteration  11: log likelihood of observed labels = -36784.65943506
iteration  12: log likelihood of observed labels = -36784.65943506
iteration  13: log likelihood of observed labels = -36784.65943506
iteration  14: log likelihood of observed labels = -36784.6594

## Predicting Sentiments

In [30]:
print(coefficients.shape)
print(feature_matrix.shape)

(194,)
(53072, 194)


In [31]:
scores = np.dot(feature_matrix,coefficients)

In [41]:
def score_function(score):
    if score>0:
        label=1
    elif score<=0:
        label=-1
    return label

In [44]:
list(scores)

[0.0055431910262623467,
 0.0040263813936399573,
 0.006705388571085602,
 0.0040606700159841686,
 0.0046858394479259132,
 0.0041159383086589972,
 0.0030215556889651141,
 0.0035072985537173379,
 0.0081959082112294922,
 0.0073503695559136283,
 0.010422782278046105,
 0.0068816549765326751,
 0.0048883719876048545,
 0.0056748274437446265,
 0.0073793776335430977,
 0.007474999813810581,
 0.0073369463311086357,
 0.016178525298834564,
 0.0082781496246081901,
 0.0061219892913046874,
 0.0058774824009322942,
 0.0080708412097892611,
 0.0093472345723904903,
 0.0068391889734956144,
 0.0066996213174256693,
 0.013331437821931684,
 0.00826070944722853,
 0.012143674278177256,
 0.0052649730082182508,
 0.0051160162064969308,
 0.0056241524488176136,
 0.0035862032368720147,
 0.0031614461861020834,
 0.0052280934957776417,
 0.0068119284962284612,
 0.0062266841623338821,
 0.0033842989748800968,
 0.010060345578806303,
 0.037198424834865421,
 0.0060844288195253133,
 0.0051795968905494283,
 0.0045489505773208807,
 0