# Load review dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
products = pd.read_csv('c://users/evgeniy.pahnuk/Desktop/Coursera/Classification/week2/amazon_baby_subset.csv')
products = products.fillna({'review':''}) 

In [3]:
products.head(10)['name']

0    Stop Pacifier Sucking without tears with Thumb...
1      Nature's Lullabies Second Year Sticker Calendar
2      Nature's Lullabies Second Year Sticker Calendar
3                          Lamaze Peekaboo, I Love You
4    SoftPlay Peek-A-Boo Where's Elmo A Children's ...
5                            Our Baby Girl Memory Book
6    Hunnt&reg; Falling Flowers and Birds Kids Nurs...
7    Blessed By Pope Benedict XVI Divine Mercy Full...
8    Cloth Diaper Pins Stainless Steel Traditional ...
9    Cloth Diaper Pins Stainless Steel Traditional ...
Name: name, dtype: object

In [4]:
print ('# of positive reviews =', len(products[products['sentiment']==1]))
print ('# of negative reviews =', len(products[products['sentiment']==-1]))

# of positive reviews = 26579
# of negative reviews = 26493


# Apply text cleaning on the review data

In [5]:
import json
with open('c://users/evgeniy.pahnuk/Desktop/Coursera/Classification/week2/important_words.json', 'r') as f: # Reads the list of most frequent words
    important_words = json.load(f)
important_words = [str(s) for s in important_words]

In [6]:
def remove_punctuation(text):
    import string
    new_text = text
    for i in string.punctuation:
        new_text = new_text.str.replace(i, '')
    return new_text

In [7]:
products['review_clean'] = remove_punctuation(products['review'])

In [8]:
for word in important_words:
    #products[word] = products['review_clean'].str.count(str(word))
    products[word] = products['review_clean'].apply(lambda s : s.split().count(word))

In [9]:
products.shape

(53072, 198)

In [10]:
products.head(1)

Unnamed: 0,name,review,rating,sentiment,review_clean,baby,one,great,love,use,...,seems,picture,completely,wish,buying,babies,won,tub,almost,either
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1,All of my kids have cried nonstop when I tried...,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
products['contains_perfect'] = products['perfect'].apply(lambda perfect : +1 if perfect > 0 else 0)

In [12]:
sum(products['contains_perfect'])

2955

# Convert SFrame to NumPy array

In [13]:
def get_numpy_data(data_sframe, features, label):
    data_sframe['intercept'] = 1
    features = ['intercept'] + features
    features_sframe = data_sframe[features]
    feature_matrix = features_sframe.as_matrix()
    label_sarray = data_sframe[label]
    label_array = np.array(label_sarray)
    return(feature_matrix, label_array)

In [14]:
feature_matrix, sentiment = get_numpy_data(products, important_words, 'sentiment')

In [15]:
feature_matrix.shape

(53072, 194)

# Estimating conditional probability with link function

In [16]:
import math

In [17]:
'''
produces probablistic estimate for P(y_i = +1 | x_i, w).
estimate ranges between 0 and 1.
'''
def predict_probability(feature_matrix, coefficients):
    # Take dot product of feature_matrix and coefficients  
    scores = np.dot(feature_matrix,coefficients)
    # Compute P(y_i = +1 | x_i, w) using the link function
    predictions = 1/( 1+np.exp(-scores) )
    # return predictions
    return predictions

### Checkpoint

In [18]:
dummy_feature_matrix = np.array([[1.,2.,3.], [1.,-1.,-1]])
dummy_coefficients = np.array([1., 3., -1.])

correct_scores      = np.array( [ 1.*1. + 2.*3. + 3.*(-1.),          1.*1. + (-1.)*3. + (-1.)*(-1.) ] )
correct_predictions = np.array( [ 1./(1+np.exp(-correct_scores[0])), 1./(1+np.exp(-correct_scores[1])) ] )

print ('The following outputs must match ')
print ('------------------------------------------------')
print ('correct_predictions           =', correct_predictions)
print ('output of predict_probability =', predict_probability(dummy_feature_matrix, dummy_coefficients))

The following outputs must match 
------------------------------------------------
correct_predictions           = [ 0.98201379  0.26894142]
output of predict_probability = [ 0.98201379  0.26894142]


# Compute derivative of log likelihood with respect to a single coefficient

In [19]:
def feature_derivative(errors, feature):     
    # Compute the dot product of errors and feature
    derivative = np.dot(errors, feature)
    # Return the derivative
    return derivative

In [20]:
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    indicator = (sentiment==+1)
    scores = np.dot(feature_matrix, coefficients)
    logexp = np.log(1. + np.exp(-scores))
    
    # Simple check to prevent overflow
    mask = np.isinf(logexp)
    logexp[mask] = -scores[mask]
    
    lp = np.sum((indicator-1)*scores - logexp)
    return lp

### Checkpoint

In [21]:
dummy_feature_matrix = np.array([[1.,2.,3.], [1.,-1.,-1]])
dummy_coefficients = np.array([1., 3., -1.])
dummy_sentiment = np.array([-1, 1])

correct_indicators  = np.array( [ -1==+1,                                       1==+1 ] )
correct_scores      = np.array( [ 1.*1. + 2.*3. + 3.*(-1.),                     1.*1. + (-1.)*3. + (-1.)*(-1.) ] )
correct_first_term  = np.array( [ (correct_indicators[0]-1)*correct_scores[0],  (correct_indicators[1]-1)*correct_scores[1] ] )
correct_second_term = np.array( [ np.log(1. + np.exp(-correct_scores[0])),      np.log(1. + np.exp(-correct_scores[1])) ] )

correct_ll          =      sum( [ correct_first_term[0]-correct_second_term[0], correct_first_term[1]-correct_second_term[1] ] ) 

print ('The following outputs must match ')
print ('------------------------------------------------')
print ('correct_log_likelihood           =', correct_ll)
print ('output of compute_log_likelihood =', compute_log_likelihood(dummy_feature_matrix, dummy_sentiment, dummy_coefficients))

The following outputs must match 
------------------------------------------------
correct_log_likelihood           = -5.33141161544
output of compute_log_likelihood = -5.33141161544


## Taking gradient steps

In [22]:
from math import sqrt

def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    coefficients = np.array(initial_coefficients) # make sure it's a numpy array
    for itr in range(0, max_iter):

        # Predict P(y_i = +1|x_i,w) using your predict_probability() function
        # YOUR CODE HERE
        predictions = predict_probability(feature_matrix, initial_coefficients)
        
        # Compute indicator value for (y_i = +1)
        indicator = (sentiment==+1)
        
        # Compute the errors as indicator - predictions
        errors = indicator - predictions
        for j in range(0, len(coefficients)): # loop over each coefficient
            
            # Recall that feature_matrix[:,j] is the feature column associated with coefficients[j].
            # Compute the derivative for coefficients[j]. Save it in a variable called derivative
            # YOUR CODE HERE
            derivative = sum(feature_matrix[:,j]*errors)
            
            # add the step size times the derivative to the current coefficient
            ## YOUR CODE HERE
            coefficients[j] += step_size*derivative
        
        # Checking whether log likelihood is increasing
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
        or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
            print ('iteration %*d: log likelihood of observed labels = %.8f' % \
                (int(np.ceil(np.log10(max_iter))), itr, lp))
    return coefficients

In [23]:
coefficients = logistic_regression(feature_matrix, sentiment, initial_coefficients=np.zeros(194),
                                   step_size=1e-7, max_iter=301)

iteration   0: log likelihood of observed labels = -36780.91768478
iteration   1: log likelihood of observed labels = -36775.13127954
iteration   2: log likelihood of observed labels = -36769.34795095
iteration   3: log likelihood of observed labels = -36763.56769899
iteration   4: log likelihood of observed labels = -36757.79052366
iteration   5: log likelihood of observed labels = -36752.01642492
iteration   6: log likelihood of observed labels = -36746.24540276
iteration   7: log likelihood of observed labels = -36740.47745714
iteration   8: log likelihood of observed labels = -36734.71258803
iteration   9: log likelihood of observed labels = -36728.95079539
iteration  10: log likelihood of observed labels = -36723.19207918
iteration  11: log likelihood of observed labels = -36717.43643934
iteration  12: log likelihood of observed labels = -36711.68387583
iteration  13: log likelihood of observed labels = -36705.93438858
iteration  14: log likelihood of observed labels = -36700.1879

# Predicting sentiments

In [24]:
# Compute the scores as a dot product between feature_matrix and coefficients.
scores = np.dot(feature_matrix, coefficients)

In [25]:
products['preds'] = pd.Series(scores).apply(lambda scores: +1 if scores > 0 else -1)

In [26]:
products.groupby(by=('preds')).count()

Unnamed: 0_level_0,name,review,rating,sentiment,review_clean,baby,one,great,love,use,...,completely,wish,buying,babies,won,tub,almost,either,contains_perfect,intercept
preds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-1,31674,31724,31724,31724,31724,31724,31724,31724,31724,31724,...,31724,31724,31724,31724,31724,31724,31724,31724,31724,31724
1,21308,21348,21348,21348,21348,21348,21348,21348,21348,21348,...,21348,21348,21348,21348,21348,21348,21348,21348,21348,21348


In [27]:
products.groupby(by=('preds', 'sentiment')).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,name,review,rating,review_clean,baby,one,great,love,use,would,...,completely,wish,buying,babies,won,tub,almost,either,contains_perfect,intercept
preds,sentiment,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
-1,-1,22206,22235,22235,22235,22235,22235,22235,22235,22235,22235,...,22235,22235,22235,22235,22235,22235,22235,22235,22235,22235
-1,1,9468,9489,9489,9489,9489,9489,9489,9489,9489,9489,...,9489,9489,9489,9489,9489,9489,9489,9489,9489,9489
1,-1,4255,4258,4258,4258,4258,4258,4258,4258,4258,4258,...,4258,4258,4258,4258,4258,4258,4258,4258,4258,4258
1,1,17053,17090,17090,17090,17090,17090,17090,17090,17090,17090,...,17090,17090,17090,17090,17090,17090,17090,17090,17090,17090


## Measuring accuracy

In [28]:
num_mistakes = len(products[products['sentiment'] != products['preds']]) # YOUR CODE HERE
accuracy = 1 - num_mistakes/len(products) # YOUR CODE HERE
print ("-----------------------------------------------------")
print ('# Reviews   correctly classified =', len(products) - num_mistakes)
print ('# Reviews incorrectly classified =', num_mistakes)
print ('# Reviews total                  =', len(products))
print ("-----------------------------------------------------")
print ('Accuracy = %.2f' % accuracy)

-----------------------------------------------------
# Reviews   correctly classified = 39325
# Reviews incorrectly classified = 13747
# Reviews total                  = 53072
-----------------------------------------------------
Accuracy = 0.74


## Which words contribute most to positive & negative sentiments

In [29]:
coefficients = list(coefficients[1:]) # exclude intercept
word_coefficient_tuples = [(word, coefficient) for word, coefficient in zip(important_words, coefficients)]
word_coefficient_tuples = sorted(word_coefficient_tuples, key=lambda x:x[1], reverse=True)

In [30]:
word_coefficient_tuples[0:10]

[('great', 0.069275149999999647),
 ('love', 0.069004250000000072),
 ('easy', 0.06748420000000005),
 ('little', 0.046790450000000261),
 ('loves', 0.046414200000000044),
 ('well', 0.030355849999999917),
 ('perfect', 0.030355849999999917),
 ('old', 0.020272350000000126),
 ('nice', 0.018481399999999922),
 ('soft', 0.017954649999999985)]

In [31]:
word_coefficient_tuples[-10:]

[('return', -0.027977950000000137),
 ('monitor', -0.028324099999999935),
 ('disappointed', -0.030054849999999821),
 ('back', -0.031589949999999895),
 ('even', -0.03347119999999984),
 ('get', -0.03396785000000007),
 ('work', -0.036195249999999971),
 ('money', -0.04141760000000029),
 ('product', -0.047452650000000124),
 ('would', -0.063811999999999619)]