# Week 2_1: Implementing logistic regression from scratch

The goal of this notebook is to implement your own logistic regression classifier. You will:

 * Extract features from Amazon product reviews.
 * Convert an SFrame into a NumPy array.
 * Implement the link function for logistic regression.
 * Write a function to compute the derivative of the log likelihood function with respect to a single coefficient.
 * Implement gradient ascent.
 * Given a set of coefficients, predict sentiments.
 * Compute classification accuracy for the logistic regression model.

In [1]:
import sframe
import numpy as np
import json

In [2]:
# Load the review data from the local 'amazon_baby_subset.gl/' directory into an SFrame.
products = sframe.SFrame('amazon_baby_subset.gl/')

[INFO] SFrame v1.8.3 started. Logging /tmp/sframe_server_1457699759.log


In [3]:
# Show the 'name' column for the first 10 rows as a quick sanity check.
products.head(10)['name']

dtype: str
Rows: 10
["Stop Pacifier Sucking without tears with Thumbuddy To Love's Binky Fairy Puppet and Adorable Book", "Nature's Lullabies Second Year Sticker Calendar", "Nature's Lullabies Second Year Sticker Calendar", 'Lamaze Peekaboo, I Love You', "SoftPlay Peek-A-Boo Where's Elmo A Children's Book", 'Our Baby Girl Memory Book', 'Hunnt&reg; Falling Flowers and Birds Kids Nursery Home Decor Vinyl Mural Art Wall Paper Stickers', 'Blessed By Pope Benedict XVI Divine Mercy Full Color Medal', 'Cloth Diaper Pins Stainless Steel Traditional Safety Pin (Black)', 'Cloth Diaper Pins Stainless Steel Traditional Safety Pin (Black)']

In [4]:
# Class balance: count the rows whose 'sentiment' is +1 and those whose 'sentiment' is -1.
print '# of positive reviews =', len(products[products['sentiment']==1])
print '# of negative reviews =', len(products[products['sentiment']==-1])

# of positive reviews = 26579
# of negative reviews = 26493


# Apply text cleaning on the review data

In [5]:
with open('important_words.json', 'r') as f:
    # Parse the JSON word list and coerce every entry to `str` in one pass,
    # so the words can be used directly as column names later on.
    important_words = [str(s) for s in json.load(f)]

* remove punctuation

In [6]:
def remove_punctuation(text):
    """Return `text` with every character of `string.punctuation` removed.

    Accepts Python 2 byte strings as well as unicode / Python 3 strings.
    The latter only support the one-argument form of `translate`, so the
    two-argument call used for byte strings would raise TypeError on them.
    """
    import string
    try:
        # Python 2 `str`: delete characters via the (table, deletechars) form.
        return text.translate(None, string.punctuation)
    except TypeError:
        # unicode / Python 3 `str`: translate takes a single mapping from
        # code point to replacement; mapping a code point to None deletes it.
        return text.translate(dict.fromkeys(map(ord, string.punctuation)))

In [7]:
# Store a punctuation-free copy of each review in a new 'review_clean' column.
products['review_clean'] = products['review'].apply(remove_punctuation)

* count words in the reviews

In [8]:
# Add one column per important word, holding how many times that word occurs
# among the whitespace-separated tokens of the cleaned review.
# Matching is exact and case-sensitive (list.count on the split tokens).
for word in important_words:
    products[word] = products['review_clean'].apply(lambda s : s.split().count(word))

In [9]:
# Inspect the per-review occurrence counts for the word 'perfect'.
products['perfect']

dtype: int
Rows: 53072
[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... ]

In [10]:
# Binary flag: 1 when the review contains 'perfect' at least once, otherwise 0.
products['contains_perfect'] = products['perfect'].apply(lambda x : 1 if x >=1 else 0)

In [11]:
# Summing the 0/1 flags gives the number of reviews that mention 'perfect'.
print "# of reviews containing 'perfect':", products['contains_perfect'].sum()

# of reviews containing 'perfect': 2955


# Convert SFrame to NumPy array

In [12]:
def get_numpy_data(data_sframe, features, label):
    """Extract a feature matrix and a label array from `data_sframe`.

    A constant column named 'intercept' (value 1) is written into
    `data_sframe` itself and placed first in the feature matrix, followed
    by the columns listed in `features` (the list passed in is not modified).

    Returns a tuple (feature_matrix, label_array), both obtained through the
    frame's `to_numpy()` conversion.
    """
    # Side effect on the caller's frame: add the constant intercept column.
    data_sframe['intercept'] = 1
    selected_columns = ['intercept'] + features
    design_matrix = data_sframe[selected_columns].to_numpy()
    labels = data_sframe[label].to_numpy()
    return (design_matrix, labels)

In [13]:
# Build the NumPy inputs: an intercept column plus one count column per important word,
# and the 'sentiment' labels.
feature_matrix, sentiment = get_numpy_data(products, important_words, 'sentiment') 

In [14]:
# The column count includes the intercept column added by get_numpy_data.
print '# of features in the feature_matrix:', feature_matrix.shape[1]

# of features in the feature_matrix: 194


In [15]:
# Scratch check: take the first row of the feature matrix as a sample feature vector.
x = feature_matrix[0]

In [16]:
# Scratch check: transpose the sample row.
# NOTE(review): x is a single row, so this presumably leaves its shape unchanged — confirm.
xx = x.transpose()

In [17]:
# Display the shape of the transposed sample row.
xx.shape

(194,)

# Estimating conditional probability with link function

In [18]:
def predict_probability(feature_matrix, coefficients):
    """Estimate P(y = +1 | x, w) for every row of `feature_matrix`.

    Each row's score is its dot product with `coefficients`; the score is
    mapped to a probability with the logistic (sigmoid) link
    1 / (1 + exp(-score)).

    Returns a list with one probability per row, in row order.
    """
    scores = np.dot(feature_matrix, coefficients)
    # Apply the link function to the whole score array in one vectorised
    # call instead of looping over the rows in Python.
    probabilities = 1. / (1. + np.exp(-scores))
    # Callers have always received a list, so keep that return type.
    return list(probabilities)

# Compute derivative of log likelihood with respect to a single coefficient

In [20]:
def feature_derivative(errors, feature):
    """Derivative of the log likelihood with respect to one coefficient.

    `errors` holds (indicator - predicted probability) per data point and
    `feature` is the matching feature column; the derivative is the sum of
    their element-wise products.
    """
    weighted_total = np.dot(errors, feature)
    return np.sum(weighted_total)

In [21]:
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    """Log likelihood of the observed labels under the logistic model.

    Computes sum_i [ (1[y_i = +1] - 1) * score_i - log(1 + exp(-score_i)) ]
    where score_i is the dot product of row i of `feature_matrix` with
    `coefficients`, and `sentiment` holds the labels (+1 marks a positive).

    Returns the log likelihood as a scalar.
    """
    indicator = (sentiment==+1)
    scores = np.dot(feature_matrix, coefficients)

    # log(1 + exp(-score)) == logaddexp(0, -score). logaddexp evaluates this
    # without forming exp(-score), so very negative scores neither overflow
    # nor need to be patched up afterwards.
    logexp = np.logaddexp(0., -scores)

    lp = np.sum((indicator-1)*scores - logexp)
    return lp

# Taking gradient steps

In [53]:
from math import sqrt

def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    """Fit logistic-regression coefficients by gradient ascent.

    Parameters:
        feature_matrix: 2-D array, one row per data point, one column per coefficient.
        sentiment: array of labels; +1 marks a positive example.
        initial_coefficients: starting coefficient values (not modified).
        step_size: multiplier applied to each derivative when updating.
        max_iter: number of gradient-ascent iterations to run.

    Returns the coefficient array after `max_iter` iterations. The log
    likelihood is printed on a thinning schedule so progress can be checked.
    """
    # Work on a float copy. Without the explicit dtype, integer starting values
    # would give an integer array and every fractional update below would be
    # silently truncated.
    coefficients = np.array(initial_coefficients, dtype=float)

    # Indicator for (y_i = +1); the labels never change, so compute it once.
    indicator = (sentiment==+1)

    for itr in range(max_iter):
        # Predict P(y_i = +1|x_i,w) with the current coefficients.
        predictions = predict_probability(feature_matrix, coefficients)

        # Errors are indicator - predictions, fixed for this whole iteration.
        errors = indicator - predictions

        for j in range(len(coefficients)): # loop over each coefficient
            # feature_matrix[:,j] is the feature column associated with coefficients[j].
            derivative = feature_derivative(errors, feature_matrix[:,j])
            # Gradient ascent: move the coefficient along its derivative.
            coefficients[j] += step_size*derivative

        # Report the log likelihood often at first, then progressively less often.
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
        or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
            # The iteration number is padded to the digit count of max_iter.
            print('iteration %*d: log likelihood of observed labels = %.8f' %
                  (int(np.ceil(np.log10(max_iter))), itr, lp))
    return coefficients

In [54]:
# Train from an all-zero starting point. The start vector is sized from the
# feature matrix so it always has one entry per column (intercept included).
coefficients = logistic_regression(feature_matrix, sentiment,
                                   initial_coefficients=np.zeros(feature_matrix.shape[1]),
                                   step_size=1e-7, max_iter=301)

iteration   0: log likelihood of observed labels = -36780.91768478
iteration   1: log likelihood of observed labels = -36775.13434712
iteration   2: log likelihood of observed labels = -36769.35713564
iteration   3: log likelihood of observed labels = -36763.58603240
iteration   4: log likelihood of observed labels = -36757.82101962
iteration   5: log likelihood of observed labels = -36752.06207964
iteration   6: log likelihood of observed labels = -36746.30919497
iteration   7: log likelihood of observed labels = -36740.56234821
iteration   8: log likelihood of observed labels = -36734.82152213
iteration   9: log likelihood of observed labels = -36729.08669961
iteration  10: log likelihood of observed labels = -36723.35786366
iteration  11: log likelihood of observed labels = -36717.63499744
iteration  12: log likelihood of observed labels = -36711.91808422
iteration  13: log likelihood of observed labels = -36706.20710739
iteration  14: log likelihood of observed labels = -36700.5020

# Predicting sentiments

In [55]:
# Score every review: dot product of its feature row with the learned coefficients.
scores = np.dot(feature_matrix, coefficients)

In [56]:
# A strictly positive score is treated as a positive prediction.
print '# of positive_reviews:', np.sum(scores>0)

# of positive_reviews: 25126


# Measuring accuracy

In [68]:
# Boolean prediction per review: True where the score is strictly positive.
predicted_sentiment = scores>0
print predicted_sentiment

[ True False  True ..., False  True False]


In [69]:
# Put the true labels in the same boolean form: True where the label is greater than 0.
real_sentiment = sentiment>0
print real_sentiment

[ True  True  True ..., False False False]


In [71]:
# Number of reviews where the predicted and true boolean labels agree.
np.sum(predicted_sentiment == real_sentiment)

39903

In [72]:
num_mistakes = len(products) - np.sum(predicted_sentiment == real_sentiment)
accuracy = np.sum(predicted_sentiment == real_sentiment)/np.float(len(products))
print "-----------------------------------------------------"
print '# Reviews   correctly classified =', len(products) - num_mistakes
print '# Reviews incorrectly classified =', num_mistakes
print '# Reviews total                  =', len(products)
print "-----------------------------------------------------"
print 'Accuracy = %.2f' % accuracy

-----------------------------------------------------
# Reviews   correctly classified = 39903
# Reviews incorrectly classified = 13169
# Reviews total                  = 53072
-----------------------------------------------------
Accuracy = 0.75


# Word contributions

In [59]:
# Drop the first entry (the intercept) so the remaining values line up with important_words.
# NOTE(review): this rebinds `coefficients`, so running the cell a second time strips
# one more leading value — re-run the training cell before repeating it.
coefficients = list(coefficients[1:]) # remove the intercept

In [60]:
# Pair every word with its learned weight and order the pairs from the most
# positive weight down to the most negative one.
word_coefficient_tuples = sorted(zip(important_words, coefficients),
                                 key=lambda pair: pair[1], reverse=True)

In [61]:
# The list is sorted by descending weight, so the first 10 entries are the most positive words.
word_coefficient_tuples[:10]

[('great', 0.066546084170457695),
 ('love', 0.065890762922123244),
 ('easy', 0.064794586802578394),
 ('little', 0.045435626308421372),
 ('loves', 0.044976401394906038),
 ('well', 0.03013500109210707),
 ('perfect', 0.029739937104968459),
 ('old', 0.020077541034775381),
 ('nice', 0.018408707995268992),
 ('daughter', 0.01770319990570169)]

In [62]:
# The last 10 entries of the descending sort are the words with the most negative weights.
word_coefficient_tuples[-10:]

[('monitor', -0.02448210054589172),
 ('return', -0.026592778462247283),
 ('back', -0.027742697230661327),
 ('get', -0.028711552980192581),
 ('disappointed', -0.028978976142317068),
 ('even', -0.030051249236035804),
 ('work', -0.03306951529475273),
 ('money', -0.038982037286487116),
 ('product', -0.041511033392108897),
 ('would', -0.053860148445203128)]