# Logistic Regression - Raw Implementation

## Dataset: Amazon Reviews

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

### Importing the data

In [2]:
# Dataset
products = pd.read_csv('amazon_baby_subset.csv')

# Words that will be used to analyze the reviews
import json
with open("important_words.json") as json_file:
    important_words = json.load(json_file)

### Preprocessing

In [3]:
products = products.fillna({'review':''})  # fill in N/A's in the review column

import string

def remove_punctuation(text):
    remove_punct_map = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(remove_punct_map)

products['review_clean'] = products['review'].apply(remove_punctuation)

for word in important_words:
    products[word] = products['review_clean'].apply(lambda s : s.split().count(word))

with open("module-4-assignment-train-idx.json") as json_file:
    train_idx = json.load(json_file)
    
with open("module-4-assignment-validation-idx.json") as json_file:
    val_idx = json.load(json_file)

train_data = products.iloc[train_idx]
validation_data = products.iloc[val_idx]

train_data.head()

Unnamed: 0,name,review,rating,sentiment,review_clean,baby,one,great,love,use,...,seems,picture,completely,wish,buying,babies,won,tub,almost,either
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1,All of my kids have cried nonstop when I tried...,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1,We wanted to get something to keep track of ou...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1,One of babys first and favorite books and it i...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1,Very cute interactive book My son loves this b...,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
5,Our Baby Girl Memory Book,"Beautiful book, I love it to record cherished ...",5,1,Beautiful book I love it to record cherished t...,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
print(train_data.shape)
print(validation_data.shape)

(42361, 198)
(10711, 198)


In [5]:
def get_numpy_data(dataframe, features, label):
    dataframe['constant'] = 1
    features = ['constant'] + features
    features_frame = dataframe[features]
    feature_matrix = features_frame.as_matrix()  # old, use .values instead
    label_sarray = dataframe[label]
    label_array = label_sarray.as_matrix()  #.values instead
    return(feature_matrix, label_array)

feature_matrix_train, sentiment_train = get_numpy_data(train_data, important_words, 'sentiment')
feature_matrix_valid, sentiment_valid = get_numpy_data(validation_data, important_words, 'sentiment') 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
  """
  import sys


In [13]:
## train set = 194 unique words, 42361 examples
print(feature_matrix_train.shape)
print(feature_matrix_valid.shape)
print(feature_matrix_train[0].shape)
print(feature_matrix_train[0])

(42361, 194)
(10711, 194)
(194,)
[1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0]


## Model

In [18]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def predict_probability(feature_matrix, coefficients):
    # Take dot product of feature_matrix and coefficients  
    # YOUR CODE HERE
    score = np.dot(feature_matrix, coefficients)
    
    # Compute P(y_i = +1 | x_i, w) using the link function
    # YOUR CODE HERE
    predictions = sigmoid(score)
    
    # return predictions
    return predictions



def feature_derivative_with_L2(errors, feature, coefficient, l2_penalty, feature_is_constant): 
    
    # Compute the dot product of errors and feature
    ## YOUR CODE HERE
    derivative = np.dot(errors, feature)

    # add L2 penalty term for any feature that isn't the intercept.
    if not feature_is_constant: 
        ## YOUR CODE HERE
        derivative = derivative - 2 * l2_penalty * coefficient
        
    return derivative

def compute_log_likelihood_with_L2(feature_matrix, sentiment, coefficients, l2_penalty):
    indicator = (sentiment==+1)
    scores = np.dot(feature_matrix, coefficients)
    
    lp = np.sum((indicator-1)*scores - np.log(1. + np.exp(-scores))) - l2_penalty*np.sum(coefficients[1:]**2)
    
    return lp


def logistic_regression_with_L2(feature_matrix, sentiment, initial_coefficients, step_size, l2_penalty, max_iter):
    coefficients = np.array(initial_coefficients) # make sure it's a numpy array
    for itr in range(max_iter):
        # Predict P(y_i = +1|x_i,w) using your predict_probability() function
        ## YOUR CODE HERE
        predictions = predict_probability(feature_matrix, coefficients)
        
        # Compute indicator value for (y_i = +1)
        indicator = (sentiment==+1)
        
        # Compute the errors as indicator - predictions
        errors = indicator - predictions
        for j in range(len(coefficients)): # loop over each coefficient
            is_intercept = (j == 0)
            # Recall that feature_matrix[:,j] is the feature column associated with coefficients[j].
            # Compute the derivative for coefficients[j]. Save it in a variable called derivative
            ## YOUR CODE HERE
            derivative = feature_derivative_with_L2(errors, feature_matrix[:,j], coefficients[j], l2_penalty, is_intercept)
            
            # add the step size times the derivative to the current coefficient
            ## YOUR CODE HERE
            coefficients[j] = coefficients[j] + step_size * derivative
        
        # Checking whether log likelihood is increasing
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
        or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood_with_L2(feature_matrix, sentiment, coefficients, l2_penalty)
            print('iteration %*d: log likelihood of observed labels = %.8f' % \
                (int(np.ceil(np.log10(max_iter))), itr, lp))
    return coefficients 

### Training

In [19]:
penalties = [0, 4, 10, 1e2, 1e3,1e5]

l_of_coefs = []
for penalty in penalties:
    coefs = logistic_regression_with_L2(feature_matrix_train, sentiment_train, np.zeros(194), step_size = 5e-6,l2_penalty=penalty, max_iter=501)
    l_of_coefs.append(coefs)
l_of_coefs

iteration   0: log likelihood of observed labels = -29179.39138303
iteration   1: log likelihood of observed labels = -29003.71259047
iteration   2: log likelihood of observed labels = -28834.66187288
iteration   3: log likelihood of observed labels = -28671.70781507
iteration   4: log likelihood of observed labels = -28514.43078198
iteration   5: log likelihood of observed labels = -28362.48344665
iteration   6: log likelihood of observed labels = -28215.56713122
iteration   7: log likelihood of observed labels = -28073.41743783
iteration   8: log likelihood of observed labels = -27935.79536396
iteration   9: log likelihood of observed labels = -27802.48168669
iteration  10: log likelihood of observed labels = -27673.27331484
iteration  11: log likelihood of observed labels = -27547.98083656
iteration  12: log likelihood of observed labels = -27426.42679977
iteration  13: log likelihood of observed labels = -27308.44444728
iteration  14: log likelihood of observed labels = -27193.8767

iteration   7: log likelihood of observed labels = -28162.56976044
iteration   8: log likelihood of observed labels = -28046.29387744
iteration   9: log likelihood of observed labels = -27935.93902900
iteration  10: log likelihood of observed labels = -27831.15045502
iteration  11: log likelihood of observed labels = -27731.59955260
iteration  12: log likelihood of observed labels = -27636.98108219
iteration  13: log likelihood of observed labels = -27547.01092670
iteration  14: log likelihood of observed labels = -27461.42422295
iteration  15: log likelihood of observed labels = -27379.97375625
iteration  20: log likelihood of observed labels = -27027.18208317
iteration  30: log likelihood of observed labels = -26527.22737267
iteration  40: log likelihood of observed labels = -26206.59048765
iteration  50: log likelihood of observed labels = -25995.96903148
iteration  60: log likelihood of observed labels = -25854.95710284
iteration  70: log likelihood of observed labels = -25759.0810

[array([-6.37421352e-02,  7.40730059e-02,  1.27525058e-02,  8.01624990e-01,
         1.05855398e+00, -1.04152191e-04, -2.87021444e-01, -3.38447399e-03,
         9.84558820e-01,  5.24419456e-01, -8.69675407e-02,  2.08912434e-01,
         4.53866487e-01, -1.96835211e-01,  1.58163325e-01, -1.79058177e-02,
         1.28396325e-01, -7.24293854e-02, -1.51817046e-01, -2.63330304e-01,
         1.56507228e-01,  2.63417760e-01, -1.32474753e-02,  1.05248405e+00,
        -3.75326583e-02, -3.29713873e-04, -6.79948371e-02,  1.93363694e-01,
         1.88508247e-01, -2.68954361e-01,  9.62841996e-02,  3.58309842e-01,
        -4.63096879e-02, -3.68678195e-01,  8.35693208e-01,  4.29393687e-01,
        -6.53274724e-03, -1.18953292e-01,  4.52597544e-02, -1.05433750e-01,
        -1.40179533e-01,  1.15329513e-01,  4.70226839e-02,  2.77177536e-02,
        -1.93377062e-01,  2.65797426e-01,  7.83957751e-02, -1.76600523e-01,
         3.61782536e-01,  1.02765639e-01, -2.71592217e-01,  2.60319903e-01,
         3.3

In [20]:
final_l = []
for coefficients in l_of_coefs:
    coefficients = list(coefs[1:]) # exclude intercept
    word_coefficient_tuples = [(word, coefficient) for word, coefficient in zip(important_words, coefficients)]
    word_coefficient_tuples = sorted(word_coefficient_tuples, key=lambda x:x[1], reverse=True)
    final_l.append(word_coefficient_tuples)

positive_words = final_l[0][:5]
negative_words = final_l[0][-5:]

table = pd.DataFrame()

for i in range(193):
    table['intercept'] = [l_of_coefs[j][0] for j in range(6)]
    word = final_l[0][i][0]
    col = [final_l[j][0][1] for j in range(6)]
    table[word] = col
    
table['l2_penalty'] = penalties
table.set_index('l2_penalty', inplace=True)

table

Unnamed: 0_level_0,intercept,love,great,easy,loves,little,perfect,well,old,nice,...,monitor,return,back,disappointed,even,get,work,money,product,would
l2_penalty,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,-0.063742,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,...,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042
4.0,-0.063143,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,...,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042
10.0,-0.062256,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,...,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042
100.0,-0.050438,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,...,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042
1000.0,5.4e-05,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,...,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042
100000.0,0.011362,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,...,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042,0.009042
