In [1]:
import numpy as np
import pandas as pd
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.externals import joblib
import math

## Load and process data

In [2]:
products = pd.read_csv('amazon_baby_subset.csv')

In [3]:
def remove_punctuation(text):
    translator = text.maketrans('','',string.punctuation)
    return text.translate(translator)

In [4]:
products.review.fillna('',inplace=True)

In [5]:
products['review_clean'] = products['review'].apply(remove_punctuation)

In [6]:
important_words = pd.read_json('important_words.json')[0]
important_words = list(important_words)

In [7]:
for word in important_words:
    products[word] = products['review_clean'].apply(lambda s : s.split().count(word))

## Train-Validation Split

In [8]:
train_indic = open('module-4-assignment-train-idx.json','r')
train_indic=list(train_indic)[0]
train_indic=train_indic.split(',')
train_indic[0]=' 0'
train_indic[-1]=' 53070'

In [9]:
train_indic[0:10]

[' 0', ' 1', ' 3', ' 4', ' 5', ' 6', ' 7', ' 8', ' 10', ' 11']

In [10]:
val_indic = open('module-4-assignment-validation-idx.json','r')
val_indic = list(val_indic)[0]
val_indic = val_indic.split(',')
val_indic[0] = ' 2'
val_indic[-1]=' 53071'

In [11]:
train_list=[]
for line in train_indic:
    t = [int(x.strip()) for x in line.split(',')]
    train_list.append(t[0])
    
train_data = products.iloc[train_list]

In [12]:
val_list=[]
for line in val_indic:
    t = [int(x.strip()) for x in line.split(',')]
    val_list.append(t[0])
    
val_data = products.iloc[val_list]

## Convert data frame to multi-dimensional array

In [13]:
def get_numpy_data(df, features, label):
    df['constant'] = 1
    features = ['constant'] + features
    features_frame = df[features]
    feature_matrix = features_frame.as_matrix()
    label_sarray = df[label]
    label_array = label_sarray.as_matrix()
    return(feature_matrix, label_array)

In [14]:
feature_matrix_train, sentiment_train = get_numpy_data(train_data, important_words, 'sentiment')
feature_matrix_valid, sentiment_valid = get_numpy_data(val_data, important_words, 'sentiment') 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


## Building on logistic regression with no L2 penalty assignment

In [15]:

def predict_probability(feature_matrix, coefficients):
    # Take dot product of feature_matrix and coefficients  
    # YOUR CODE HERE
    score = np.dot(feature_matrix, coefficients)

    # Compute P(y_i = +1 | x_i, w) using the link function
    # YOUR CODE HERE
    predictions = np.exp(score)
    predictions = predictions+1
    predictions = 1/predictions
    
    # return predictions
    return score

## Adding L2 penalty

In [47]:
def feature_derivative_with_L2(errors, feature, coefficient, l2_penalty, feature_is_constant): 
    
    # Compute the dot product of errors and feature
    ## YOUR CODE HERE
    derivative = np.dot(errors,feature)

    # add L2 penalty term for any feature that isn't the intercept.
    if not feature_is_constant: 
        ## YOUR CODE HERE
        derivative = derivative - (2 * l2_penalty * coefficient)
        
    return derivative

In [48]:
print('The intercept, which is the only constant term, was not regularized')

The intercept, which is the only constant term, was not regularized


In [49]:
def compute_log_likelihood_with_L2(feature_matrix, sentiment, coefficients, l2_penalty):
    indicator = (sentiment==+1)
    scores = np.dot(feature_matrix, coefficients)
    
    lp = np.sum((indicator-1)*scores - np.log(1. + np.exp(-scores))) - l2_penalty*np.sum(coefficients[1:]**2)
    
    return lp

In [50]:
print('the term with L2 regulatization decreases ll(w)')

the term with L2 regulatization decreases ll(w)


In [51]:
def logistic_regression_with_L2(feature_matrix, sentiment, initial_coefficients, step_size, l2_penalty, max_iter):
    coefficients = np.array(initial_coefficients) # make sure it's a numpy array
    for itr in range(max_iter):
        # Predict P(y_i = +1|x_i,w) using your predict_probability() function
        ## YOUR CODE HERE
        predictions = predict_probability(feature_matrix,coefficients)
        
        # Compute indicator value for (y_i = +1)
        indicator = (sentiment==+1)
        
        # Compute the errors as indicator - predictions
        errors = indicator - predictions
        for j in range(len(coefficients)): # loop over each coefficient
            is_intercept = (j == 0)
            # Recall that feature_matrix[:,j] is the feature column associated with coefficients[j].
            # Compute the derivative for coefficients[j]. Save it in a variable called derivative
            ## YOUR CODE HERE
            derivative = feature_derivative_with_L2(errors, 
                                                    feature_matrix[:,j], 
                                                    coefficients[j], 
                                                    l2_penalty, 
                                                    is_intercept)
            
            # add the step size times the derivative to the current coefficient
            ## YOUR CODE HERE
            coefficients[j] = (step_size*derivative)
        
        # Checking whether log likelihood is increasing
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
        or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood_with_L2(feature_matrix, sentiment, coefficients, l2_penalty)
            print('iteration %*d: log likelihood of observed labels = %.8f' % \
                (int(np.ceil(np.log10(max_iter))), itr, lp))
    return coefficients

In [52]:
feature_matrix = feature_matrix_train
sentiment = sentiment_train
initial_coefficients = np.zeros(194)
step_size = 5e-6
max_iter = 501

In [53]:
coefficients_0_penalty = logistic_regression_with_L2(feature_matrix=feature_matrix_train,
                                                     sentiment=sentiment_train,
                                                     initial_coefficients=np.zeros(194),
                                                     step_size=5e-6,
                                                     max_iter=501,
                                                     l2_penalty=0)

iteration   0: log likelihood of observed labels = -29929.17761132
iteration   1: log likelihood of observed labels = -29188.11956629
iteration   2: log likelihood of observed labels = -29852.05592077
iteration   3: log likelihood of observed labels = -29203.52469145
iteration   4: log likelihood of observed labels = -29780.66513825
iteration   5: log likelihood of observed labels = -29221.12346810
iteration   6: log likelihood of observed labels = -29722.05541657
iteration   7: log likelihood of observed labels = -29239.22037983
iteration   8: log likelihood of observed labels = -29673.66339460
iteration   9: log likelihood of observed labels = -29257.03052540
iteration  10: log likelihood of observed labels = -29633.54031020
iteration  11: log likelihood of observed labels = -29274.03445118
iteration  12: log likelihood of observed labels = -29600.14555481
iteration  13: log likelihood of observed labels = -29289.92503339
iteration  14: log likelihood of observed labels = -29572.2497

In [54]:
coefficients_4_penalty = logistic_regression_with_L2(feature_matrix=feature_matrix_train,
                                                     sentiment=sentiment_train,
                                                     initial_coefficients=np.zeros(194),
                                                     step_size=5e-6,
                                                     max_iter=501,
                                                     l2_penalty=4)

iteration   0: log likelihood of observed labels = -29929.26094613
iteration   1: log likelihood of observed labels = -29188.12483091
iteration   2: log likelihood of observed labels = -29852.17350117
iteration   3: log likelihood of observed labels = -29203.51926031
iteration   4: log likelihood of observed labels = -29780.79845510
iteration   5: log likelihood of observed labels = -29221.10810160
iteration   6: log likelihood of observed labels = -29722.19626108
iteration   7: log likelihood of observed labels = -29239.19628563
iteration   8: log likelihood of observed labels = -29673.80620094
iteration   9: log likelihood of observed labels = -29256.99945120
iteration  10: log likelihood of observed labels = -29633.68135364
iteration  11: log likelihood of observed labels = -29273.99829201
iteration  12: log likelihood of observed labels = -29600.28241872
iteration  13: log likelihood of observed labels = -29289.88560812
iteration  14: log likelihood of observed labels = -29572.3809

In [56]:
coefficients_10_penalty = logistic_regression_with_L2(feature_matrix=feature_matrix_train,
                                                     sentiment=sentiment_train,
                                                     initial_coefficients=np.zeros(194),
                                                     step_size=5e-6,
                                                     max_iter=501,
                                                     l2_penalty=10)

iteration   0: log likelihood of observed labels = -29929.38594835
iteration   1: log likelihood of observed labels = -29188.13272931
iteration   2: log likelihood of observed labels = -29852.34989537
iteration   3: log likelihood of observed labels = -29203.51111524
iteration   4: log likelihood of observed labels = -29780.99848184
iteration   5: log likelihood of observed labels = -29221.08505051
iteration   6: log likelihood of observed labels = -29722.40760506
iteration   7: log likelihood of observed labels = -29239.16013628
iteration   8: log likelihood of observed labels = -29674.02050972
iteration   9: log likelihood of observed labels = -29256.95282221
iteration  10: log likelihood of observed labels = -29633.89303580
iteration  11: log likelihood of observed labels = -29273.94402408
iteration  12: log likelihood of observed labels = -29600.48784498
iteration  13: log likelihood of observed labels = -29289.82642882
iteration  14: log likelihood of observed labels = -29572.5778

In [57]:
coefficients_1e2_penalty = logistic_regression_with_L2(feature_matrix=feature_matrix_train,
                                                       sentiment=sentiment_train,
                                                       initial_coefficients=np.zeros(194),
                                                       step_size=5e-6,
                                                       max_iter=501,
                                                       l2_penalty=1e2)

iteration   0: log likelihood of observed labels = -29931.26098163
iteration   1: log likelihood of observed labels = -29188.25141894
iteration   2: log likelihood of observed labels = -29854.99921418
iteration   3: log likelihood of observed labels = -29203.38918180
iteration   4: log likelihood of observed labels = -29784.00631000
iteration   5: log likelihood of observed labels = -29220.73910678
iteration   6: log likelihood of observed labels = -29725.58892093
iteration   7: log likelihood of observed labels = -29238.61675684
iteration   8: log likelihood of observed labels = -29677.24949776
iteration   9: log likelihood of observed labels = -29256.25086169
iteration  10: log likelihood of observed labels = -29637.08520057
iteration  11: log likelihood of observed labels = -29273.12582626
iteration  12: log likelihood of observed labels = -29603.58812318
iteration  13: log likelihood of observed labels = -29288.93279352
iteration  14: log likelihood of observed labels = -29575.5519

In [58]:
coefficients_1e3_penalty = logistic_regression_with_L2(feature_matrix=feature_matrix_train,
                                                       sentiment=sentiment_train,
                                                       initial_coefficients=np.zeros(194),
                                                       step_size=5e-6,
                                                       max_iter=501,
                                                       l2_penalty=1e3)

iteration   0: log likelihood of observed labels = -29950.01131442
iteration   1: log likelihood of observed labels = -29189.46219907
iteration   2: log likelihood of observed labels = -29881.84895458
iteration   3: log likelihood of observed labels = -29202.20232037
iteration   4: log likelihood of observed labels = -29814.86961841
iteration   5: log likelihood of observed labels = -29217.27565791
iteration   6: log likelihood of observed labels = -29758.59204146
iteration   7: log likelihood of observed labels = -29233.08582811
iteration   8: log likelihood of observed labels = -29711.08397283
iteration   9: log likelihood of observed labels = -29248.99448874
iteration  10: log likelihood of observed labels = -29670.84363801
iteration  11: log likelihood of observed labels = -29264.53607156
iteration  12: log likelihood of observed labels = -29636.65553552
iteration  13: log likelihood of observed labels = -29279.40169674
iteration  14: log likelihood of observed labels = -29607.5247

In [None]:
coefficients_1e5_penalty = logistic_regression_with_L2(feature_matrix=feature_matrix_train,
                                                     sentiment=sentiment_train,
                                                     initial_coefficients=np.zeros(194),
                                                     step_size=5e-6,
                                                     max_iter=501,
                                                     l2_penalty=1e5)

## Compare Coefficients

In [58]:
table=pd.DataFrame(coeff_list)

In [68]:
coeff_list[0]==coeff_list[5]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [64]:
coeff_list[1]

array([  7.14328461e-02,   1.89516861e-02,   1.76979430e-02,
         2.14748014e-02,   1.71123125e-02,   1.50423422e-02,
         9.67044801e-03,   1.25373996e-02,   1.65720397e-02,
         1.56582526e-02,   8.80568503e-03,   1.31426372e-02,
         1.30160618e-02,   7.80779853e-03,   8.34282162e-03,
         8.68843774e-03,   9.93484119e-03,   7.11301037e-03,
         8.34669847e-03,   7.48246928e-03,   8.24212621e-03,
         9.49902111e-03,   7.39405781e-03,   1.12811543e-02,
         5.00372133e-03,   7.41498223e-03,   7.37194977e-03,
         6.44129177e-03,   6.83237032e-03,   3.52772498e-03,
         6.76589993e-03,   8.19826140e-03,   4.75591706e-03,
         3.58255875e-03,   7.76056886e-03,   6.71809371e-03,
         5.52899564e-03,   4.04905308e-03,   5.15888488e-03,
         4.99594935e-03,   4.81042926e-03,   5.64431261e-03,
         4.20481846e-03,   4.93911716e-03,   4.68359352e-03,
         5.28094755e-03,   4.19739564e-03,   2.78524451e-03,
         6.29470376e-03,

In [61]:
table=pd.DataFrame(coeff_list[0])
table

Unnamed: 0,0
0,0.071433
1,0.018952
2,0.017698
3,0.021475
4,0.017112
5,0.015042
6,0.009670
7,0.012537
8,0.016572
9,0.015658


In [43]:
word_coeff_tuples_lst=[]
for x in range(len(coeff_list)):
    coeffs = list(coeff_list[x][1:]) # exclude intercept
    word_coefficient_tuples = [(word, coefficient) for word, coefficient in zip(important_words, coeff_list[x])]
    word_coefficient_tuples = sorted(word_coefficient_tuples, key=lambda x:x[1], reverse=True)
    word_coeff_tuples_lst.append(word_coefficient_tuples)

In [52]:
positive_words=word_coeff_tuples_lst[0][0:5]

In [53]:
negative_words=sorted(word_coeff_tuples_lst[0], key=lambda x:x[1], reverse=False)[0:5]

In [54]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = 10, 6

def make_coefficient_plot(table, positive_words, negative_words, l2_penalty_list):
    cmap_positive = plt.get_cmap('Reds')
    cmap_negative = plt.get_cmap('Blues')
    
    xx = l2_penalty_list
    plt.plot(xx, [0.]*len(xx), '--', lw=1, color='k')
    
    table_positive_words = table[table['word'].isin(positive_words)]
    table_negative_words = table[table['word'].isin(negative_words)]
    del table_positive_words['word']
    del table_negative_words['word']
    
    for i in xrange(len(positive_words)):
        color = cmap_positive(0.8*((i+1)/(len(positive_words)*1.2)+0.15))
        plt.plot(xx, table_positive_words[i:i+1].as_matrix().flatten(),
                 '-', label=positive_words[i], linewidth=4.0, color=color)
        
    for i in xrange(len(negative_words)):
        color = cmap_negative(0.8*((i+1)/(len(negative_words)*1.2)+0.15))
        plt.plot(xx, table_negative_words[i:i+1].as_matrix().flatten(),
                 '-', label=negative_words[i], linewidth=4.0, color=color)
        
    plt.legend(loc='best', ncol=3, prop={'size':16}, columnspacing=0.5)
    plt.axis([1, 1e5, -1, 2])
    plt.title('Coefficient path')
    plt.xlabel('L2 penalty ($\lambda$)')
    plt.ylabel('Coefficient value')
    plt.xscale('log')
    plt.rcParams.update({'font.size': 18})
    plt.tight_layout()


make_coefficient_plot(table, positive_words, negative_words, l2_penalty_list=[0, 4, 10, 1e2, 1e3, 1e5])

NameError: name 'table' is not defined