In [1]:
import numpy as np
import pandas as pd
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.externals import joblib
import math

## Load and process data

In [2]:
products = pd.read_csv('amazon_baby_subset.csv')

In [3]:
def remove_punctuation(text):
    translator = text.maketrans('','',string.punctuation)
    return text.translate(translator)

In [4]:
products.review.fillna('',inplace=True)

In [5]:
products['review_clean'] = products['review'].apply(remove_punctuation)

In [6]:
important_words = pd.read_json('important_words.json')[0]
important_words = list(important_words)

In [7]:
for word in important_words:
    products[word] = products['review_clean'].apply(lambda s : s.split().count(word))

## Train-Validation Split

In [8]:
train_indic = open('module-4-assignment-train-idx.json','r')
train_indic=list(train_indic)[0]
train_indic=train_indic.split(',')
train_indic[0]=' 0'
train_indic[-1]=' 53070'

In [9]:
train_indic[0:10]

[' 0', ' 1', ' 3', ' 4', ' 5', ' 6', ' 7', ' 8', ' 10', ' 11']

In [17]:
val_indic = open('module-4-assignment-validation-idx.json','r')
val_indic = list(val_indic)[0]
val_indic = val_indic.split(',')
val_indic[0] = ' 2'
val_indic[-1]=' 53071'

In [19]:
train_list=[]
for line in train_indic:
    t = [int(x.strip()) for x in line.split(',')]
    train_list.append(t[0])
    
train_data = products.iloc[train_list]

In [20]:
val_list=[]
for line in val_indic:
    t = [int(x.strip()) for x in line.split(',')]
    val_list.append(t[0])
    
val_data = products.iloc[val_list]

## Convert data frame to multi-dimensional array

In [28]:
def get_numpy_data(df, features, label):
    df['constant'] = 1
    features = ['constant'] + features
    features_frame = df[features]
    feature_matrix = features_frame.as_matrix()
    label_sarray = df[label]
    label_array = label_sarray.as_matrix()
    return(feature_matrix, label_array)

In [30]:
feature_matrix_train, sentiment_train = get_numpy_data(train_data, important_words, 'sentiment')
feature_matrix_valid, sentiment_valid = get_numpy_data(val_data, important_words, 'sentiment') 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


## Building on logistic regression with no L2 penalty assignment

In [31]:

def predict_probability(feature_matrix, coefficients):
    # Take dot product of feature_matrix and coefficients  
    # YOUR CODE HERE
    score = np.dot(feature_matrix, coefficients)

    # Compute P(y_i = +1 | x_i, w) using the link function
    # YOUR CODE HERE
    predictions = np.exp(score)
    predictions = predictions+1
    predictions = 1/predictions
    
    # return predictions
    return score

## Adding L2 penalty

In [None]:
def feature_derivative(errors, feature):     
    # Compute the dot product of errors and feature
    derivative = np.dot(errors,feature)
        # Return the derivative
    return derivative

def feature_derivative_with_L2(errors, feature, coefficient, l2_penalty, feature_is_constant): 
    
    # Compute the dot product of errors and feature
    ## YOUR CODE HERE
    derivative = feature_derivative(errors, feature)

    # add L2 penalty term for any feature that isn't the intercept.
    if not feature_is_constant: 
        ## YOUR CODE HERE
        derivative - 2*l2_penalty*coefficient
        
    return derivative

In [None]:
def compute_log_likelihood_with_L2(feature_matrix, sentiment, coefficients, l2_penalty):
    indicator = (sentiment==+1)
    scores = np.dot(feature_matrix, coefficients)
    
    lp = np.sum((indicator-1)*scores - np.log(1. + np.exp(-scores))) - l2_penalty*np.sum(coefficients[1:]**2)
    
    return lp