# This notebook shows how to learn weights for a logistic regression model using gradient ascent

In [None]:
#import needed libraries
import json
import string

import numpy as np
import graphlab as gl #The dataset is in SFrame format

## Load Amazon baby products reviews dataset

To download this dataset click [here](https://s3.amazonaws.com/static.dato.com/files/coursera/course-3/amazon_baby_subset.gl.zip)

In [None]:
#Switch to the folder that holds the dataset so the relative path used when loading it resolves.
#NOTE(review): this absolute path is machine-specific - point it at your own copy before running.
%cd 'C:\Users\Rolex James\Documents\MOOCs\ML Univ of Washington\Classification\Resources\machine-learning-specialization-master\course-3'

In [None]:
#I am using a subset of the amazon baby products reviews dataset.
#The path is relative, so it is resolved against the current working directory.
products = gl.SFrame('amazon_baby_subset.gl/')

### Exploring the dataset

In [None]:
#Preview the first 5 rows of the dataset
products.head(5)

In [None]:
#Count the reviews in each class: the 'sentiment' column is compared against 1 (positive) and -1 (negative)
print 'Number of positive reviews =', len(products[products['sentiment']== 1])
print 'Number of negative reviews =', len(products[products['sentiment']== -1])

In [None]:
#Print the shape of the products SFrame
print products.shape

In [None]:
#Let's look at a single review: the 'review' text of the first row
print products['review'][0]

## Apply text cleaning to the review column

We will use the word counts of the 193 most frequent words as the features for each review in the dataset. 
We will also remove punctuation from the 'review' column of the dataset.

In [None]:
print important_words[0:10]

In [None]:
#Function to remove punctuation from a string
def remove_punctuation(text):
    """Return `text` with every character of string.punctuation removed.

    Parameters
    ----------
    text : str, unicode or None
        The raw review text. None (a missing review) is treated as an
        empty review.

    Returns
    -------
    The text without punctuation characters; '' when `text` is None.
    """
    if text is None:
        # Missing review: return an empty string so later .split() calls still work.
        return ''
    if str is bytes and isinstance(text, str):
        # Python 2 byte string: the two-argument form deletes the listed characters.
        return text.translate(None, string.punctuation)
    # unicode (Python 2) / str (Python 3): mapping a code point to None deletes it.
    return text.translate(dict.fromkeys(map(ord, string.punctuation)))

In [None]:
#Create a column that contains 'clean' reviews: remove_punctuation is applied to every entry of 'review'
products['review_clean'] = products['review'].apply(remove_punctuation)

In [None]:
#Let's view the products SFrame again; it now also holds the 'review_clean' column
products.head(5)

Next we create columns that contain word counts for each word in the important_words list

In [None]:
#For each important word add a column named after it, holding how many times the word
#occurs among the whitespace-separated tokens of the cleaned review
for word in important_words:
    products[word] = products['review_clean'].apply(lambda s : s.split().count(word))

In [None]:
#Gradient ascent operates on numpy arrays, so the tabular data has to be converted first
def get_numpy_data(data_sframe, features, label):
    """Turn the selected columns into a feature matrix and a label array.

    A constant column named 'intercept' (value 1) is written into
    `data_sframe` itself and becomes the first column of the matrix.

    Parameters
    ----------
    data_sframe : table object supporting column assignment, column
        selection and `to_numpy()`.
    features : list of column names to use as features.
    label : name of the column holding the labels.

    Returns
    -------
    (feature_matrix, label_array) : the selected columns, prefixed with
    the intercept column, and the label column, both as numpy arrays.
    """
    data_sframe['intercept'] = 1
    columns = ['intercept'] + features
    feature_matrix = data_sframe[columns].to_numpy()
    label_array = data_sframe[label].to_numpy()
    return feature_matrix, label_array

In [None]:
#Split data into training and test sets (split fraction .8), set seed to ensure reproducibility
train, test = products.random_split(.8, seed=1)

In [None]:
#Report how many rows ended up in each split
print "Number of reviews in training data: ", train.shape[0]
print "Number of reviews in test data: ", test.shape[0]

In [None]:
#Build the training feature matrix (intercept column first, then the word-count columns) and the label array
train_matrix, sentiment_train = get_numpy_data(train, important_words, 'sentiment')

### First we write a function to compute conditional probability with logistic link function

Recall that the logistic link function is given by:
$$
P(y_i = +1 | \mathbf{x}_i,\mathbf{w}) = \frac{1}{1 + \exp(-\mathbf{w}^T h(\mathbf{x}_i))},
$$

where the feature vector $h(\mathbf{x}_i)$ represents the word counts of **important_words** in the review  $\mathbf{x}_i$. 

In [None]:
def predict_probability(feature_matrix, coefficients):
    '''
    Produces a probabilistic estimate for P(y_i = +1 | x_i, w).

    The score of every row is the dot product of its features with
    `coefficients`; the logistic function then maps each score to a
    value between 0 and 1.
    '''
    linear_scores = np.dot(feature_matrix, coefficients)
    denominator = 1 + np.exp(-linear_scores)
    return 1/denominator

## Next we need to compute derivative of the log likelihood with respect to a single coefficient

This is given by:
$$
\frac{\partial\ell}{\partial w_j} = \sum_{i=1}^N h_j(\mathbf{x}_i)\left(\mathbf{1}[y_i = +1] - P(y_i = +1 | \mathbf{x}_i, \mathbf{w})\right)
$$

This function accepts two arguments:
* `errors` vector containing $\mathbf{1}[y_i = +1] - P(y_i = +1 | \mathbf{x}_i, \mathbf{w})$ for all $i$. This is simply the difference between the true values and our predictions using a given set of weights.
* `feature` vector containing $h_j(\mathbf{x}_i)$  for all $i$. 

In [None]:
def feature_derivative(errors, feature):
    '''
    Derivative of the log likelihood with respect to one coefficient.

    errors  : vector of (indicator - predicted probability), one entry per data point.
    feature : vector with the value of that coefficient's feature for every data point.

    Returns the dot product of the two vectors.
    '''
    return np.dot(errors, feature)

Next we compute the log likelihood for a given set of weights. This helps us to check how well the learning algorithm is doing. For each iteration the log likelihood should increase.

The log likelihood equation is given by:
$$\ell\ell(\mathbf{w}) = \sum_{i=1}^N \Big( (\mathbf{1}[y_i = +1] - 1)\mathbf{w}^T h(\mathbf{x}_i) - \ln\left(1 + \exp(-\mathbf{w}^T h(\mathbf{x}_i))\right) \Big) $$

In [None]:
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    '''
    Log likelihood of the observed labels under the given coefficients.

    Sums, over all data points, (1[y_i = +1] - 1) * score_i - ln(1 + exp(-score_i)),
    where score_i is the dot product of row i of `feature_matrix` with `coefficients`.
    '''
    is_positive = (sentiment==+1)
    w_dot_h = np.dot(feature_matrix, coefficients)
    log_term = np.log(1. + np.exp(-w_dot_h))

    # Where exp() overflowed the log term is infinite; substitute -score for those entries
    overflowed = np.isinf(log_term)
    log_term[overflowed] = -w_dot_h[overflowed]

    return np.sum((is_positive-1)*w_dot_h - log_term)

## Running Gradient Ascent

The following function takes gradient steps to the optimum (the maximum point of the log likelihood function)

In [None]:
from math import sqrt

def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    '''
    Learn logistic regression coefficients with gradient ascent on the log likelihood.

    feature_matrix       : 2-D numpy array, one row per data point, one column per feature.
    sentiment            : array of labels; entries equal to +1 count as the positive class.
    initial_coefficients : starting values, one per column of feature_matrix (not modified).
    step_size            : multiplier applied to each derivative before it is added.
    max_iter             : number of gradient ascent iterations to run.

    Prints the log likelihood at selected iterations and returns the learned
    coefficients as a float numpy array.
    '''
    # Always work on a float copy: with an integer dtype every update below would be
    # truncated when stored, and the coefficients would never move.
    coefficients = np.array(initial_coefficients, dtype=float)

    # Indicator value for (y_i = +1); it does not depend on the coefficients,
    # so it is computed once instead of on every iteration.
    indicator = (sentiment==+1)

    for itr in range(max_iter):
        predictions = predict_probability(feature_matrix, coefficients)
        errors = indicator - predictions

        # errors is fixed for the whole pass, so all coefficients are updated
        # from the same predictions.
        for j in range(len(coefficients)): # loop over each coefficient
            derivative = feature_derivative(errors, feature_matrix[:,j])
            coefficients[j] = coefficients[j] + (step_size * derivative)

        # Check whether log likelihood is increasing (reported less often as itr grows)
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
        or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
            # Single parenthesised argument: prints the same under Python 2 and Python 3
            print('iteration %*d: log likelihood of observed labels = %.8f' %
                  (int(np.ceil(np.log10(max_iter))), itr, lp))

    return coefficients

Running our logistic regression solver

In [None]:
#Start from all-zero coefficients, one per column of train_matrix, so the size
#follows the feature matrix instead of being hard-coded
coefficients = logistic_regression(train_matrix, sentiment_train,
                                   initial_coefficients=np.zeros(train_matrix.shape[1]),
                                   step_size=1e-7, max_iter=301)

As you can see, the log likelihood increases after every iteration.

## Making Predictions on the test data

In [None]:
#Convert test data to numpy arrays: a feature matrix (intercept column first) and a label array
test_matrix, sentiment_test = get_numpy_data(test, important_words, 'sentiment')

In [None]:
#Compute probability estimates of reviews in test data being positive, using the learned coefficients
pred_proba = predict_probability(test_matrix, coefficients)

In [None]:
#Predict class labels based on probability estimates
#Here I am using 0.5 as the probability threshold for predicting the positive class
#This is because the logistic (sigmoid) function maps a score of 0 to a probability of 0.5
pred_labels = np.where(pred_proba >= 0.5, +1, -1)

In [None]:
#View the first 10 predicted class labels
pred_labels[0:10]

Alternatively we can use the scores (i.e. dot product of the coefficients and feature values) for predicting class labels

In [None]:
#Classify directly from the scores. A score >= 0 corresponds to a probability >= 0.5,
#so a score of exactly 0 is labelled +1, matching the probability-based rule.
pred_scores = np.dot(test_matrix, coefficients)
pred_labels_scores = np.where(pred_scores >= 0, +1, -1)
pred_labels_scores[0:10]

## Measuring accuracy on test data

$$
\mbox{accuracy} = \frac{\mbox{# correctly classified data points}}{\mbox{# total data points}}
$$

In [None]:
num_mistakes = len(test) - np.sum(pred_labels == np.array(test['sentiment']))
accuracy = np.sum(pred_labels == np.array(test['sentiment'])) / float(len(test))
print "-----------------------------------------------------"
print 'Number of test reviews correctly classified =', len(test) - num_mistakes
print 'Number of test reviews incorrectly classified =', num_mistakes
print 'Total number of reviews                       =', len(test)
print "-----------------------------------------------------"
print 'Accuracy = %.2f' % accuracy

This model does pretty well on the test set. It is better than a majority classifier, which would have an accuracy of about 50%.

## Finally, let's see the words most associated with positive & negative sentiment