<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Logistic-Regression" data-toc-modified-id="Logistic-Regression-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Logistic Regression</a></span><ul class="toc-item"><li><span><a href="#Generating-synthetic-data" data-toc-modified-id="Generating-synthetic-data-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Generating synthetic data</a></span></li><li><span><a href="#Sigmoid-function" data-toc-modified-id="Sigmoid-function-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Sigmoid function</a></span></li><li><span><a href="#Log-likelihood-function" data-toc-modified-id="Log-likelihood-function-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Log-likelihood function</a></span></li><li><span><a href="#Gradient-&amp;-Regression" data-toc-modified-id="Gradient-&amp;-Regression-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Gradient &amp; Regression</a></span></li><li><span><a href="#Run-the-model" data-toc-modified-id="Run-the-model-1.5"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>Run the model</a></span></li><li><span><a href="#vs.-SkLearn" data-toc-modified-id="vs.-SkLearn-1.6"><span class="toc-item-num">1.6&nbsp;&nbsp;</span>vs. SkLearn</a></span></li><li><span><a href="#Accuracies" data-toc-modified-id="Accuracies-1.7"><span class="toc-item-num">1.7&nbsp;&nbsp;</span>Accuracies</a></span></li></ul></li></ul></div>

# Logistic Regression

In [57]:
# Cosmetic only: apply the jupyterthemes plotting style to matplotlib figures.
# Nothing else in this notebook references jtplot, so this cell can be skipped
# when jupyterthemes is not installed.
from jupyterthemes import jtplot
jtplot.style()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib notebook

## Generating synthetic data

In [151]:
# Two Gaussian blobs with a shared covariance: class 0 centred at (0, 0) and
# class 1 centred at (3, 3). The global seed is fixed so every run draws the
# same samples.
np.random.seed(42)
n_instances = 5000  # samples drawn per class

shared_cov = [[1, 0.25], [0.25, 1]]
x1 = np.random.multivariate_normal([0, 0], shared_cov, n_instances)
x2 = np.random.multivariate_normal([3, 3], shared_cov, n_instances)

# Stack the class-0 rows on top of the class-1 rows; labels follow the same order.
synthetic_features = np.concatenate((x1, x2), axis=0).astype(np.float32)
synthetic_labels = np.repeat([0.0, 1.0], n_instances)

In [154]:
# Tip: run help(np.random.multivariate_normal) to see the sampler's mean / cov / size arguments.

In [152]:
# Visual sanity check of the two classes. The explicit fig/ax interface keeps
# the figure self-contained; title, axis labels and legend let it stand alone.
fig, ax = plt.subplots()
ax.scatter(x1[:, 0], x1[:, 1], alpha=0.4, c='g', marker='+', label='class 0')
ax.scatter(x2[:, 0], x2[:, 1], alpha=0.3, c='b', marker='+', label='class 1')
ax.set(title='Synthetic two-class data', xlabel='feature 1', ylabel='feature 2')
ax.legend();  # trailing ';' suppresses the object repr in the cell output

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x268e3374208>

## Sigmoid function

$$ \sigma(z) = \frac{1}{1+e^{-z}}$$

In [155]:
def sigmoid(scores):
    """Numerically stable logistic function 1 / (1 + exp(-scores)).

    Parameters
    ----------
    scores : scalar or array-like
        Linear scores (logits).

    Returns
    -------
    Scalar or ndarray of the same shape as ``scores`` with values in [0, 1].

    Notes
    -----
    The naive form evaluates ``exp(-scores)``, which overflows (and emits a
    RuntimeWarning) for large negative scores. Here only ``exp(-|scores|)``
    is ever computed, which lies in (0, 1], so no overflow can occur.
    """
    scores = np.asarray(scores)
    exp_neg_abs = np.exp(-np.abs(scores))
    result = np.where(
        scores >= 0,
        1.0 / (1.0 + exp_neg_abs),          # same expression as the naive form for z >= 0
        exp_neg_abs / (1.0 + exp_neg_abs),  # algebraically equal form for z < 0
    )
    # Indexing with () turns a 0-d array back into a scalar and leaves
    # n-d arrays unchanged, so scalar in -> scalar out as before.
    return result[()]

## Log-likelihood function

$$ L^{log} = \sum_{i=1}^{N} \left[ y_{i}\beta^{T}x_{i} - \log\left(1+e^{\beta^{T}x_{i}}\right) \right] $$

In [156]:
def log_likelihood(features, target, weights):
    """Log-likelihood of a logistic-regression model.

    Computes ``sum_i [ y_i * s_i - log(1 + exp(s_i)) ]`` with
    ``s = features . weights``.

    Parameters
    ----------
    features : ndarray
        Design matrix; its dot product with ``weights`` gives one score per row.
    target : ndarray
        Labels, multiplied element-wise with the scores.
    weights : ndarray
        Coefficient vector.

    Returns
    -------
    float
        The summed log-likelihood (always <= 0 for 0/1 targets).
    """
    scores = np.dot(features, weights)
    # logaddexp(0, s) == log(exp(0) + exp(s)) == log(1 + exp(s)), but is
    # evaluated without overflowing for large s. The naive form returns inf
    # there, which turns the whole log-likelihood into -inf.
    logL = np.sum(target * scores - np.logaddexp(0, scores))
    return logL

## Gradient & Regression

$$ \nabla L^{log} = X^{T}\left(Y - \hat{Y}\right), \qquad \hat{Y} = \sigma(X\beta) \ \text{(the predictions)} $$

In [157]:
def grad_(X, Y, preds):
    """Gradient of the log-likelihood with respect to the weights: X^T (Y - preds)."""
    residuals = Y - preds
    return X.T.dot(residuals)

In [162]:
def logistic_regression(features, target, num_steps, learning_rate, add_intercept = False,
                        print_every = 10000):
    """Fit logistic-regression weights by full-batch gradient ascent.

    Parameters
    ----------
    features : ndarray, shape (n_samples, n_features)
        Design matrix.
    target : ndarray, shape (n_samples,)
        Labels passed to ``grad_`` and ``log_likelihood``.
    num_steps : int
        Number of gradient-ascent iterations.
    learning_rate : float
        Step size. The gradient from ``grad_`` is a sum over samples (it is
        not averaged), so a suitable value shrinks as the dataset grows.
    add_intercept : bool, default False
        If True, prepend a column of ones so that ``weights[0]`` is the bias.
    print_every : int, default 10000
        Print the log-likelihood every ``print_every`` steps, starting at
        step 0. Pass 0 or None to disable progress output.

    Returns
    -------
    ndarray
        Weight vector of length ``n_features`` (plus one if ``add_intercept``).
    """
    if add_intercept:
        intercept = np.ones((features.shape[0], 1))
        features = np.hstack((intercept, features))

    weights = np.zeros(features.shape[1])

    for step in range(num_steps):
        scores = np.dot(features, weights)
        predictions = sigmoid(scores)

        # Ascent step: move along the gradient to increase the log-likelihood.
        gradient = grad_(features, target, predictions)
        weights += learning_rate * gradient

        # Progress report; the truthiness check also guards against a
        # modulo-by-zero when logging is disabled.
        if print_every and step % print_every == 0:
            print(log_likelihood(features, target, weights))

    return weights

## Run the model

In [167]:
# Fit on the synthetic data; the intercept column is added inside the function,
# so the first entry of `weights` is the bias term.
weights = logistic_regression(
    features=synthetic_features,
    target=synthetic_labels,
    num_steps=300000,
    learning_rate=5e-5,
    add_intercept=True,
)

-4274.265889560436
-795.2424536307747
-795.2424513913602
-795.24245139136
-795.24245139136
-795.24245139136
-795.24245139136
-795.24245139136
-795.24245139136
-795.24245139136
-795.24245139136
-795.24245139136
-795.24245139136
-795.24245139136
-795.24245139136
-795.24245139136
-795.24245139136
-795.24245139136
-795.24245139136
-795.24245139136
-795.24245139136
-795.24245139136
-795.24245139136
-795.24245139136
-795.24245139136
-795.24245139136
-795.24245139136
-795.24245139136
-795.24245139136
-795.24245139136


## vs. SkLearn

In [169]:
from sklearn.linear_model import LogisticRegression

# Reference fit with scikit-learn. C is the inverse regularisation strength, so
# a huge value (1e15) makes the penalty negligible and the result comparable
# to the unregularised from-scratch fit.
clf = LogisticRegression(C=1e15, fit_intercept=True).fit(synthetic_features, synthetic_labels)

# First line: sklearn's intercept and coefficients.
# Second line: the from-scratch weights, whose first entry is the intercept.
print(clf.intercept_, clf.coef_)
print(weights)

[-7.06502849] [[2.41419304 2.31520744]]
[-7.0691384   2.41560378  2.31651304]




## Accuracies

In [170]:
# Rebuild the design matrix used during training: a leading column of ones
# (intercept) followed by the raw features.
n_rows = synthetic_features.shape[0]
data_with_intercept = np.column_stack((np.ones(n_rows), synthetic_features))

# Predicted probabilities rounded to hard 0/1 labels.
final_scores = data_with_intercept.dot(weights)
preds = np.round(sigmoid(final_scores))

# Fraction of training samples classified correctly by each model.
scratch_accuracy = np.mean(preds == synthetic_labels)
sklearn_accuracy = clf.score(synthetic_features, synthetic_labels)

print('Accuracy from scratch: {0}'.format(scratch_accuracy))
print('Accuracy from sk-learn: {0}'.format(sklearn_accuracy))

Accuracy from scratch: 0.9693
Accuracy from sk-learn: 0.9693
