In [1]:
## Logistic regression implemented from scratch with mini-batch gradient ascent

In [2]:
from sklearn.base import BaseEstimator
import numpy as np

In [188]:
# Synthetic two-class data: two correlated 2-D Gaussian clouds with different means.
np.random.seed(12)  # fixed seed so the draws below are repeatable
num_observations = 50000  # rows drawn per class

# Both clouds use the same covariance matrix; only the mean vectors differ.
shared_cov = [[1, .75], [.75, 1]]
x1 = np.random.multivariate_normal([0, 0], shared_cov, num_observations)  # labelled 0 below
x2 = np.random.multivariate_normal([1, 4], shared_cov, num_observations)  # labelled 1 below

# Rows of x1 first, then rows of x2, stored as float32.
simulated_separableish_features = np.concatenate((x1, x2), axis=0).astype(np.float32)
# Matching label vector: num_observations zeros followed by num_observations ones.
simulated_labels = np.repeat((0.0, 1.0), num_observations)

In [230]:
class LogisticRegression(BaseEstimator):
    """Binary logistic regression trained with mini-batch gradient ascent.

    The model is ``p(y=1 | x) = sigmoid(x . w + b)``. Weights and intercept
    are stored together in ``beta``; the intercept is the LAST entry because
    a column of ones is appended on the right of every feature matrix.

    Parameters
    ----------
    iterations : int
        Number of full passes over the training data made by ``fit``.
    batchSize : int
        Number of consecutive rows used for each gradient step.
    learningRate : float
        Step size applied to the batch gradient. The gradient is a sum over
        the rows of the batch (not a mean), so the effective step grows with
        ``batchSize``.
    """

    def __init__(self, iterations, batchSize, learningRate):
        # Constructor arguments are stored unmodified under their own names.
        self.beta = None  # filled in by fit(); None until then
        self.batchSize = batchSize
        self.learningRate = learningRate
        self.iterations = iterations

    @staticmethod
    def _addIntercept(X):
        """Return ``X`` with a column of ones appended on the right."""
        X = np.asarray(X)
        return np.concatenate((X, np.ones((X.shape[0], 1))), axis=1)

    @staticmethod
    def _sigmoid(z):
        """Logistic function ``1 / (1 + exp(-z))``.

        For very negative ``z`` the exponential overflows to ``inf`` and the
        quotient becomes 0, which is the correct limit, so only the overflow
        warning is silenced; the returned values are unchanged.
        """
        with np.errstate(over='ignore'):
            return 1 / (1 + np.exp(-z))

    def predict(self, X):
        """Return the predicted probability of class 1 for each row of ``X``.

        Note that this returns probabilities in [0, 1], not hard labels.
        ``X`` must NOT contain the intercept column; it is appended here.
        """
        return self._sigmoid(np.dot(self._addIntercept(X), self.beta))

    def logLikelihood(self, beta, Xdata, ydata):
        """Return the Bernoulli log-likelihood ``sum(y*z - log(1 + exp(z)))``.

        ``z = Xdata . beta``. No intercept column is appended here, so
        ``Xdata`` must already have one column per entry of ``beta``.
        """
        Xbeta = np.dot(Xdata, beta)
        # logaddexp(0, z) == log(1 + exp(z)) but does not overflow for large z.
        return np.sum(ydata * Xbeta - np.logaddexp(0, Xbeta))

    def computeBatchGrad(self, Xbatch, ybatch, prediction):
        """Return the log-likelihood gradient for one batch.

        The gradient is ``Xaug.T . (ybatch - prediction)`` where ``Xaug`` is
        ``Xbatch`` with the intercept column appended, i.e. a sum over the
        rows of the batch.
        """
        return np.dot(self._addIntercept(Xbatch).T, ybatch - prediction)

    def fit(self, X, y):
        """Fit ``beta`` by mini-batch gradient ascent and return ``self``.

        Rows are visited in their stored order (no shuffling) in chunks of
        ``batchSize``; a final shorter chunk is used when the number of rows
        is not a multiple of ``batchSize``. ``beta`` is reset to zeros at the
        start of every call.

        Raises
        ------
        ValueError
            If ``batchSize`` is smaller than 1 or ``X`` and ``y`` have a
            different number of rows.
        """
        if self.batchSize < 1:
            raise ValueError("batchSize must be a positive integer")
        batchSize = int(self.batchSize)

        y = np.asarray(y)
        # Build the design matrix once instead of once per batch.
        Xaug = self._addIntercept(X)
        nSamples = Xaug.shape[0]
        if y.shape[0] != nSamples:
            raise ValueError("X and y must have the same number of rows")

        self.beta = np.zeros(Xaug.shape[1])
        for _ in range(self.iterations):
            # Stepping by batchSize up to nSamples also yields the trailing
            # partial batch, so no row is left out of an iteration.
            for start in range(0, nSamples, batchSize):
                stop = start + batchSize
                Xbatch = Xaug[start:stop]
                residual = y[start:stop] - self._sigmoid(np.dot(Xbatch, self.beta))
                self.beta = self.beta + self.learningRate * np.dot(Xbatch.T, residual)
        return self

    def getWeights(self):
        """Return ``beta`` (feature weights followed by the intercept)."""
        return self.beta
        

In [231]:
# From-scratch model: 1000 passes over the data, 2000 rows per step, step size 1e-4.
cls = LogisticRegression(iterations=1000, batchSize=2000, learningRate=0.0001)

In [232]:
# Sanity check: display the (rows, columns) shape of the stacked feature matrix.
simulated_separableish_features.shape

(100000, 2)

In [233]:
%%time 
# Train the from-scratch model on the full simulated data set; %%time reports the cost.
cls.fit(simulated_separableish_features,simulated_labels)

CPU times: user 4.44 s, sys: 3.13 ms, total: 4.44 s
Wall time: 4.44 s


LogisticRegression(batchSize=2000, iterations=1000, learningRate=0.0001)

In [234]:
# Display the constructor parameters via get_params(), inherited from BaseEstimator.
cls.get_params()

{'batchSize': 2000, 'iterations': 1000, 'learningRate': 0.0001}

In [None]:
# Predicted class-1 probabilities from the from-scratch model, rounded to 0/1 labels.
pred = np.round(cls.predict(simulated_separableish_features))

### Comparison with scikit-learn's SGDClassifier

In [235]:
from sklearn.linear_model import SGDClassifier

# Reference model: scikit-learn's SGD classifier with logistic loss.
# n_iter=1000 mirrors the 1000 iterations given to the from-scratch model.
clfs = SGDClassifier(
    # objective and regularisation
    loss='log',
    penalty='l2',
    alpha=0.0001,
    l1_ratio=0.15,
    fit_intercept=True,
    # optimisation schedule
    n_iter=1000,
    shuffle=True,
    learning_rate='optimal',
    eta0=0.0,
    power_t=0.5,
    epsilon=0.1,
    # runtime and bookkeeping
    verbose=0,
    n_jobs=1,
    random_state=None,
    class_weight=None,
    warm_start=False,
    average=False
)


In [224]:
%%time
# Fit the scikit-learn reference model on the same simulated data.
clfs.fit(simulated_separableish_features, simulated_labels)

# Show the fitted intercept and coefficient array.
print clfs.intercept_, clfs.coef_

[-10.58907999] [[-3.71846327  6.24280612]]
CPU times: user 7.68 s, sys: 11 ms, total: 7.69 s
Wall time: 7.7 s


In [225]:
# Probability estimates from the scikit-learn model on the training data.
# NOTE(review): pred1 is not read by any cell visible here; confirm it is still needed.
pred1= clfs.predict_proba(simulated_separableish_features)

In [229]:
print 'Accuracy from scratch: {0}'.format((pred == simulated_labels).sum().astype(float) / len(pred))
print 'Accuracy from sk-learn: {0}'.format(clfs.score(simulated_separableish_features, simulated_labels))

Accuracy from scratch: 0.99396
Accuracy from sk-learn: 0.9939
