In [5]:
import numpy as np
import pandas as pd
import os.path
import math

from scipy.special import expit as sigmoid
from sklearn.model_selection import train_test_split
from numpy import repeat, dot
from numpy.linalg import inv

def softmax(x):
    """Numerically stable softmax, normalised along axis 0.

    The global maximum of ``x`` is subtracted before exponentiating; this
    leaves the resulting ratios unchanged but keeps ``exp`` from overflowing.
    """
    exponentials = np.exp(x - np.max(x))
    return exponentials / np.sum(exponentials, axis=0)


In [6]:
class LogisticRegression:
    """One-vs-rest linear classifier fitted by iteratively reweighted least squares.

    For every distinct label found in ``y`` a weight vector ``w`` is fitted so
    that ``X @ w`` approximates the 0/1 indicator of that label. The fit is the
    L1 (least-absolute-deviation) IRLS scheme; note that, despite the class
    name, no logistic link function is used while training. Prediction picks
    the class whose weight vector yields the largest score.

    Attributes:
        X, y: the training data as passed to the constructor.
        nObs, nFeatures: number of rows / columns of ``X``.
        allclasses: sorted unique labels found in ``y``.
        nclass: number of distinct labels.
        W: ``(nclass, nFeatures)`` weight matrix; row ``i`` belongs to
            ``allclasses[i]``.
    """

    def __init__(self, X, y):
        """Store the training data and allocate the weight matrix.

        Args:
            X: 2-D array of shape ``(nObs, nFeatures)``.
            y: labels, one per row of ``X`` (1-D or a column vector).
        """
        self.X = X
        self.y = y
        self.nObs, self.nFeatures = X.shape
        assert self.nObs == y.shape[0], "X and y must have the same number of rows"
        self.allclasses = np.unique(y)
        self.nclass = len(self.allclasses)
        # Small random start; every row is overwritten by _train().
        self.W = np.random.normal(0, 0.001, (self.nclass, self.nFeatures))

    def _IRLS(self, y, X, cl, iters, d=0.05, tolerance=0.001):
        """Fit ``X @ w ~ y`` under an L1 loss with IRLS.

        See https://en.wikipedia.org/wiki/Iteratively_reweighted_least_squares

        Args:
            y: target values, one per row of ``X``.
            X: design matrix of shape ``(n, nFeatures)``.
            cl: label being fitted; not used in the computation.
            iters: maximum number of reweighting iterations.
            d: lower bound on the absolute residual, which caps each
                observation weight at ``1 / d``.
            tolerance: stop once the summed absolute change of the weights
                between two iterations drops below this value.

        Returns:
            1-D weight vector of length ``nFeatures``. The latest estimate is
            returned even when ``tolerance`` was not reached within ``iters``
            iterations.
        """
        y = np.ravel(y).astype(float)

        def weighted_fit(obs_weights):
            # X.T * w scales each observation, i.e. X.T @ diag(w) without
            # materialising the n x n diagonal matrix.
            Xw = X.T * obs_weights
            # Solve the normal equations directly instead of forming an inverse.
            return np.linalg.solve(Xw.dot(X), Xw.dot(y))

        # Start from the ordinary (unweighted) least-squares solution.
        Wi = weighted_fit(np.ones(X.shape[0]))

        for _ in range(iters):
            previous = Wi
            residual = np.abs(y - X.dot(Wi))
            Wi = weighted_fit(1.0 / np.maximum(d, residual))
            if np.sum(np.abs(Wi - previous)) < tolerance:
                break
        # Always hand back weights; the caller stores them into self.W.
        return Wi

    def _train(self):
        """Fit one weight vector per class against its 0/1 indicator."""
        labels = np.ravel(self.y)
        for i, cls in enumerate(self.allclasses):
            indicator = (labels == cls).astype(float)
            self.W[i, :] = self._IRLS(indicator, self.X, cls, 1000)

    def predict(self, Xnew):
        """Predict a label for every row of ``Xnew``.

        Args:
            Xnew: array of shape ``(m, nFeatures)``.

        Returns:
            List of ``m`` labels taken from ``allclasses``.
        """
        Xnew = np.atleast_2d(Xnew)
        # Row i of W belongs to allclasses[i], so index by position rather than
        # by label value. Softmax is monotonic, hence the arg-max over the raw
        # scores selects the same class as the arg-max over softmax(scores).
        scores = Xnew.dot(self.W.T)
        best = np.argmax(scores, axis=1)
        return list(self.allclasses[best])

    def calc_error(self, ynew, target):
        """Return the fraction of entries in ``ynew`` that differ from ``target``.

        Raises:
            ValueError: if the two inputs hold a different number of labels.
        """
        predicted = np.ravel(ynew)
        actual = np.ravel(target)
        if predicted.size != actual.size:
            raise ValueError(
                "ynew has %d labels but target has %d" % (predicted.size, actual.size)
            )
        return 1 - np.mean(predicted == actual)

    def validate(self, Xnew, Ynew):
        """Predict on ``Xnew`` and return the error rate against ``Ynew``."""
        mypredictions = self.predict(Xnew)
        return self.calc_error(mypredictions, Ynew)

    def build(self):
        """Train the model and return ``self`` so calls can be chained."""
        print()
        print("Building the Logistic regression model.. ")
        self._train()
        return self

In [7]:
#select % of random sub-indices out of the given dataset
def rsubindices(dSet, percent):
    """Draw a random subset of row indices covering ``percent`` % of ``dSet``.

    Args:
        dSet: array-like whose first dimension is the number of rows
            (1-D and 2-D inputs are both accepted).
        percent: share of rows to select, in the range 0..100. The count is
            rounded up to the next whole row.

    Returns:
        1-D integer array of distinct row indices in random order; empty when
        ``dSet`` has no rows.

    Raises:
        ValueError: if ``percent`` lies outside 0..100 for a non-empty input.
    """
    n_rows = np.shape(dSet)[0]
    if not n_rows:
        # Integer dtype so the result can still be used for indexing.
        return np.empty(0, dtype=int)
    if not 0 <= percent <= 100:
        raise ValueError("percent must be within [0, 100], got %r" % (percent,))
    # Multiply before dividing: percent/100 is not exactly representable
    # (e.g. 7/100*100 evaluates to 7.000000000000001 and would round up to 8).
    n_pick = math.ceil(percent * n_rows / 100)
    return np.random.choice(n_rows, n_pick, replace=False)

def logisticRegression(filename, num_splits, train_percent=(10, 25, 50, 75, 100)):
    """Evaluate ``LogisticRegression`` on a CSV data set over repeated random splits.

    The CSV is read without a header row; the last column is the target and
    all other columns are features. For each of ``num_splits`` rounds the data
    is split 80/20 into train and test, a model is trained on each requested
    percentage of the training part, and its test error is recorded. The mean
    and sample standard deviation of the test error per percentage are printed.

    Args:
        filename: path to a comma-separated file without header.
        num_splits: number of random 80/20 train/test splits.
        train_percent: percentages of the training split to train on.

    Raises:
        FileNotFoundError: if ``filename`` is not an existing file.
        PermissionError: if ``filename`` is not readable.
    """
    # Explicit exceptions instead of assert, which is stripped under -O.
    if not os.path.isfile(filename):
        raise FileNotFoundError("no such data file: %r" % (filename,))
    if not os.access(filename, os.R_OK):
        raise PermissionError("data file is not readable: %r" % (filename,))
    df = pd.read_csv(filename, sep=',', header=None)

    # DataFrame.as_matrix() was removed in pandas 1.0; .values works everywhere.
    data = df.values
    X = data[:, :-1]
    y = data[:, -1]
    del df, data

    X = X + np.random.normal(0, 0.001, X.shape)  # to prevent numerical problem

    if len(np.unique(y)) > 15:
        # With more than 15 distinct target values the problem is turned into
        # a binary one by thresholding at the median.
        b = np.percentile(y, 50)
        y = np.where(y < b, 0, 1)
        assert X.shape[0] == y.shape[0]
        y = np.reshape(y, [X.shape[0], 1])

    errormatrix = np.zeros((num_splits, len(train_percent)))
    # perform for num-splits iterations
    for i in range(num_splits):
        print("------------------------------------------------------------")
        print(" splitting the input data-frame into 80-20 train-test data..")
        print("------------------------------------------------------------")
        X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2)
        Y_train = np.reshape(Y_train, [X_train.shape[0], 1])
        Y_test = np.reshape(Y_test, [X_test.shape[0], 1])

        # percentage of the training data to be used for the model
        for j, p in enumerate(train_percent):
            selection = rsubindices(Y_train, p)
            train = X_train[selection, :]
            labels = Y_train[selection]
            model = LogisticRegression(train, labels).build()
            errormatrix[i, j] = model.validate(X_test, Y_test)
        print()
    errmu = np.mean(errormatrix, axis=0)
    errsigma = np.std(errormatrix, axis=0, ddof=1)

    print("Mean test-error"+str(errmu))
    print("Std test-error"+str(errsigma))

In [8]:
# Evaluate on boston.csv over 10 random 80/20 splits; the mean and standard
# deviation of the test error per training percentage are printed at the end.
logisticRegression(filename='boston.csv', num_splits=10)

------------------------------------------------------------
 splitting the input data-frame into 80-20 train-test data..
------------------------------------------------------------

Building the Logistic regression model.. 

Building the Logistic regression model.. 

Building the Logistic regression model.. 

Building the Logistic regression model.. 

Building the Logistic regression model.. 

------------------------------------------------------------
 splitting the input data-frame into 80-20 train-test data..
------------------------------------------------------------

Building the Logistic regression model.. 

Building the Logistic regression model.. 

Building the Logistic regression model.. 

Building the Logistic regression model.. 

Building the Logistic regression model.. 

------------------------------------------------------------
 splitting the input data-frame into 80-20 train-test data..
------------------------------------------------------------

Building the Logis

In [84]:
# Evaluate on digits.csv over 10 random 80/20 splits.
# NOTE(review): the saved output of this cell contains "IRLS for ..." progress
# lines that the current _IRLS no longer prints - re-run the cell to refresh it.
logisticRegression(filename='digits.csv', num_splits=10)

------------------------------------------------------------
 splitting the input data-frame into 80-20 train-test data..
------------------------------------------------------------
Building the Logistic regression model.. 
IRLS for 0 :: Tolerance is 22.001845628
IRLS for 0 :: Tolerance is 0.0230583010952
IRLS for 1 :: Tolerance is 49.4299067778
IRLS for 1 :: Tolerance is 0.0085055815492
IRLS for 1 :: Tolerance is 0.00258769718641
IRLS for 2 :: Tolerance is 25.5183786199
IRLS for 3 :: Tolerance is 34.2594080478
IRLS for 3 :: Tolerance is 0.0191542500396
IRLS for 4 :: Tolerance is 19.0785674023
IRLS for 4 :: Tolerance is 0.00495372622934
IRLS for 5 :: Tolerance is 41.1605550938
IRLS for 5 :: Tolerance is 0.0044765213084
IRLS for 6 :: Tolerance is 34.8386026295
IRLS for 6 :: Tolerance is 0.00230526316722
IRLS for 7 :: Tolerance is 9.85901811374
IRLS for 8 :: Tolerance is 50.0749442965
IRLS for 9 :: Tolerance is 15.9973719894
IRLS for 9 :: Tolerance is 0.00518629619597
Building the Logis

IRLS for 7 :: Tolerance is 0.00277193419937
IRLS for 8 :: Tolerance is 54.8966865252
IRLS for 8 :: Tolerance is 0.0214225760454
IRLS for 9 :: Tolerance is 38.6078571828
IRLS for 9 :: Tolerance is 0.0139764283174
Building the Logistic regression model.. 
IRLS for 0 :: Tolerance is 7.30852477048
IRLS for 1 :: Tolerance is 17.0191329683
IRLS for 1 :: Tolerance is 0.175551429121
IRLS for 1 :: Tolerance is 0.0164582497306
IRLS for 1 :: Tolerance is 0.00153918129475
IRLS for 2 :: Tolerance is 20.534120178
IRLS for 3 :: Tolerance is 19.5376729619
IRLS for 3 :: Tolerance is 0.00205557915407
IRLS for 3 :: Tolerance is 0.00174514815104
IRLS for 3 :: Tolerance is 0.00139449117359
IRLS for 3 :: Tolerance is 0.00106781372682
IRLS for 4 :: Tolerance is 10.2223797591
IRLS for 5 :: Tolerance is 8.29749020539
IRLS for 6 :: Tolerance is 13.0690361876
IRLS for 7 :: Tolerance is 14.1919502882
IRLS for 7 :: Tolerance is 0.0296238403535
IRLS for 8 :: Tolerance is 14.0551187859
IRLS for 9 :: Tolerance is 9.5

IRLS for 8 :: Tolerance is 22.0806534624
IRLS for 9 :: Tolerance is 27.5055223447
Building the Logistic regression model.. 
IRLS for 0 :: Tolerance is 7.51636297112
IRLS for 1 :: Tolerance is 16.2477790969
IRLS for 2 :: Tolerance is 6.3753016188
IRLS for 3 :: Tolerance is 10.2982078428
IRLS for 4 :: Tolerance is 6.89066883041
IRLS for 4 :: Tolerance is 0.00708983750396
IRLS for 5 :: Tolerance is 8.83606815964
IRLS for 6 :: Tolerance is 8.29718168755
IRLS for 7 :: Tolerance is 10.8384235549
IRLS for 8 :: Tolerance is 7.17619668532
IRLS for 9 :: Tolerance is 8.55967891693
Building the Logistic regression model.. 
IRLS for 0 :: Tolerance is 4.78979389792
IRLS for 0 :: Tolerance is 0.0102536762585
IRLS for 1 :: Tolerance is 10.9638257167
IRLS for 2 :: Tolerance is 3.24138779618
IRLS for 2 :: Tolerance is 0.00229076751013
IRLS for 3 :: Tolerance is 2.74611345006
IRLS for 3 :: Tolerance is 0.00243465249686
IRLS for 3 :: Tolerance is 0.00168834549487
IRLS for 3 :: Tolerance is 0.0011397623338

IRLS for 5 :: Tolerance is 5.67714437887
IRLS for 5 :: Tolerance is 0.00195412789695
IRLS for 6 :: Tolerance is 2.94251349463
IRLS for 7 :: Tolerance is 6.23182387412
IRLS for 7 :: Tolerance is 0.0204682006543
IRLS for 7 :: Tolerance is 0.000956415145444
IRLS for 8 :: Tolerance is 10.6829861803
IRLS for 9 :: Tolerance is 7.84376925611
Building the Logistic regression model.. 
IRLS for 0 :: Tolerance is 3.87909170975
IRLS for 0 :: Tolerance is 0.0207579155482
IRLS for 0 :: Tolerance is 0.00130575562452
IRLS for 1 :: Tolerance is 8.73819927877
IRLS for 1 :: Tolerance is 0.00396078691405
IRLS for 1 :: Tolerance is 0.00166852440942
IRLS for 2 :: Tolerance is 1.56065653425
IRLS for 3 :: Tolerance is 2.18412181221
IRLS for 4 :: Tolerance is 5.83891837268
IRLS for 4 :: Tolerance is 0.00476221136163
IRLS for 5 :: Tolerance is 5.68440850041
IRLS for 5 :: Tolerance is 0.000981206807119
IRLS for 6 :: Tolerance is 1.60039416247
IRLS for 7 :: Tolerance is 4.49245592628
IRLS for 8 :: Tolerance is 7.

IRLS for 6 :: Tolerance is 3.13811250887
IRLS for 7 :: Tolerance is 6.20292430799
IRLS for 8 :: Tolerance is 4.82996200064
IRLS for 9 :: Tolerance is 2.6335756098

------------------------------------------------------------
 splitting the input data-frame into 80-20 train-test data..
------------------------------------------------------------
Building the Logistic regression model.. 
IRLS for 0 :: Tolerance is 20.6098869148
IRLS for 0 :: Tolerance is 0.000997165486433
IRLS for 1 :: Tolerance is 27.8349488974
IRLS for 1 :: Tolerance is 0.152355702459
IRLS for 2 :: Tolerance is 26.4576641506
IRLS for 3 :: Tolerance is 24.1220991522
IRLS for 3 :: Tolerance is 0.0598967629092
IRLS for 3 :: Tolerance is 0.00193060597491
IRLS for 4 :: Tolerance is 24.8319700219
IRLS for 4 :: Tolerance is 0.00563141974786
IRLS for 5 :: Tolerance is 16.5233990811
IRLS for 5 :: Tolerance is 0.0147294594725
IRLS for 6 :: Tolerance is 28.5130108313
IRLS for 6 :: Tolerance is 0.0108720366227
IRLS for 6 :: Tolera