In [1]:
import numpy as np
import pandas as pd

# Implementation

The machine learning algorithm for logistic regression is given below. The similarity to the process of linear regression means that we could probably create a single Regression class and take Linear and Logistic as children. This will be implemented in the future. 

Also, note that X should ideally be normalized (centered and scaled) before the it is used to train the model. This is to avoid any log(0) errors from extremely small y_hat (due to large values in X). This can be fixed by instantiating w and b appropriately within the model as well, which will be fixed in the future.

In [27]:
# Given 2 1D ndarrays, gives Mean Squared Error of the two
def bin_cross_entropy(y,p):
    if len(y) != len(p):
        return "ndarrays are length " + len(y) + " and " + len(p) + ". Please provide 2 same-length ndarrays."
    return (-1/len(y))*sum([np.log(p[i]) if y[i]==1 else np.log(1-p[i]) for i in range(len(y))])

In [57]:
class LogisticRegression:
    def __init__(self):
        self.w = None
        self.b = None
        self.alpha = None
        self.iter = 0
    
    # Train model with gradient descent
    # X: feature data, y: outcome data
    # costf: cost function
    # alpha: learning rate
    # n_iter: number of iterations to run
    # epsilon: threshold of cost change; if cost change < epsilon after some iteration, then gradient descent ends
    def train(self, X, y, costf=bin_cross_entropy, alpha=0.0001, n_iter=10000, epsilon=0):
        
        # Edge cases
        if len(X.shape)!=2:
            return "Please enter valid training data. X must be 2 dimensional ndarray."
        elif y.size<2:
            return "Need at least 2 data points to do linear regression."
        elif X.shape[0]!=y.size:
            return "Number of data points in X and y do not match."
        elif self.iter>0:
            return "This model has already been trained. Please instantiate another model."
        
        point_n, feat_n = X.shape
        
        # Random w, b initialization
        w = np.random.rand(feat_n)
        b = np.random.rand()
        
        # Training model
        y_hat = 1/(1+np.exp(-(np.dot(w, X.T) + b)))
        print(y_hat)
        cost=costf(y, y_hat)
        cost_change=cost
        while (self.iter<n_iter) & (cost_change>epsilon):
            
            ## Gradient descent
            dw = np.array([(-(2/point_n)*np.dot(X[:,j], y-y_hat)) for j in range(feat_n)])
            db = -(2/point_n)*sum(y-y_hat)
            w -= alpha*dw
            b -= alpha*db
            
            y_hat = 1/(1+np.exp(-(np.dot(w, X.T) + b)))
            cost_change= cost - costf(y, y_hat)
            cost -= cost_change
            
            self.iter+=1
            print("Iteration %d: Cost has improved by %7.5f" % (self.iter,cost_change))
            
        
        # Set attributes
        self.w=w
        self.b=b
        self.alpha=alpha
        
        return "Done"
    
    def predict(self, xp):
        return 1/(1+np.exp(-(np.dot(self.w, xp.T) + self.b)))

In [71]:
model = LogisticRegression()

In [72]:
np.random.seed(64)
X=np.random.rand(np.random.randint(100),np.random.randint(100))*np.random.randint(4)
y=np.random.randint(2, size=X.shape[0])
X, y

(array([[5.95592513e-01, 4.49858899e-01, 4.57019825e-01, ...,
         3.06022763e-01, 4.23342451e-01, 3.94106188e-01],
        [5.69013591e-01, 5.75177869e-01, 6.23282804e-01, ...,
         1.07540770e-01, 2.85563797e-01, 4.56019416e-02],
        [9.60817542e-01, 1.61641473e-02, 6.03501774e-01, ...,
         9.68583928e-01, 5.26287581e-01, 1.61175945e-01],
        ...,
        [3.79081019e-01, 6.35963956e-01, 6.14019674e-01, ...,
         5.09330327e-01, 6.03897314e-01, 6.81348681e-01],
        [4.73116121e-01, 5.96572735e-01, 6.47931130e-01, ...,
         1.22165808e-01, 8.43884355e-01, 2.06653080e-01],
        [5.09360919e-01, 2.43914499e-04, 8.57397280e-01, ...,
         6.44847581e-01, 8.60275139e-01, 4.69754545e-01]]),
 array([0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0,
        1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0,
        1, 1]))

In [73]:
X.shape, y.shape

((68, 38), (68,))

In [74]:
model.train(X, y, alpha=0.01, epsilon=0.00001)

[0.99976723 0.99996199 0.99998885 0.99992218 0.99986258 0.99998978
 0.99998093 0.99996448 0.99999606 0.99996734 0.99990538 0.99998366
 0.99998061 0.99998593 0.99998357 0.99994162 0.99998302 0.9995181
 0.99999418 0.99999384 0.99986968 0.99999145 0.99997223 0.99991629
 0.99999362 0.9999173  0.99994573 0.99996703 0.99998242 0.99998217
 0.99998931 0.999971   0.99993008 0.99997619 0.99998707 0.99995184
 0.99999076 0.99999346 0.99988377 0.99998561 0.99999248 0.9999729
 0.99992771 0.99996506 0.99997937 0.99999299 0.99999364 0.9999701
 0.99999238 0.99999346 0.9999975  0.99996823 0.99997934 0.9999939
 0.9999703  0.99998428 0.99993105 0.99988024 0.99999182 0.99981663
 0.99997876 0.99990071 0.9999961  0.99994023 0.99999556 0.99993102
 0.99999395 0.99998732]
Iteration 1: Cost has improved by 0.04540
Iteration 2: Cost has improved by 0.04540
Iteration 3: Cost has improved by 0.04540
Iteration 4: Cost has improved by 0.04540
Iteration 5: Cost has improved by 0.04540
Iteration 6: Cost has improved by

Iteration 563: Cost has improved by 0.00018
Iteration 564: Cost has improved by 0.00018
Iteration 565: Cost has improved by 0.00018
Iteration 566: Cost has improved by 0.00018
Iteration 567: Cost has improved by 0.00018
Iteration 568: Cost has improved by 0.00018
Iteration 569: Cost has improved by 0.00018
Iteration 570: Cost has improved by 0.00018
Iteration 571: Cost has improved by 0.00018
Iteration 572: Cost has improved by 0.00018
Iteration 573: Cost has improved by 0.00018
Iteration 574: Cost has improved by 0.00018
Iteration 575: Cost has improved by 0.00018
Iteration 576: Cost has improved by 0.00018
Iteration 577: Cost has improved by 0.00018
Iteration 578: Cost has improved by 0.00017
Iteration 579: Cost has improved by 0.00017
Iteration 580: Cost has improved by 0.00017
Iteration 581: Cost has improved by 0.00017
Iteration 582: Cost has improved by 0.00017
Iteration 583: Cost has improved by 0.00017
Iteration 584: Cost has improved by 0.00017
Iteration 585: Cost has improved

Iteration 1092: Cost has improved by 0.00011
Iteration 1093: Cost has improved by 0.00011
Iteration 1094: Cost has improved by 0.00011
Iteration 1095: Cost has improved by 0.00011
Iteration 1096: Cost has improved by 0.00011
Iteration 1097: Cost has improved by 0.00011
Iteration 1098: Cost has improved by 0.00011
Iteration 1099: Cost has improved by 0.00011
Iteration 1100: Cost has improved by 0.00011
Iteration 1101: Cost has improved by 0.00011
Iteration 1102: Cost has improved by 0.00011
Iteration 1103: Cost has improved by 0.00011
Iteration 1104: Cost has improved by 0.00011
Iteration 1105: Cost has improved by 0.00011
Iteration 1106: Cost has improved by 0.00011
Iteration 1107: Cost has improved by 0.00011
Iteration 1108: Cost has improved by 0.00011
Iteration 1109: Cost has improved by 0.00011
Iteration 1110: Cost has improved by 0.00011
Iteration 1111: Cost has improved by 0.00011
Iteration 1112: Cost has improved by 0.00011
Iteration 1113: Cost has improved by 0.00011
Iteration 

Iteration 1584: Cost has improved by 0.00008
Iteration 1585: Cost has improved by 0.00008
Iteration 1586: Cost has improved by 0.00008
Iteration 1587: Cost has improved by 0.00007
Iteration 1588: Cost has improved by 0.00007
Iteration 1589: Cost has improved by 0.00007
Iteration 1590: Cost has improved by 0.00007
Iteration 1591: Cost has improved by 0.00007
Iteration 1592: Cost has improved by 0.00007
Iteration 1593: Cost has improved by 0.00007
Iteration 1594: Cost has improved by 0.00007
Iteration 1595: Cost has improved by 0.00007
Iteration 1596: Cost has improved by 0.00007
Iteration 1597: Cost has improved by 0.00007
Iteration 1598: Cost has improved by 0.00007
Iteration 1599: Cost has improved by 0.00007
Iteration 1600: Cost has improved by 0.00007
Iteration 1601: Cost has improved by 0.00007
Iteration 1602: Cost has improved by 0.00007
Iteration 1603: Cost has improved by 0.00007
Iteration 1604: Cost has improved by 0.00007
Iteration 1605: Cost has improved by 0.00007
Iteration 

Iteration 2209: Cost has improved by 0.00005
Iteration 2210: Cost has improved by 0.00005
Iteration 2211: Cost has improved by 0.00005
Iteration 2212: Cost has improved by 0.00005
Iteration 2213: Cost has improved by 0.00005
Iteration 2214: Cost has improved by 0.00005
Iteration 2215: Cost has improved by 0.00005
Iteration 2216: Cost has improved by 0.00005
Iteration 2217: Cost has improved by 0.00005
Iteration 2218: Cost has improved by 0.00005
Iteration 2219: Cost has improved by 0.00005
Iteration 2220: Cost has improved by 0.00005
Iteration 2221: Cost has improved by 0.00005
Iteration 2222: Cost has improved by 0.00005
Iteration 2223: Cost has improved by 0.00005
Iteration 2224: Cost has improved by 0.00005
Iteration 2225: Cost has improved by 0.00005
Iteration 2226: Cost has improved by 0.00005
Iteration 2227: Cost has improved by 0.00005
Iteration 2228: Cost has improved by 0.00005
Iteration 2229: Cost has improved by 0.00005
Iteration 2230: Cost has improved by 0.00005
Iteration 

Iteration 2775: Cost has improved by 0.00004
Iteration 2776: Cost has improved by 0.00004
Iteration 2777: Cost has improved by 0.00004
Iteration 2778: Cost has improved by 0.00004
Iteration 2779: Cost has improved by 0.00004
Iteration 2780: Cost has improved by 0.00004
Iteration 2781: Cost has improved by 0.00004
Iteration 2782: Cost has improved by 0.00004
Iteration 2783: Cost has improved by 0.00004
Iteration 2784: Cost has improved by 0.00004
Iteration 2785: Cost has improved by 0.00004
Iteration 2786: Cost has improved by 0.00004
Iteration 2787: Cost has improved by 0.00004
Iteration 2788: Cost has improved by 0.00004
Iteration 2789: Cost has improved by 0.00004
Iteration 2790: Cost has improved by 0.00004
Iteration 2791: Cost has improved by 0.00004
Iteration 2792: Cost has improved by 0.00004
Iteration 2793: Cost has improved by 0.00004
Iteration 2794: Cost has improved by 0.00004
Iteration 2795: Cost has improved by 0.00004
Iteration 2796: Cost has improved by 0.00004
Iteration 

Iteration 3287: Cost has improved by 0.00003
Iteration 3288: Cost has improved by 0.00003
Iteration 3289: Cost has improved by 0.00003
Iteration 3290: Cost has improved by 0.00003
Iteration 3291: Cost has improved by 0.00003
Iteration 3292: Cost has improved by 0.00003
Iteration 3293: Cost has improved by 0.00003
Iteration 3294: Cost has improved by 0.00003
Iteration 3295: Cost has improved by 0.00003
Iteration 3296: Cost has improved by 0.00003
Iteration 3297: Cost has improved by 0.00003
Iteration 3298: Cost has improved by 0.00003
Iteration 3299: Cost has improved by 0.00003
Iteration 3300: Cost has improved by 0.00003
Iteration 3301: Cost has improved by 0.00003
Iteration 3302: Cost has improved by 0.00003
Iteration 3303: Cost has improved by 0.00003
Iteration 3304: Cost has improved by 0.00003
Iteration 3305: Cost has improved by 0.00003
Iteration 3306: Cost has improved by 0.00003
Iteration 3307: Cost has improved by 0.00003
Iteration 3308: Cost has improved by 0.00003
Iteration 

Iteration 3794: Cost has improved by 0.00003
Iteration 3795: Cost has improved by 0.00003
Iteration 3796: Cost has improved by 0.00003
Iteration 3797: Cost has improved by 0.00003
Iteration 3798: Cost has improved by 0.00003
Iteration 3799: Cost has improved by 0.00003
Iteration 3800: Cost has improved by 0.00003
Iteration 3801: Cost has improved by 0.00003
Iteration 3802: Cost has improved by 0.00003
Iteration 3803: Cost has improved by 0.00003
Iteration 3804: Cost has improved by 0.00003
Iteration 3805: Cost has improved by 0.00003
Iteration 3806: Cost has improved by 0.00003
Iteration 3807: Cost has improved by 0.00003
Iteration 3808: Cost has improved by 0.00003
Iteration 3809: Cost has improved by 0.00003
Iteration 3810: Cost has improved by 0.00003
Iteration 3811: Cost has improved by 0.00003
Iteration 3812: Cost has improved by 0.00003
Iteration 3813: Cost has improved by 0.00003
Iteration 3814: Cost has improved by 0.00003
Iteration 3815: Cost has improved by 0.00003
Iteration 

Iteration 4342: Cost has improved by 0.00002
Iteration 4343: Cost has improved by 0.00002
Iteration 4344: Cost has improved by 0.00002
Iteration 4345: Cost has improved by 0.00002
Iteration 4346: Cost has improved by 0.00002
Iteration 4347: Cost has improved by 0.00002
Iteration 4348: Cost has improved by 0.00002
Iteration 4349: Cost has improved by 0.00002
Iteration 4350: Cost has improved by 0.00002
Iteration 4351: Cost has improved by 0.00002
Iteration 4352: Cost has improved by 0.00002
Iteration 4353: Cost has improved by 0.00002
Iteration 4354: Cost has improved by 0.00002
Iteration 4355: Cost has improved by 0.00002
Iteration 4356: Cost has improved by 0.00002
Iteration 4357: Cost has improved by 0.00002
Iteration 4358: Cost has improved by 0.00002
Iteration 4359: Cost has improved by 0.00002
Iteration 4360: Cost has improved by 0.00002
Iteration 4361: Cost has improved by 0.00002
Iteration 4362: Cost has improved by 0.00002
Iteration 4363: Cost has improved by 0.00002
Iteration 

Iteration 4838: Cost has improved by 0.00002
Iteration 4839: Cost has improved by 0.00002
Iteration 4840: Cost has improved by 0.00002
Iteration 4841: Cost has improved by 0.00002
Iteration 4842: Cost has improved by 0.00002
Iteration 4843: Cost has improved by 0.00002
Iteration 4844: Cost has improved by 0.00002
Iteration 4845: Cost has improved by 0.00002
Iteration 4846: Cost has improved by 0.00002
Iteration 4847: Cost has improved by 0.00002
Iteration 4848: Cost has improved by 0.00002
Iteration 4849: Cost has improved by 0.00002
Iteration 4850: Cost has improved by 0.00002
Iteration 4851: Cost has improved by 0.00002
Iteration 4852: Cost has improved by 0.00002
Iteration 4853: Cost has improved by 0.00002
Iteration 4854: Cost has improved by 0.00002
Iteration 4855: Cost has improved by 0.00002
Iteration 4856: Cost has improved by 0.00002
Iteration 4857: Cost has improved by 0.00002
Iteration 4858: Cost has improved by 0.00002
Iteration 4859: Cost has improved by 0.00002
Iteration 

Iteration 5422: Cost has improved by 0.00002
Iteration 5423: Cost has improved by 0.00002
Iteration 5424: Cost has improved by 0.00002
Iteration 5425: Cost has improved by 0.00002
Iteration 5426: Cost has improved by 0.00002
Iteration 5427: Cost has improved by 0.00002
Iteration 5428: Cost has improved by 0.00002
Iteration 5429: Cost has improved by 0.00002
Iteration 5430: Cost has improved by 0.00002
Iteration 5431: Cost has improved by 0.00002
Iteration 5432: Cost has improved by 0.00002
Iteration 5433: Cost has improved by 0.00002
Iteration 5434: Cost has improved by 0.00002
Iteration 5435: Cost has improved by 0.00002
Iteration 5436: Cost has improved by 0.00002
Iteration 5437: Cost has improved by 0.00002
Iteration 5438: Cost has improved by 0.00002
Iteration 5439: Cost has improved by 0.00002
Iteration 5440: Cost has improved by 0.00002
Iteration 5441: Cost has improved by 0.00002
Iteration 5442: Cost has improved by 0.00002
Iteration 5443: Cost has improved by 0.00002
Iteration 

Iteration 5982: Cost has improved by 0.00002
Iteration 5983: Cost has improved by 0.00002
Iteration 5984: Cost has improved by 0.00002
Iteration 5985: Cost has improved by 0.00002
Iteration 5986: Cost has improved by 0.00002
Iteration 5987: Cost has improved by 0.00002
Iteration 5988: Cost has improved by 0.00002
Iteration 5989: Cost has improved by 0.00002
Iteration 5990: Cost has improved by 0.00002
Iteration 5991: Cost has improved by 0.00002
Iteration 5992: Cost has improved by 0.00002
Iteration 5993: Cost has improved by 0.00002
Iteration 5994: Cost has improved by 0.00002
Iteration 5995: Cost has improved by 0.00002
Iteration 5996: Cost has improved by 0.00002
Iteration 5997: Cost has improved by 0.00002
Iteration 5998: Cost has improved by 0.00002
Iteration 5999: Cost has improved by 0.00002
Iteration 6000: Cost has improved by 0.00002
Iteration 6001: Cost has improved by 0.00002
Iteration 6002: Cost has improved by 0.00002
Iteration 6003: Cost has improved by 0.00002
Iteration 

Iteration 6452: Cost has improved by 0.00001
Iteration 6453: Cost has improved by 0.00001
Iteration 6454: Cost has improved by 0.00001
Iteration 6455: Cost has improved by 0.00001
Iteration 6456: Cost has improved by 0.00001
Iteration 6457: Cost has improved by 0.00001
Iteration 6458: Cost has improved by 0.00001
Iteration 6459: Cost has improved by 0.00001
Iteration 6460: Cost has improved by 0.00001
Iteration 6461: Cost has improved by 0.00001
Iteration 6462: Cost has improved by 0.00001
Iteration 6463: Cost has improved by 0.00001
Iteration 6464: Cost has improved by 0.00001
Iteration 6465: Cost has improved by 0.00001
Iteration 6466: Cost has improved by 0.00001
Iteration 6467: Cost has improved by 0.00001
Iteration 6468: Cost has improved by 0.00001
Iteration 6469: Cost has improved by 0.00001
Iteration 6470: Cost has improved by 0.00001
Iteration 6471: Cost has improved by 0.00001
Iteration 6472: Cost has improved by 0.00001
Iteration 6473: Cost has improved by 0.00001
Iteration 

Iteration 7017: Cost has improved by 0.00001
Iteration 7018: Cost has improved by 0.00001
Iteration 7019: Cost has improved by 0.00001
Iteration 7020: Cost has improved by 0.00001
Iteration 7021: Cost has improved by 0.00001
Iteration 7022: Cost has improved by 0.00001
Iteration 7023: Cost has improved by 0.00001
Iteration 7024: Cost has improved by 0.00001
Iteration 7025: Cost has improved by 0.00001
Iteration 7026: Cost has improved by 0.00001
Iteration 7027: Cost has improved by 0.00001
Iteration 7028: Cost has improved by 0.00001
Iteration 7029: Cost has improved by 0.00001
Iteration 7030: Cost has improved by 0.00001
Iteration 7031: Cost has improved by 0.00001
Iteration 7032: Cost has improved by 0.00001
Iteration 7033: Cost has improved by 0.00001
Iteration 7034: Cost has improved by 0.00001
Iteration 7035: Cost has improved by 0.00001
Iteration 7036: Cost has improved by 0.00001
Iteration 7037: Cost has improved by 0.00001
Iteration 7038: Cost has improved by 0.00001
Iteration 

Iteration 7548: Cost has improved by 0.00001
Iteration 7549: Cost has improved by 0.00001
Iteration 7550: Cost has improved by 0.00001
Iteration 7551: Cost has improved by 0.00001
Iteration 7552: Cost has improved by 0.00001
Iteration 7553: Cost has improved by 0.00001
Iteration 7554: Cost has improved by 0.00001
Iteration 7555: Cost has improved by 0.00001
Iteration 7556: Cost has improved by 0.00001
Iteration 7557: Cost has improved by 0.00001
Iteration 7558: Cost has improved by 0.00001
Iteration 7559: Cost has improved by 0.00001
Iteration 7560: Cost has improved by 0.00001
Iteration 7561: Cost has improved by 0.00001
Iteration 7562: Cost has improved by 0.00001
Iteration 7563: Cost has improved by 0.00001
Iteration 7564: Cost has improved by 0.00001
Iteration 7565: Cost has improved by 0.00001
Iteration 7566: Cost has improved by 0.00001
Iteration 7567: Cost has improved by 0.00001
Iteration 7568: Cost has improved by 0.00001
Iteration 7569: Cost has improved by 0.00001
Iteration 

Iteration 8046: Cost has improved by 0.00001
Iteration 8047: Cost has improved by 0.00001
Iteration 8048: Cost has improved by 0.00001
Iteration 8049: Cost has improved by 0.00001
Iteration 8050: Cost has improved by 0.00001
Iteration 8051: Cost has improved by 0.00001
Iteration 8052: Cost has improved by 0.00001
Iteration 8053: Cost has improved by 0.00001
Iteration 8054: Cost has improved by 0.00001
Iteration 8055: Cost has improved by 0.00001
Iteration 8056: Cost has improved by 0.00001
Iteration 8057: Cost has improved by 0.00001
Iteration 8058: Cost has improved by 0.00001
Iteration 8059: Cost has improved by 0.00001
Iteration 8060: Cost has improved by 0.00001
Iteration 8061: Cost has improved by 0.00001
Iteration 8062: Cost has improved by 0.00001
Iteration 8063: Cost has improved by 0.00001
Iteration 8064: Cost has improved by 0.00001
Iteration 8065: Cost has improved by 0.00001
Iteration 8066: Cost has improved by 0.00001
Iteration 8067: Cost has improved by 0.00001
Iteration 

'Done'

In [75]:
model.w, model.b

(array([-1.19997907,  0.50503241, -1.4125149 , -0.31041791,  2.12066055,
         0.10513009,  1.14853937, -0.45347825, -0.69102086, -1.04056827,
         2.01037976,  1.38516037, -0.82250319,  0.62731282, -0.40310207,
        -0.44999874, -0.82686705, -0.79271969,  1.03313151,  1.74427869,
         0.38162349, -0.37987982,  0.7572579 ,  0.25678353, -0.78173829,
         0.789525  , -1.06728481, -0.06989186, -0.09596742, -0.75199859,
        -0.30910591,  1.07947821,  2.3552682 , -1.08525112,  0.74180729,
        -1.3617656 , -0.5692986 ,  0.26144652]),
 -1.0104785456339687)

In [86]:
np.random.seed(64)
xp = np.random.rand(10,X.shape[1])*np.random.randint(1,4)
model.predict(xp)

array([0.97756853, 0.73560202, 0.58785962, 0.02332875, 0.31476799,
       0.74196372, 0.23940931, 0.24159811, 0.20075119, 0.87604558])