In [49]:
import numpy as np
from math import *

In [50]:
# sigmoid function
def sigma(x):
    """Logistic sigmoid ``1 / (1 + e**(-x))`` of a scalar ``x``.

    ``exp`` raises ``OverflowError`` once its argument grows past roughly
    709, so the plain formula fails for very negative ``x``. In that regime
    ``e**(-x)`` dwarfs the 1 in the denominator and the sigmoid equals
    ``e**x`` to within floating-point precision, so that value is returned
    instead.

    Every input that did not overflow before is evaluated with the original
    expression, so existing results are unchanged.
    """
    try:
        return 1/(1+exp(-x))
    except OverflowError:
        # x is hugely negative: 1/(1+e^-x) ~= e^x, which underflows
        # gracefully towards 0.0 instead of raising.
        return exp(x)

# derivative of sigmoid
def sigma_der(x):
    """Derivative of the logistic sigmoid at a scalar ``x``.

    Mathematically ``e**(-x) / (1 + e**(-x))**2``, which is the same as
    ``sigma(x) * (1 - sigma(x))``. The derivative is an even function, so it
    is evaluated at ``-|x|``: the exponential then lies in (0, 1] and neither
    ``exp`` nor the squaring can overflow, which the direct formula does for
    strongly negative ``x``. For ``x >= 0`` the computation is identical to
    the direct formula.
    """
    e = exp(-abs(x))
    return e/((1+e)**2)

# preparing data
def prep(x_1,x_2):
    """Return the sample vector ``[1, x_1, x_2]`` as a NumPy array.

    The leading 1 is the constant entry that pairs with the first weight
    when the vector is dotted with the weight vector.
    """
    bias_and_features = (1, x_1, x_2)
    return np.array(bias_and_features)

### Log-likelihood equation using cross entropy
$$J(\vec{w})=\displaystyle\sum_{i}\left[y_i\log(p_1(\vec{w},\vec{x}_i))+(1-y_i)\log(p_0(\vec{w},\vec{x}_i))\right]$$
where
$$p_1(\vec{w},\vec{x}_i)=\sigma(\vec{w}\cdot\vec{x}_i)$$
$$p_0(\vec{w},\vec{x}_i)=1-\sigma(\vec{w}\cdot\vec{x}_i)$$
### Maximising the log-likelihood is the same as minimising $-J(\vec{w})$
$$\dfrac{d}{d\vec{w}}(-J(\vec{w}))=\displaystyle\sum_{i}(\sigma(\vec{w}\cdot\vec{x}_i)-y_i)\vec{x}_i$$
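### Using gradient descent to find the optimal parameters
$$\vec{w}^{(t+1)}=\vec{w}^{(t)}-\gamma\,\dfrac{d}{d\vec{w}}(-J(\vec{w}))\Big|_{\vec{w}=\vec{w}^{(t)}}$$
where $t$ counts gradient-descent iterations (not samples) and $\gamma$ is the learning rate.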

In [51]:
# implementing logistic regression using cross entropy and gradient descent
def log_reg(X,Y,W_init,gamma,tol=1e-3):
    """Fit logistic-regression weights by batch gradient descent.

    Every pass accumulates the step
    ``gamma * sum_i (sigma(w @ X[i]) - Y[i]) * X[i]`` over all rows of ``X``
    and subtracts it from the weights. As soon as the Euclidean norm of that
    step is no longer ``>= tol`` the current weights are returned without
    applying the step. There is no cap on the number of passes.

    Parameters
    ----------
    X : array whose rows ``X[i]`` are the samples (first axis = samples).
    Y : targets, ``Y[i]`` belonging to ``X[i]``.
    W_init : starting weights. They are copied; the caller's array is left
        untouched.
    gamma : learning rate multiplying each gradient term.
    tol : threshold on the step norm (default 1e-3, the value that used to
        be hard-coded).

    Returns
    -------
    numpy.ndarray
        Float weight vector at which the stopping test fired.

    Side effect: prints the weights once, right after the first update.
    """
    # Float copy: the in-place ``w -= step`` below must neither overwrite the
    # caller's array nor fail on an integer-typed initial guess.
    w = np.array(W_init, dtype=float)
    n_samples = X.shape[0]
    first_update_pending = True

    while True:
        step = 0
        for i in range(n_samples):
            z = w @ X[i]
            step += gamma*(sigma(z)-Y[i])*X[i]

        # Kept as ">= tol ... else return" so a NaN norm still terminates.
        if np.linalg.norm(step) >= tol:
            w -= step
        else:
            return w

        if first_update_pending:
            first_update_pending = False
            print("after 1 iteration, weights are:",w)
            

In [52]:
# Training set: raw (x_1, x_2) pairs, each turned into a [1, x_1, x_2] row by prep().
tr_X=np.array([prep(x_1,x_2) for x_1,x_2 in [
    (0.346,0.78),(0.303,0.439),(0.358,0.729),
    (0.602,0.863),(0.790,0.753),(0.611,0.965),
]])
print(tr_X)
tr_Y=np.array([0,0,0,1,1,1])

# Test set, built the same way.
te_X=np.array([prep(x_1,x_2) for x_1,x_2 in [
    (0.959,0.382),(0.750,0.306),(0.395,0.760),
    (0.823,0.764),(0.761,0.874),(0.844,0.435),
]])
te_Y=np.array([0,0,0,1,1,1])
print(te_X)

# Initial weights and learning rate used for the fit.
W=np.array([-1,1.5,0.5])
lr_rate=0.1

[[1.    0.346 0.78 ]
 [1.    0.303 0.439]
 [1.    0.358 0.729]
 [1.    0.602 0.863]
 [1.    0.79  0.753]
 [1.    0.611 0.965]]
[[1.    0.959 0.382]
 [1.    0.75  0.306]
 [1.    0.395 0.76 ]
 [1.    0.823 0.764]
 [1.    0.761 0.874]
 [1.    0.844 0.435]]


In [53]:
# Pass a copy of W so the initial weights survive the fit and this cell
# produces the same result every time it is re-run.
final_w=log_reg(tr_X,tr_Y,W.copy(),lr_rate)
n=te_X.shape[0]
y_pred=np.zeros(n) # one slot per test row instead of a hard-coded 6
for i in range(n):
    x=te_X[i]
    z=final_w@x
    # classify as 1 when the predicted probability reaches 0.5
    if(sigma(z)>=0.5):
        y_pred[i]=1
    else:
        y_pred[i]=0

print('Final Weights are:',final_w)
print("Prediction:",y_pred)

after 1 iteration, weights are: [-1.01899756  1.53210518  0.51181202]
Final Weights are: [-18.07774125  26.47820683   6.58909946]
Prediction: [1. 1. 0. 1. 1. 1.]


### a

$$\vec{w}=[-1, 1.5, 0.5]$$
$$P(\hat y=1|x_1,x_2)=\dfrac{1}{1+e^{-(-1+1.5x_1+0.5x_2)}}$$
$$P(\hat y=0|x_1,x_2)=1-\dfrac{1}{1+e^{-(-1+1.5x_1+0.5x_2)}}$$
$$J(\vec{w})=\displaystyle\sum_{i}\left[y_i\log(P(\hat y_i=1|\vec{x}_i))+(1-y_i)\log(P(\hat y_i=0|\vec{x}_i))\right]$$
where
$$\vec{x}=[1,x_1,x_2]$$

### b

$$\vec{w}=[-1.01899756,  1.53210518,  0.51181202]$$
$$P(\hat y=1|x_1,x_2)=\dfrac{1}{1+e^{-(-1.01899756+1.53210518x_1+0.51181202x_2)}}$$
$$P(\hat y=0|x_1,x_2)=1-\dfrac{1}{1+e^{-(-1.01899756+1.53210518x_1+0.51181202x_2)}}$$
$$J(\vec{w})=\displaystyle\sum_{i}\left[y_i\log(P(\hat y_i=1|\vec{x}_i))+(1-y_i)\log(P(\hat y_i=0|\vec{x}_i))\right]$$
where
$$\vec{x}=[1,x_1,x_2]$$

### c

In [54]:
# Derive the report from the fitted weights, the test labels and the
# predictions instead of pasting numbers in, so it cannot go stale.
actual=te_Y.astype(int)
predicted=y_pred.astype(int)

# Confusion-matrix counts (positive class = 1).
TP=int(np.sum((predicted==1)&(actual==1)))
TN=int(np.sum((predicted==0)&(actual==0)))
FP=int(np.sum((predicted==1)&(actual==0)))
FN=int(np.sum((predicted==0)&(actual==1)))

def _ratio(num,den):
    """Return num/den rounded to 2 decimals, or nan when den is zero."""
    return round(num/den,2) if den else float('nan')

# Python's round() keeps the printed weights at 8 decimals, as before.
print('For tolerance of 10^{-3}, gradient descent converges to the weights:',[round(float(v),8) for v in final_w])
print('Actual y:',actual.tolist())
print('Prediction:',predicted.tolist())
print('Accuracy=(TP+TN)/(TP+TN+FP+FN)={}/{}={:g}'.format(TP+TN,TP+TN+FP+FN,_ratio(TP+TN,TP+TN+FP+FN)))
print('Precision=TP/(TP+FP)={}/{}={:g}'.format(TP,TP+FP,_ratio(TP,TP+FP)))
print('Recall=TP/(TP+FN)={}/{}={:g}'.format(TP,TP+FN,_ratio(TP,TP+FN)))

For tolerance of 10^{-3}, gradient descent converges to the weights: [-18.07774125, 26.47820683, 6.58909946]
Actual y: [0.0, 0.0, 0.0, 1.0, 1.0, 1.0]
Prediction: [1.0, 1.0, 0.0, 1.0, 1.0, 1.0]
Accuracy=(TP+TN)/(TP+TN+FP+FN)=4/6=0.67
Precision=TP/(TP+FP)=3/5=0.6
Recall=TP/(TP+FN)=3/3=1
