In [240]:
import numpy as np
import pandas as pd
import matplotlib as plt

In [241]:
dataset = pd.read_csv('E:\ELL_project\problem1\health_data.csv')
dataset = dataset.sample(frac = 1)
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1:].values
datasize = X.shape[0]
X_train = X[:(datasize*7)//10,:]
y_train = y[:(datasize*7)//10,:]
X_test = X[(datasize*7)//10:,:]
y_test = y[(datasize*7)//10:,:]

# X_train_size = X_train.shape[0]
# X_train = np.append(np.ones((X_train_size,1),X_train)


In [242]:
def gradDesc(X,y,theta):
    h  = np.dot(X,theta)
    dsc = X.shape[0]
    loss = h-y
    total_loss = np.dot(np.ones((1,dsc)),np.square(loss))[0,0] /dsc
    update = np.dot(np.transpose(X),loss)
    update /= dsc
    return(total_loss,update)

In [243]:
def linReg(X,y,iter=100,alpha=0.01,batchSize=32):
    # print(X.shape)
    theta = np.random.random((X.shape[1],1))
    # print(theta)
    datasize = X.shape[0]
    loss_epoch = 0
    for i in range(iter):
        if((i+1)%1000==0 and i>0):
            print('Loss for {} iterations: {}'.format(i+1,loss_epoch))
        fro = 0
        loss_epoch = 0
        while(True):
            to = min(fro+batchSize,datasize)
            l,theta_grad = gradDesc(X[fro:to,:],y[fro:to,:],theta)
            # print(theta_grad)
            loss_epoch += l
            theta -= (alpha*theta_grad)
            fro = to

            if(to>=datasize):
                break
    
    return (theta,loss_epoch)

In [244]:
def feature_scaling(X_train):
    training_size = X_train.shape[0]
    X_mean = np.sum(X_train,axis=0) / training_size
    X_var = np.sqrt(np.sum((np.square(X-X_mean)),axis=0)/training_size)
    X_train_reg = (X_train - X_mean) / X_var
    return (X_mean,X_var,X_train_reg)

In [245]:
def poly_feat(X,degree=2):
    num_feats = X.shape[1]
    num_vals = X.shape[0]
    X_cross = X
    for i in range(num_feats):
        for j in range(i+1,num_feats):
            X_cross = np.append( X_cross ,  np.multiply ( X[:,i:i+1] , X[:,j:j+1] ) , axis=1 )
    for i in range(3,degree):
        X_power = np.power(X,i)
        X_cross = np.append(X_cross,X_power,axis=1)

    return ( X_cross )

In [246]:
def polyReg(X,y,iter=100,alpha=0.01,batchSize=32,degree=2):
    X_mean,X_var,X_norm = feature_scaling(X)
    X_cross = poly_feat(X_norm,degree)
    train_size = X_cross.shape[0]
    X_cross = np.append(np.ones((train_size,1)),X_cross,axis=1)
    opt, tl = linReg(X_cross,y,iter=iter,alpha=alpha,batchSize=batchSize)
    return ( X_mean, X_var, opt , tl )


In [260]:
train_datasize = X_train.shape[0]
X_mean, X_var, opt_theta, train_loss = polyReg(X_train,y_train,200000,0.01,train_datasize,6)
print(X_mean.shape)
print(opt_theta)

Loss for 1000 iterations: 0.08877955119098134
Loss for 2000 iterations: 0.08310755261649674
Loss for 3000 iterations: 0.08177666068637857
Loss for 4000 iterations: 0.08105549404721583
Loss for 5000 iterations: 0.08057493872595808
Loss for 6000 iterations: 0.08022060100730931
Loss for 7000 iterations: 0.07994532616399232
Loss for 8000 iterations: 0.07972540566938668
Loss for 9000 iterations: 0.07954690229650362
Loss for 10000 iterations: 0.0794006529873451
Loss for 11000 iterations: 0.07928014801890079
Loss for 12000 iterations: 0.07918050929490632
Loss for 13000 iterations: 0.07909794625635738
Loss for 14000 iterations: 0.07902944116421648
Loss for 15000 iterations: 0.07897255313902159
Loss for 16000 iterations: 0.07892528764831666
Loss for 17000 iterations: 0.07888600428359824
Loss for 18000 iterations: 0.07885334835555255
Loss for 19000 iterations: 0.07882619824861409
Loss for 20000 iterations: 0.07880362382089157
Loss for 21000 iterations: 0.0787848529350108
Loss for 22000 iteration

In [261]:
def predicter(X_mean, X_var, X_test, opt_theta,degree=2):
    X_test_norm = (X_test - X_mean) / X_var
    X_test_cross = poly_feat(X_test_norm,degree)
    test_size = X_test_cross.shape[0]
    X_test_cross = np.append( np.ones((test_size,1)) , X_test_cross , axis=1)

    # print(X_test_cross.shape)
    # print(opt_theta.shape)
    y_pred = np.dot(X_test_cross, opt_theta)

    return y_pred


In [262]:
def accuracy_metrics(X_mean,X_var,X_test,y_test,opt_theta,degree=2):
    y_pred = predicter(X_mean, X_var, X_test, opt_theta, degree)
    loss_y = y_pred - y_test

    test_size = y_pred.shape[0]
    total_loss_y = np.dot(np.ones((1,test_size)),np.square(loss_y))[0,0] / test_size

    # print(total_loss_y)

    y_pred_thresh = y_pred>=0.5

    tp = np.sum((y_pred_thresh+y_test)==2 , axis=0)[0]
    tn = np.sum(y_pred_thresh==y_test , axis=0)[0] - tp
    fp = np.sum(y_pred_thresh , axis=0)[0]-tp
    fn = test_size-tp-tn-fp


    print('tp: {} , tn: {} , fp: {} , fn: {}'.format(tp,tn,fp,fn))

    acc = (tp+tn)/test_size
    prec = (tp)/(tp+fp)
    recl = (tp)/(tp+fn)
    f1 = 2*prec*recl/(prec+recl)

    print('Accuracy: {}'.format( acc  ))
    print('Precision: {}'.format( prec  ))
    print('Recall: {}'.format( recl  ))
    print('F1 score: {}'.format( f1  ))




In [263]:
print('Train Accuracy')

accuracy_metrics(X_mean,X_var,X_train,y_train,opt_theta,degree=6)
print('..............................................')
print('Test Accuracy')
accuracy_metrics(X_mean,X_var,X_test,y_test,opt_theta,degree=6)

Train Accuracy
tp: 178 , tn: 249 , fp: 32 , fn: 31
Accuracy: 0.8714285714285714
Precision: 0.8476190476190476
Recall: 0.8516746411483254
F1 score: 0.8496420047732698
..............................................
Test Accuracy
tp: 70 , tn: 103 , fp: 16 , fn: 21
Accuracy: 0.8238095238095238
Precision: 0.813953488372093
Recall: 0.7692307692307693
F1 score: 0.7909604519774011
