In [2]:
import numpy as np
import pandas as pd
import matplotlib as plt

In [3]:
dataset = pd.read_csv('E:\ELL_project\problem1\health_data.csv')
dataset = dataset.sample(frac = 1)
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1:].values
datasize = X.shape[0]
X_train = X[:(datasize*7)//10,:]
y_train = y[:(datasize*7)//10,:]
X_test = X[(datasize*7)//10:,:]
y_test = y[(datasize*7)//10:,:]

# X_train_size = X_train.shape[0]
# X_train = np.append(np.ones((X_train_size,1),X_train)


In [4]:
def gradDesc(X,y,theta):
    h  = np.dot(X,theta)
    dsc = X.shape[0]
    loss = h-y
    total_loss = np.dot(np.ones((1,dsc)),np.square(loss))[0,0] /dsc
    update = np.dot(np.transpose(X),loss)
    update /= dsc
    return(total_loss,update)

In [5]:
def linReg(X,y,iter=100,alpha=0.01,batchSize=32):
    # print(X.shape)
    theta = np.random.random((X.shape[1],1))
    # print(theta)
    datasize = X.shape[0]
    loss_epoch = 0
    for i in range(iter):
        if((i+1)%1000==0 and i>0):
            print('Loss for {} iterations: {}'.format(i+1,loss_epoch))
        fro = 0
        loss_epoch = 0
        while(True):
            to = min(fro+batchSize,datasize)
            l,theta_grad = gradDesc(X[fro:to,:],y[fro:to,:],theta)
            # print(theta_grad)
            loss_epoch += l
            theta -= (alpha*theta_grad)
            fro = to

            if(to>=datasize):
                break
    
    return (theta,loss_epoch)

In [6]:
def feature_scaling(X_train):
    training_size = X_train.shape[0]
    X_mean = np.sum(X_train,axis=0) / training_size
    X_var = np.sqrt(np.sum((np.square(X-X_mean)),axis=0)/training_size)
    X_train_reg = (X_train - X_mean) / X_var
    return (X_mean,X_var,X_train_reg)

In [7]:
def poly_feat(X,degree=2):
    num_feats = X.shape[1]
    num_vals = X.shape[0]
    X_cross = X
    for i in range(num_feats):
        for j in range(i+1,num_feats):
            X_cross = np.append( X_cross ,  np.multiply ( X[:,i:i+1] , X[:,j:j+1] ) , axis=1 )
    for i in range(3,degree):
        X_power = np.power(X,i)
        X_cross = np.append(X_cross,X_power,axis=1)

    return ( X_cross )

In [8]:
def polyReg(X,y,iter=100,alpha=0.01,batchSize=32,degree=2):
    X_mean,X_var,X_norm = feature_scaling(X)
    X_cross = poly_feat(X_norm,degree)
    train_size = X_cross.shape[0]
    X_cross = np.append(np.ones((train_size,1)),X_cross,axis=1)
    opt, tl = linReg(X_cross,y,iter=iter,alpha=alpha,batchSize=batchSize)
    return ( X_mean, X_var, opt , tl )


In [9]:
train_datasize = X_train.shape[0]
X_mean, X_var, opt_theta, train_loss = polyReg(X_train,y_train,200000,0.01,train_datasize,6)
print(X_mean.shape)
print(opt_theta)

Loss for 1000 iterations: 0.10133692194614084
Loss for 2000 iterations: 0.0909107299072428
Loss for 3000 iterations: 0.08665561054595813
Loss for 4000 iterations: 0.08470303357136973
Loss for 5000 iterations: 0.08366616339600486
Loss for 6000 iterations: 0.08302696978605627
Loss for 7000 iterations: 0.08258457463792523
Loss for 8000 iterations: 0.082255605002815
Loss for 9000 iterations: 0.08200140064658072
Loss for 10000 iterations: 0.08180121056327097
Loss for 11000 iterations: 0.08164213225701573
Loss for 12000 iterations: 0.0815151891272087
Loss for 13000 iterations: 0.08141368950307014
Loss for 14000 iterations: 0.08133245779805111
Loss for 15000 iterations: 0.08126741740166235
Loss for 16000 iterations: 0.08121532922888727
Loss for 17000 iterations: 0.08117360900903131
Loss for 18000 iterations: 0.08114019084224784
Loss for 19000 iterations: 0.08111342163460941
Loss for 20000 iterations: 0.0810919779710418
Loss for 21000 iterations: 0.08107480009583228
Loss for 22000 iterations: 

In [10]:
def predicter(X_mean, X_var, X_test, opt_theta,degree=2):
    X_test_norm = (X_test - X_mean) / X_var
    X_test_cross = poly_feat(X_test_norm,degree)
    test_size = X_test_cross.shape[0]
    X_test_cross = np.append( np.ones((test_size,1)) , X_test_cross , axis=1)

    # print(X_test_cross.shape)
    # print(opt_theta.shape)
    y_pred = np.dot(X_test_cross, opt_theta)

    return y_pred


In [11]:
def accuracy_metrics(X_mean,X_var,X_test,y_test,opt_theta,degree=2):
    y_pred = predicter(X_mean, X_var, X_test, opt_theta, degree)
    loss_y = y_pred - y_test

    test_size = y_pred.shape[0]
    total_loss_y = np.dot(np.ones((1,test_size)),np.square(loss_y))[0,0] / test_size

    # print(total_loss_y)

    y_pred_thresh = y_pred>=0.5

    tp = np.sum((y_pred_thresh+y_test)==2 , axis=0)[0]
    tn = np.sum(y_pred_thresh==y_test , axis=0)[0] - tp
    fp = np.sum(y_pred_thresh , axis=0)[0]-tp
    fn = test_size-tp-tn-fp


    print('tp: {} , tn: {} , fp: {} , fn: {}'.format(tp,tn,fp,fn))

    acc = (tp+tn)/test_size
    prec = (tp)/(tp+fp)
    recl = (tp)/(tp+fn)
    f1 = 2*prec*recl/(prec+recl)

    print('Accuracy: {}'.format( acc  ))
    print('Precision: {}'.format( prec  ))
    print('Recall: {}'.format( recl  ))
    print('F1 score: {}'.format( f1  ))

    return acc




In [12]:
tr=[]
te=[]

In [13]:
for d in range(1,10):
    print('Train Accuracy')
    train_acc = accuracy_metrics(X_mean,X_var,X_train,y_train,opt_theta,degree=d)
    print('..............................................')
    print('Test Accuracy')
    test_acc = accuracy_metrics(X_mean,X_var,X_test,y_test,opt_theta,degree=d)
    tr.append(train_acc)
    te.append(test_acc)

Train Accuracy


ValueError: shapes (490,7) and (16,1) not aligned: 7 (dim 1) != 16 (dim 0)