In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import random
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score 

In [18]:
data = pd.read_csv("train.csv",
                   names=[
                       "age", "workclass", "fnlwgt", "education",
                       "education-num", "marital-status", "occupation",
                       "relationship", "race", "sex", "capital-gain",
                       "capital-loss", "hours-per-week", "native-country",
                       "output"
                   ])
li = [
    "workclass", "education", "marital-status", "occupation", "relationship",
    "race", "sex", "native-country", "output"
]
for i in li:
    data[i] = data[i].astype('category')
    data[i] = data[i].cat.codes
y = data.iloc[:, 14]
x = data.iloc[:, 0:14]
y = y.to_numpy()
x_norm = (x - x.mean()) / (x.max() - x.min())
x1 = x_norm.to_numpy()
x = np.insert(x1, 0, 1, axis=1)
data = pd.read_csv("test.csv",
                   names=[
                       "age", "workclass", "fnlwgt", "education",
                       "education-num", "marital-status", "occupation",
                       "relationship", "race", "sex", "capital-gain",
                       "capital-loss", "hours-per-week", "native-country",
                       "output"
                   ])
for i in li:
    data[i] = data[i].astype('category')
    data[i] = data[i].cat.codes
y_test = data.iloc[:, 14]
x_test = data.iloc[:, 0:14]
y_test = y_test.to_numpy()
x_norm = (x_test - x_test.mean()) / (x_test.max() - x_test.min())
x1 = x_norm.to_numpy()
x_test = np.insert(x1, 0, 1, axis=1)

In [19]:
def acc_sig(z):
    return 1 / (1 + np.exp(0.5 - z))

In [20]:
def sigmoid(z):
    return 1 / (1 + np.exp(0 - z))

In [21]:
def accuracy(x, y, theta):
    ans = x @ theta.T
#     num = 0
#     for i in range(y.shape[0]):
#         if (ans[i]>=0.5):
#             ans[i][0]=1
#         else:
#             ans[i][0]=0
#         if (y[i]==ans[i][0]):
#             num+=1
#     return (num*100)/y.shape[0]
    ans = acc_sig(ans)
    ans = np.round(ans)
    y = np.reshape(y, (-1, 1))
    ans = ans-y
    num = np.sum(np.abs(ans))
    return ((y.shape[0]-num)*100)/y.shape[0]

In [22]:
def regression(x, y, alpha, x_test, y_test,type="normal",L=0):
    epoch = 10000
    theta = np.ones((1, 15))
    y = np.reshape(y, (-1, 1))
    if (type=="normal"):
        for k in range(epoch):
            z = x @ theta.T
            h = sigmoid(z)
            t = x.T @ (h - y)
            grad = (t) / y.shape[0]
            theta -= alpha * grad.T
    else:
        rms_arr = [0] * epoch
        rms_validation = [0] * epoch
        if (type=="l1"):
            for k in range(epoch):
                z = x @ theta.T
                h = sigmoid(z)
                t = x.T @ (h - y)
                grad = (t) / y.shape[0]
                grad += 2*L*np.sum(theta/np.abs(theta))
                theta -= alpha * grad.T
                rms_arr[k] = accuracy(x, y, theta)
                rms_validation[k] = accuracy(x_test, y_test, theta)
        elif (type=="l2"):
            for k in range(epoch):
                z = x @ theta.T
                h = sigmoid(z)
                t = x.T @ (h - y)
                grad = (t) / y.shape[0]
                grad+= L*np.sum(theta)
                theta -= alpha * grad.T
                rms_arr[k] = accuracy(x, y, theta)
                rms_validation[k] = accuracy(x_test, y_test, theta)
        plt.plot(rms_arr)
        plt.title('Accuracy vs epoch for Training set',type)
        plt.ylabel('Accuracy')
        plt.xlabel('epoch')
        plt.show()
        plt.plot(rms_validation)
        plt.title('Accuracy vs epoch for Testing set',type)
        plt.ylabel('Accuracy')
        plt.xlabel('epoch')
        plt.show()
    return theta

In [23]:
learning_rate = 0.15
theta = regression(x, y, learning_rate, x_test, y_test)
print(theta)

[[-1.56196494e+00  2.81176033e+00 -6.68035542e-01  8.25161601e-01
   2.39151509e-01  5.11575648e+00 -1.27959669e+00 -9.02597930e-04
  -6.02033597e-01  4.37880180e-01  9.13318528e-01  4.79186184e+00
   2.37076451e+00  2.78771071e+00  8.19426726e-02]]


In [24]:
print(accuracy(x_test,y_test,theta))

80.1261620185923


In [None]:
alphas = np.logspace(-4,1,1000)
model = Lasso()
grid = GridSearchCV(estimator=model,param_grid=dict(alpha=alphas),cv=5)
grid.fit(x,y)
L = grid.best_estimator_.alpha
print(learning_rate ,L)
theta = regression(x,y,learning_rate,x_test,y_test,'l1',L)
print('Parameter vector Regression =>')
print(theta.T)