In [1]:
import pandas as pd
import numpy as np


In [2]:
def obtaindata(filename):
    table = pd.read_table(filename,delim_whitespace=True,header=None)
    points,cols = table.shape
    data = np.ones((points,cols+1))
    data[:,1:] = table
    
    return data

In [3]:
train_data = obtaindata('http://work.caltech.edu/data/in.dta')

In [4]:
# performing non-linear transformation of φ(x1, x2) = (1, x1, x2, x1^2, x2^2, x1x2, |x1 − x2|, |x1 + x2|)
def transform(data):
    points,cols = data.shape
    transformed_data = np.ones((points,9))
    x1 = data[:,1]
    x2 = data[:,2]
    transformed_data[:,:3] = data[:,:3]
    transformed_data[:,3] = x1.dot(x1)
    transformed_data[:,4] = x2.dot(x2)
    transformed_data[:,5] = x1.dot(x2)
    transformed_data[:,6] = np.abs(x1-x2)
    transformed_data[:,7] = np.abs(x1+x2)
    transformed_data[:,8] = data[:,3]
    return transformed_data

In [5]:
# transformed data
transformed_data = transform(train_data)

In [None]:
def linear_regression(data):
    rows, cols = data.shape
    X = data[:,:(cols-1)]
    H = np.linalg.inv((X.T).dot(X)).dot(X.T)
    W = H.dot(data[:,(cols-1)])
    
    return W

def get_error(W,data):
    rows, cols = data.shape
    X = data[:,:(cols-1)]
    out = np.sign(np.sum(W*X, axis=1))
    correct = (out==data[:,8])
    error = (np.sum(correct==True))/float(rows)
    
    return (1-error)

In [None]:
# performing linear regression on the training set
weight_in = linear_regression(transformed_data)
error_in = get_error(weight_in,transformed_data)
print(weight_in)
print(error_in)

In [8]:
# getting testing data
test_data = obtaindata('http://work.caltech.edu/data/out.dta')
transformed_data_out = transform(test_data)
error_out = get_error(weight_in, transformed_data_out)
print(error_out)

0.528


In [11]:
# regularization 
k = -3
l = 10**k
N_in = train_data.shape[0]
N_out = test_data.shape[0]
reg_in = error_in**2 + np.sum(weight_in.dot(weight_in))*(l/N_in)
reg_out = error_out**2 + np.sum(weight_in.dot(weight_in))*(l/N_out)

In [12]:
print(reg_in,reg_out)

0.007457366217152011 0.2787994598418299


In [15]:
# using k = 3
k = 3
l = 10**k
reg_in = error_in**2 + np.sum(weight_in.dot(weight_in))*(l/N_in)
reg_out = error_out**2 + np.sum(weight_in.dot(weight_in))*(l/N_out)

In [16]:
print(reg_in,reg_out)

110.43478858057738 15.73862582985226
