In [177]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import mean_squared_error

### Regularization with Weight Decay

In [54]:
# Read the training file and the held-out file with identical parser settings:
# no header row, and a two-space pattern as the field separator (a separator
# longer than one character is why the Python parsing engine is requested).
train, test = (
    pd.read_table(path, sep='  ', header=None, engine='python')
    for path in ('datasets/in.dta', 'datasets/out.dta')
)

In [55]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,0,1,2
0,-0.77947,0.838221,1.0
1,0.155635,0.895377,1.0
2,-0.059908,-0.71778,1.0
3,0.207596,0.758933,1.0
4,-0.195983,-0.375487,-1.0


In [56]:
# Preview the first five rows of the held-out frame.
test.head(n=5)

Unnamed: 0,0,1,2
0,-0.106006,-0.081467,-1.0
1,0.17793,-0.345951,-1.0
2,0.102162,0.718258,1.0
3,0.694078,0.623397,-1.0
4,0.023541,0.727432,1.0


In [163]:
class LinearRegression():
    """Closed-form least-squares regression with optional weight decay.

    The fitted weights solve the regularized normal equations

        (X^T X + ld * I) w = X^T y

    where ``ld`` is the weight-decay coefficient. With ``ld=0`` this is
    ordinary least squares. No intercept is added: include a column of ones
    in ``X`` if a bias term is wanted (that column is then penalized like
    every other weight).
    """

    def __init__(self, ld=0):
        """Store the weight-decay coefficient ``ld`` (0 disables the penalty)."""
        self.ld = ld

    def fit(self, X, y):
        """Compute the weights from a design matrix and a target vector.

        Parameters:
        - X: array-like of shape (n_samples, n_features)
        - y: array-like with n_samples entries (flattened to a column)

        Returns:
        - self, so that calls can be chained (``LinearRegression().fit(X, y)``)

        Raises:
        - numpy.linalg.LinAlgError if the regularized Gram matrix is singular

        The result is stored in ``self.weights`` with shape (n_features, 1).
        """
        X, y = np.asarray(X), np.asarray(y)
        gram = X.T @ X + self.ld * np.identity(X.shape[1])
        # Solve the linear system directly instead of forming an explicit
        # inverse: same solution, fewer floating-point operations and better
        # numerical behaviour when the Gram matrix is ill-conditioned.
        self.weights = np.linalg.solve(gram, X.T @ y.reshape(-1, 1))
        return self

    def predict(self, X):
        """Return the real-valued outputs ``X @ weights``.

        The (n_samples, 1) product is squeezed, so the result has shape
        (n_samples,) and collapses to a 0-d array for a single sample.
        """
        X = np.asarray(X)
        predict = X @ (self.weights)
        return predict.squeeze()

In [122]:
def nonlinear_transform(data, funcs=None):
    """
    Append derived feature columns to a data matrix.

    Input:
    - data: an array or DataFrame
    - funcs: an iterable of functions, e.g.
      funcs=[lambda x: x[0] + x[1], lambda x: x[0]**2]. Each function is called
      with `data` exactly as it was passed in (not the array copy), so `x[0]`
      selects column 0 when `data` is a DataFrame with integer column labels
      but row 0 when `data` is an ndarray (use `x[:, 0]` in that case).
      Each function must return one value per row. None or an empty iterable
      means "no extra columns".

    Output:
    - a transformed array: the original columns followed by one new column
      per function, in the order the functions were given
    """

    data_arr = np.asarray(data)
    if funcs is None:
        # The default used to crash with "NoneType is not iterable".
        funcs = ()

    # Each result is reshaped to a single column so it can be stacked
    # horizontally next to the original features.
    new_cols = [np.asarray(f(data)).reshape(-1, 1) for f in funcs]
    if not new_cols:
        return data_arr
    return np.hstack([data_arr, *new_cols])

def error(y_true, y_pred):
    """
    Fraction of samples whose prediction has the opposite sign to its label.

    Input:
    - y_true: array-like of labels
    - y_pred: array-like of real-valued predictions, one per label

    Output:
    - the classification error, a float between 0 and 1

    A sample counts as an error only when label * prediction is strictly
    negative, so a prediction of exactly 0 is never counted as wrong.
    """
    # Work on flat positional arrays: multiplying two pandas Series aligns
    # them by index label (mismatched indexes yield NaN, which never compares
    # below zero), and a column vector times a 1-D vector would broadcast to
    # an (n, n) matrix. Either case silently distorts the error rate.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    return np.mean(y_true * y_pred < 0)

In [136]:
# Column 2 is used as the target; every remaining column is an input feature.
X_train, y_train = train.drop(columns=[2]), train[2]
X_test, y_test = test.drop(columns=[2]), test[2]

In [158]:
# Extra features computed from columns 0 and 1 of the raw inputs:
# both squares, the cross product, and the absolute difference and sum.
nonlinear_funcs = [
    lambda x: x[0]**2,
    lambda x: x[1]**2,
    lambda x: x[0]*x[1],
    lambda x: np.abs(x[0]-x[1]),
    lambda x: np.abs(x[0]+x[1]),
]

# Final design matrices: a leading column of ones (the bias term), then the
# raw columns, then the derived columns in the order listed above.
X_train_transformed = np.concatenate(
    (np.ones((len(X_train), 1)), nonlinear_transform(X_train, nonlinear_funcs)),
    axis=1,
)
X_test_transformed = np.concatenate(
    (np.ones((len(X_test), 1)), nonlinear_transform(X_test, nonlinear_funcs)),
    axis=1,
)

In [160]:
X_train_transformed.shape

(35, 8)

#### Question 2 (without weight decay)

In [200]:
# Baseline fit: ld keeps its default of 0, so no weight decay is applied.
lr = LinearRegression()
lr.fit(X_train_transformed, y_train)

train_pred, test_pred = lr.predict(X_train_transformed), lr.predict(X_test_transformed)

# Report the classification error on the training and held-out sets.
print(
    'In-sample error: {:.3f}'.format(error(y_train, train_pred)),
    'Out-of-sample error: {:.3f}'.format(error(y_test, test_pred)),
    sep='\n',
)

In-sample error: 0.029
Out-of-sample error: 0.084


#### Question 3 (weight decay with $\lambda = 10^{-3}$)

In [198]:
# Same fit as the baseline but with a small weight-decay coefficient.
lr1 = LinearRegression(ld=10**-3)
lr1.fit(X_train_transformed, y_train)

train_pred, test_pred = lr1.predict(X_train_transformed), lr1.predict(X_test_transformed)

# Report the classification error on the training and held-out sets.
print(
    'In-sample error: {:.3f}'.format(error(y_train, train_pred)),
    'Out-of-sample error: {:.3f}'.format(error(y_test, test_pred)),
    sep='\n',
)

In-sample error: 0.029
Out-of-sample error: 0.080


#### Question 4 (weight decay with $\lambda = 10^{3}$)

In [199]:
# Same fit again, this time with a very large weight-decay coefficient.
lr2 = LinearRegression(ld=10**3)
lr2.fit(X_train_transformed, y_train)

train_pred, test_pred = lr2.predict(X_train_transformed), lr2.predict(X_test_transformed)

# Report the classification error on the training and held-out sets.
print(
    'In-sample error: {:.3f}'.format(error(y_train, train_pred)),
    'Out-of-sample error: {:.3f}'.format(error(y_test, test_pred)),
    sep='\n',
)

In-sample error: 0.371
Out-of-sample error: 0.436


#### Question 5 (out-of-sample error for $\lambda = 10^{k}$, $k = -2, \dots, 2$)

In [209]:
# Sweep the weight-decay coefficient over 10**k and record, for every exponent
# k, the classification error on the held-out set.
k_list = [-2, -1, 0, 1, 2]
oos_error = dict()

for k in k_list:
    lr = LinearRegression(ld=10**k)
    lr.fit(X_train_transformed, y_train)
    pred = lr.predict(X_test_transformed)
    oos_error.update({k: error(y_test, pred)})

print('Out-of-sample for different k: {}'.format(oos_error))

Out-of-sample for different k: {-2: 0.084, -1: 0.056, 0: 0.092, 1: 0.124, 2: 0.228}


#### Question 6 (minimum out-of-sample error over $\lambda = 10^{k}$, $k = -10, \dots, 9$)

In [217]:
# Wider sweep: exponents -10 through 9 (the upper bound of range is exclusive).
# Only the smallest held-out error is reported, not the exponent achieving it.
oos_error = list()
for k in range(-10, 10):
    lr = LinearRegression(ld=10**k)
    lr.fit(X_train_transformed, y_train)
    pred = lr.predict(X_test_transformed)
    oos_error += [error(y_test, pred)]

print('Minimum out-of-sample error: {}'.format(min(oos_error)))

Minimum out-of-sample error: 0.056
