In [47]:
# https://work.caltech.edu/homework/hw7.pdf
# Problems 1-5

import pandas as pd
import numpy as np

training_and_validation_data = pd.read_csv('in.dta', sep='\s+', header=None)

# Split in.dta into training (first 25 examples) and validation (last 10 examples).  

training_data = training_and_validation_data.iloc[:25,:].to_numpy()
validation_data = training_and_validation_data.iloc[25:,:].to_numpy()

phi = [
    lambda xs: np.ones(len(xs)),
    lambda xs: xs[:, 0],
    lambda xs: xs[:, 1],
    lambda xs: xs[:, 0] ** 2,
    lambda xs: xs[:, 1] ** 2,
    lambda xs: xs[:, 0] * xs[:, 1],
    lambda xs: np.abs(xs[:, 0] - xs[:, 1]),
    lambda xs: np.abs(xs[: , 0] + xs[:, 1])
]

training_xs = training_data[:,0:2]
training_ys = training_data[:,2]

validation_xs = validation_data[:,0:2]
validation_ys = validation_data[:,2]

testing_data = pd.read_csv('out.dta', sep='\s+', header=None).to_numpy()
testing_xs = testing_data[:,0:2]
testing_ys = testing_data[:,2]

def transform_data(xs, k):
    zs = []
    for i in range(0, k + 1):
        transformed = phi[i](xs)
        zs.append(transformed)
    
    return np.column_stack(tuple(zs))

def pseudo_inverse(xs):
    return np.linalg.inv(xs.T @ xs) @ xs.T

validation_errs = {}
testing_errs = {}
small_train_validation_errs = {}
small_train_testing_errs = {}

# Train on the 25 examples only, using the validation set of 10 examples to select between five models that apply linear regression to phi_0 through phi_k, with k = 3,4,5,6,7. 
for k in range(3, 7 + 1):
    training_zs = transform_data(training_xs, k)
    weights = pseudo_inverse(training_zs) @ training_ys

    validation_zs = transform_data(validation_xs, k)
    predicted_ys = np.sign(validation_zs @ weights)
    
    misclassified_points = np.flatnonzero(predicted_ys != validation_ys)
    validation_errs[k] = len(misclassified_points) / len(validation_xs)

    testing_zs = transform_data(testing_xs, k)
    predicted_ys = np.sign(testing_zs @ weights)

    misclassified_points = np.flatnonzero(predicted_ys != testing_ys)
    testing_errs[k] = len(misclassified_points) / len(testing_xs)

    # Now let's switch validation and training sets
    weights = pseudo_inverse(validation_zs) @ validation_ys
    predicted_ys = np.sign(training_zs @ weights)

    misclassified_points = np.flatnonzero(predicted_ys != training_ys)
    small_train_validation_errs[k] = len(misclassified_points) / len(training_xs)

    predicted_ys = np.sign(testing_zs @ weights)
    misclassified_points = np.flatnonzero(predicted_ys != testing_ys)
    testing_errs[k] = len(misclassified_points) / len(testing_xs)

    small_train_testing_errs[k] = len(misclassified_points) / len(testing_xs)


# 1. For which model is the classification err on the validation set smallest?
print("Classification errors on validation:", validation_errs)

# 2. Evaluate the out-of-sample classification error using out.dta on the 5 models to see how well the validation set predicted the best of the 5 models. For which model is the out-of-sample classification error smallest?
print("Classification errors on testing:", testing_errs)


# Reverse the role of training and validation sets; now training with the last 10 examples and validating with the first 25 examples. 
# 
# 3. For which model is the classification error on the validation set smallest?
print("Classification errors w/ smaller training on validation:", small_train_validation_errs)

# 4. Once again, evaluate the out-of-sample classification error using out.dta on the 5 models to see how well the validation set predicted the best of the 5 models. For which model is the out-of-sample classification error smallest?
print("Classification errors w/ smaller training on testing:", small_train_testing_errs)

# 5. What values are closest in Euclidean distance to the out-of-sample classification error obtained for the model chosen in Problems 1 and 3, respectively?


Classification errors on validation: {3: 0.3, 4: 0.5, 5: 0.2, 6: 0.0, 7: 0.1}
Classification errors on testing: {3: 0.396, 4: 0.388, 5: 0.284, 6: 0.192, 7: 0.196}
Classification errors w/ smaller training on validation: {3: 0.28, 4: 0.36, 5: 0.2, 6: 0.08, 7: 0.12}
Classification errors w/ smaller training on testing: {3: 0.396, 4: 0.388, 5: 0.284, 6: 0.192, 7: 0.196}


In [None]:
# Problem 6

np.unif