In [1]:
# Useful starting lines
import numpy as np
from proj1_helpers import *
from cross_validation import *
from helpers import *
from implementations import *

%load_ext autoreload
%autoreload 2

# Seed handed to build_k_indices whenever the data is split into folds
# (presumably makes the train/test split reproducible — confirm in build_k_indices)
seed = 10

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
# Relative paths to the datasets used by this notebook.
DATA_TRAIN_PATH = "data/train.csv"  # TODO: download the training data and supply its path here
DATA_TEST_PATH = "data/test.csv"  # TODO: download the test data and supply its path here

We load the training data into `y` (class labels), `tX` (feature matrix) and `ids` (event ids).

In [3]:
# Read the training CSV; the three returned values are unpacked as the
# labels (y), the input matrix (tX) and the ids.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Gradient descent

In [4]:
# Settings for the gradient descent run
k_fold = 10
gamma = 0.01
max_iters = 500

# Partition the data into k folds
k_indices = build_k_indices(y, k_fold, seed)

# One cross-validation call per fold; each call yields a (train, test) accuracy pair
fold_scores = [
    cross_validation(y, tX, k_indices, fold, least_squares_gd,
                     initial_w=None, max_iters=max_iters, gamma=gamma)
    for fold in range(k_fold)
]
accs_train = [train_acc for train_acc, _ in fold_scores]
accs_test = [test_acc for _, test_acc in fold_scores]

# Per-fold report
for fold, (train_acc, test_acc) in enumerate(zip(accs_train, accs_test)):
    print("%d - Training accuracy: %f / Test accuracy : %f" % (fold, train_acc, test_acc))

# Summary statistics over the test accuracies of all folds
print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))
print("Min test accuracy: %f" % np.min(accs_test))
print("Max test accuracy: %f" % np.max(accs_test))

0 - Training accuracy: 0.762916 / Test accuracy : 0.764600
1 - Training accuracy: 0.764182 / Test accuracy : 0.758680
2 - Training accuracy: 0.763200 / Test accuracy : 0.767200
3 - Training accuracy: 0.763733 / Test accuracy : 0.761080
4 - Training accuracy: 0.763133 / Test accuracy : 0.764400
5 - Training accuracy: 0.763449 / Test accuracy : 0.761720
6 - Training accuracy: 0.763062 / Test accuracy : 0.760840
7 - Training accuracy: 0.763262 / Test accuracy : 0.765520
8 - Training accuracy: 0.762427 / Test accuracy : 0.765280
9 - Training accuracy: 0.763173 / Test accuracy : 0.762360

Average test accuracy: 0.763168
Variance test accuracy: 0.000006
Min test accuracy: 0.758680
Max test accuracy: 0.767200


## Stochastic gradient descent

In [5]:
# Settings for the stochastic gradient descent run
k_fold = 10
gamma = 0.01
max_iters = 100

# Partition the data into k folds
k_indices = build_k_indices(y, k_fold, seed)

# One cross-validation call per fold; each call yields a (train, test) accuracy pair
fold_scores = [
    cross_validation(y, tX, k_indices, fold, least_squares_sgd,
                     initial_w=None, max_iters=max_iters, gamma=gamma)
    for fold in range(k_fold)
]
accs_train = [train_acc for train_acc, _ in fold_scores]
accs_test = [test_acc for _, test_acc in fold_scores]

# Per-fold report
for fold, (train_acc, test_acc) in enumerate(zip(accs_train, accs_test)):
    print("%d - Training accuracy: %f / Test accuracy : %f" % (fold, train_acc, test_acc))

# Summary statistics over the test accuracies of all folds
print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))
print("Min test accuracy: %f" % np.min(accs_test))
print("Max test accuracy: %f" % np.max(accs_test))

0 - Training accuracy: 0.688742 / Test accuracy : 0.689160
1 - Training accuracy: 0.676387 / Test accuracy : 0.677600
2 - Training accuracy: 0.633613 / Test accuracy : 0.634400
3 - Training accuracy: 0.674324 / Test accuracy : 0.671400
4 - Training accuracy: 0.688796 / Test accuracy : 0.693120
5 - Training accuracy: 0.687262 / Test accuracy : 0.684240
6 - Training accuracy: 0.696013 / Test accuracy : 0.694320
7 - Training accuracy: 0.728467 / Test accuracy : 0.732600
8 - Training accuracy: 0.702493 / Test accuracy : 0.704560
9 - Training accuracy: 0.679089 / Test accuracy : 0.681080

Average test accuracy: 0.686248
Variance test accuracy: 0.000562
Min test accuracy: 0.634400
Max test accuracy: 0.732600


## Least squares

In [6]:
# Number of folds for the least squares run
k_fold = 10

# Partition the data into k folds
k_indices = build_k_indices(y, k_fold, seed)

# One cross-validation call per fold; each call yields a (train, test) accuracy pair
fold_scores = [
    cross_validation(y, tX, k_indices, fold, least_squares)
    for fold in range(k_fold)
]
accs_train = [train_acc for train_acc, _ in fold_scores]
accs_test = [test_acc for _, test_acc in fold_scores]

# Per-fold report
for fold, (train_acc, test_acc) in enumerate(zip(accs_train, accs_test)):
    print("%d - Training accuracy: %f / Test accuracy : %f" % (fold, train_acc, test_acc))

# Summary statistics over the test accuracies of all folds
print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))
print("Min test accuracy: %f" % np.min(accs_test))
print("Max test accuracy: %f" % np.max(accs_test))

0 - Training accuracy: 0.776400 / Test accuracy : 0.775120
1 - Training accuracy: 0.777689 / Test accuracy : 0.772280
2 - Training accuracy: 0.777324 / Test accuracy : 0.779400
3 - Training accuracy: 0.777453 / Test accuracy : 0.774560
4 - Training accuracy: 0.776422 / Test accuracy : 0.778600
5 - Training accuracy: 0.776787 / Test accuracy : 0.776040
6 - Training accuracy: 0.776698 / Test accuracy : 0.773560
7 - Training accuracy: 0.776773 / Test accuracy : 0.780200
8 - Training accuracy: 0.775524 / Test accuracy : 0.777160
9 - Training accuracy: 0.776147 / Test accuracy : 0.779080

Average test accuracy: 0.776600
Variance test accuracy: 0.000007
Min test accuracy: 0.772280
Max test accuracy: 0.780200


## Ridge regression

In [7]:
# Settings for the ridge regression run
k_fold = 2
# NOTE(review): lambdas and degrees hold four entries each — presumably one
# (lambda, degree) pair per data subset handled inside
# cross_validation_ridge_regression; confirm against that helper.
lambdas = [0.002, 0.001, 0.001, 0.01]
degrees = [4, 7, 9, 9]

# Partition the data into k folds
k_indices = build_k_indices(y, k_fold, seed)

# One cross-validation call per fold; each call yields a (train, test) accuracy pair
fold_scores = [
    cross_validation_ridge_regression(y, tX, k_indices, fold, lambdas, degrees)
    for fold in range(k_fold)
]
accs_train = [train_acc for train_acc, _ in fold_scores]
accs_test = [test_acc for _, test_acc in fold_scores]

# Per-fold report
for fold, (train_acc, test_acc) in enumerate(zip(accs_train, accs_test)):
    print("%d - Training accuracy: %f / Test accuracy : %f" % (fold, train_acc, test_acc))

# Summary statistics over the test accuracies of all folds
print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))
print("Min test accuracy: %f" % np.min(accs_test))
print("Max test accuracy: %f" % np.max(accs_test))

0 - Training accuracy: 0.838232 / Test accuracy : 0.832320
1 - Training accuracy: 0.840952 / Test accuracy : 0.834792

Average test accuracy: 0.833556
Variance test accuracy: 0.000002
Min test accuracy: 0.832320
Max test accuracy: 0.834792


## Logistic regression

In [8]:
# Settings for the logistic regression run
# NOTE(review): the output saved with this cell echoes the source line
# `l = np.log(1 + tx.dot(w))`, which looks like a warning raised while computing
# the loss — check that expression in the logistic regression implementation.
k_fold = 10
gamma = 0.6
max_iters = 100

# Partition the data into k folds
k_indices = build_k_indices(y, k_fold, seed)

# One cross-validation call per fold; each call yields a (train, test) accuracy pair
fold_scores = [
    cross_validation(y, tX, k_indices, fold, logistic_regression,
                     initial_w=None, max_iters=max_iters, gamma=gamma)
    for fold in range(k_fold)
]
accs_train = [train_acc for train_acc, _ in fold_scores]
accs_test = [test_acc for _, test_acc in fold_scores]

# Per-fold report
for fold, (train_acc, test_acc) in enumerate(zip(accs_train, accs_test)):
    print("%d - Training accuracy: %f / Test accuracy : %f" % (fold, train_acc, test_acc))

# Summary statistics over the test accuracies of all folds
print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))
print("Min test accuracy: %f" % np.min(accs_test))
print("Max test accuracy: %f" % np.max(accs_test))

  l = np.log(1 + tx.dot(w))


0 - Training accuracy: 0.678978 / Test accuracy : 0.682600
1 - Training accuracy: 0.678978 / Test accuracy : 0.678160
2 - Training accuracy: 0.679098 / Test accuracy : 0.679560
3 - Training accuracy: 0.679436 / Test accuracy : 0.677960
4 - Training accuracy: 0.680462 / Test accuracy : 0.677560
5 - Training accuracy: 0.680138 / Test accuracy : 0.682920
6 - Training accuracy: 0.679773 / Test accuracy : 0.678120
7 - Training accuracy: 0.678396 / Test accuracy : 0.675920
8 - Training accuracy: 0.678796 / Test accuracy : 0.680440
9 - Training accuracy: 0.679996 / Test accuracy : 0.678720

Average test accuracy: 0.679196
Variance test accuracy: 0.000004
Min test accuracy: 0.675920
Max test accuracy: 0.682920


## Regularized logistic regression

In [9]:
# Settings for the regularized logistic regression run
# NOTE(review): the accuracies saved with this cell are identical, fold by fold,
# to those saved for the unregularized logistic regression run — verify that
# lambda_ actually reaches reg_logistic_regression and influences the fit.
k_fold = 10
gamma = 0.6
lambda_ = 0.04
max_iters = 100

# Partition the data into k folds
k_indices = build_k_indices(y, k_fold, seed)

# One cross-validation call per fold; each call yields a (train, test) accuracy pair
fold_scores = [
    cross_validation(y, tX, k_indices, fold, reg_logistic_regression,
                     lambda_=lambda_, initial_w=None, max_iters=max_iters, gamma=gamma)
    for fold in range(k_fold)
]
accs_train = [train_acc for train_acc, _ in fold_scores]
accs_test = [test_acc for _, test_acc in fold_scores]

# Per-fold report
for fold, (train_acc, test_acc) in enumerate(zip(accs_train, accs_test)):
    print("%d - Training accuracy: %f / Test accuracy : %f" % (fold, train_acc, test_acc))

# Summary statistics over the test accuracies of all folds
print("\nAverage test accuracy: %f" % np.mean(accs_test))
print("Variance test accuracy: %f" % np.var(accs_test))
print("Min test accuracy: %f" % np.min(accs_test))
print("Max test accuracy: %f" % np.max(accs_test))

  l = np.log(1 + tx.dot(w))


0 - Training accuracy: 0.678978 / Test accuracy : 0.682600
1 - Training accuracy: 0.678978 / Test accuracy : 0.678160
2 - Training accuracy: 0.679098 / Test accuracy : 0.679560
3 - Training accuracy: 0.679436 / Test accuracy : 0.677960
4 - Training accuracy: 0.680462 / Test accuracy : 0.677560
5 - Training accuracy: 0.680138 / Test accuracy : 0.682920
6 - Training accuracy: 0.679773 / Test accuracy : 0.678120
7 - Training accuracy: 0.678396 / Test accuracy : 0.675920
8 - Training accuracy: 0.678796 / Test accuracy : 0.680440
9 - Training accuracy: 0.679996 / Test accuracy : 0.678720

Average test accuracy: 0.679196
Variance test accuracy: 0.000004
Min test accuracy: 0.675920
Max test accuracy: 0.682920
