In [None]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [None]:
from implementations import *
from dataprocessing import *
from classifiers import *

In [None]:
from proj1_helpers import *

# Training CSV location, relative to the notebook's working directory.
DATA_TRAIN_PATH = '../data/train.csv'
# Load the complete training set (sub_sample=False). The result is unpacked into
# the class labels `y`, the feature matrix `tX` and the event ids `ids`.
# `features` is presumably the list of column names -- TODO confirm against
# load_csv_data; it is later handed to split_data for both train and test data.
y, tX, ids, features = load_csv_data(DATA_TRAIN_PATH, sub_sample=False)

## Split data

In [None]:
# Partition the training data into sub datasets. Every later cell indexes the
# three returned sequences in parallel: indices_split[i] selects the rows of the
# full dataset that belong to subset i, X_split[i] / y_split[i] hold its
# features and labels. (The splitting criterion lives in split_data.)
indices_split, X_split, y_split = split_data(features, tX, y)

## Standardize data

In [None]:
# For every training subset: drop the features whose NaN share exceeds the 0.2
# threshold, then standardize. Each entry of `standardized_parts` is the triple
# returned by standardize(); the three parallel lists below split that triple up
# so the statistics can be re-applied to the test data later on.
standardized_parts = [
    standardize(remove_NaN_features(subset, 0.2)) for subset in X_split
]

X_split_std = [part[0] for part in standardized_parts]
mean_split = [part[1] for part in standardized_parts]
std_split = [part[2] for part in standardized_parts]

## Test different models

In [None]:
# Compare several classifiers on one fixed feature expansion (degrees 10 and 5)
# using k-fold cross-validation on every sub dataset.
X_split_poly = [build_X(subset, 10, 5) for subset in X_split_std]
lambda_ = 1e-8
k = 5

models_try = [
    LeastSquaresL2(lambda_),
    LeastSquaresL1(lambda_, verbose=True, max_evaluations=500),
    LogisticRegression(),
    LogisticRegressionL2(lambda_),
    LogisticRegressionL1(lambda_, verbose=True, max_evaluations=500),
]

total_accs = []

for model in models_try:
    # mean cross-validation accuracy of this model on each sub dataset
    accuracies = [
        np.mean(cross_validate_kfold(y_split[i], X_split_poly[i], model, k))
        for i in range(len(X_split_poly))
    ]

    # size-weighted mean: each subset counts in proportion to its number of
    # samples, normalised by the size of the full training set
    weighted_acc = sum(
        acc * len(y_split[i]) for i, acc in enumerate(accuracies)
    ) / len(y)

    total_accs.append(weighted_acc)

# one point per model, labelled in the same order as models_try
x_axis = range(len(models_try))
plt.xticks(x_axis, ["LS L2", "LS L1", "LogReg", "LogReg L2", "LogReg L1"])
plt.plot(x_axis, total_accs)
plt.show()

## Grid search for rough estimate of best integer power, best half power and best lambda

In [None]:
# Rough grid search over the two expansion degrees (d_int, d_sq) and the
# regularisation strength lambda_ of LeastSquaresL2.
# NOTE: every candidate is scored on the very data it was fitted on (training
# accuracy), so this only gives a coarse estimate; the cross-validation cell
# below is what actually guards against over-fitting.
d_ints = range(7, 11)
d_sqs = range(2, 6)
# 10 log-spaced penalties from 1e-13 to 1e-7. numpy is used here because `math`
# is never imported explicitly in this notebook.
lambdas = np.logspace(-13, -7, 10).tolist()

# best training accuracy seen so far and the combination that produced it
max_train = 0
max_train_d_int = 0
max_train_d_sq = 0
max_train_d_lambda = 0

for d_int in d_ints:
    for d_sq in d_sqs:
        # build expanded dataset once per degree pair (independent of lambda_)
        X_split_poly = [ build_X(X, d_int, d_sq) for X in X_split_std ]

        for lambda_ in lambdas:
            # rows that no subset writes to keep the initial prediction 1
            y_pred = np.ones(tX.shape[0])

            # fit one sub-model per subset and predict on that same subset
            for i in range(len(X_split_poly)):
                lse = LeastSquaresL2(lambda_)
                lse.fit(y_split[i], X_split_poly[i])
                y_pred[indices_split[i]] = lse.predict(X_split_poly[i])

            acc_train = np.mean(y == y_pred)
            if acc_train > max_train:
                max_train = acc_train
                max_train_d_int = d_int
                max_train_d_sq = d_sq
                max_train_d_lambda = lambda_

            print(f"d_int={d_int}, d_sq={d_sq}, lambda_={lambda_} - train={acc_train}")

# report the winner instead of leaving it buried in the log above
print(
    f"best: d_int={max_train_d_int}, d_sq={max_train_d_sq}, "
    f"lambda_={max_train_d_lambda} - train={max_train}"
)
            

## Cross-validation to optimize the hyper-parameter lambda

In [None]:
# Cross-validate LeastSquaresL2 over a fine grid of lambda_ values, with the
# expansion degrees fixed to d_int=10 and d_sq=5.
total_acc = []
k = 10
# 100 log-spaced penalties between 1e-12 and 1e-11. numpy is used here because
# `math` is never imported explicitly in this notebook.
lambdas = np.logspace(-12, -11, 100).tolist()
d_int = 10
d_sq = 5

# the expansion does not depend on lambda_, so build it once
X_split_poly = [ build_X(X, d_int, d_sq) for X in X_split_std ]

for lambda_ in lambdas:
    print(f"lambda={lambda_}", end=" - ")

    accuracies = []

    # mean k-fold accuracy on each sub dataset, with a fresh classifier each time
    for i in range(len(X_split_poly)):
        classifier = LeastSquaresL2(lambda_)
        acc = np.mean(cross_validate_kfold(y_split[i], X_split_poly[i], classifier, k))
        accuracies.append(acc)

    # size-weighted mean over the subsets, normalised by the full training size
    accuracy = 0
    for i, acc in enumerate(accuracies):
        accuracy += acc * len(y_split[i])
    accuracy /= len(y)

    print(accuracy)
    total_acc.append(accuracy)

# accuracy as a function of lambda; the grid is log-spaced, hence the log axis
plt.plot(lambdas, total_acc)
plt.xscale("log")
plt.xlabel("lambda")
plt.ylabel("weighted cross-validation accuracy")
plt.show()

In [None]:
# Keep the 15 lambdas with the highest cross-validated accuracy, best first.
# np.argpartition only guarantees WHICH entries end up in the top group, not
# their order, so taking element 0 of its result does not yield the best lambda.
# A full descending argsort makes best_lambdas[0] the actual maximiser.
n_top = 15
ranked = np.argsort(total_acc)[::-1][:n_top]
best_lambdas = [ lambdas[index] for index in ranked ]
best_lambda = best_lambdas[0]

## Train final model

In [None]:
# Hyper-parameters of the final model.

# Hard-coded regularisation strength. This assignment overrides whatever
# best_lambda the selection cell above computed in the current session.
best_lambda = 3.5938136638046255e-12
# Degrees passed to build_X as its second and third argument.
best_deg_int = 10
best_deg_sq = 5

def model_split_data(X):
    """Apply the final model's feature expansion to one feature matrix.

    Delegates to build_X with the notebook-level settings best_deg_int and
    best_deg_sq, which are looked up when the function is called. The same
    function is used for the training subsets and for the test subsets.
    """
    expanded = build_X(X, best_deg_int, best_deg_sq)
    return expanded

# Fit the final sub-models: one LeastSquaresL2 per training subset, each on its
# expanded feature matrix. `models` keeps them in subset order for the test
# predictions further down.
X_split_poly = list(map(model_split_data, X_split_std))
lambda_ = best_lambda
models = []
# rows that no subset writes to keep the initial prediction 1
y_pred = np.ones(tX.shape[0])

for i, X_poly in enumerate(X_split_poly):
    print(f"Building model for dataset {i}")
    submodel = LeastSquaresL2(lambda_)
    submodel.fit(y_split[i], X_poly)
    models.append(submodel)
    # write this subset's predictions back to its rows of the full dataset
    y_pred[indices_split[i]] = submodel.predict(X_poly)

# accuracy on the training data itself
print(np.mean(y == y_pred))

## Try basic methods

In [None]:
# Preprocessing for the baseline methods, applied to the whole (unsplit)
# training matrix: drop features above the 0.2 NaN threshold, replace the
# remaining NaNs by the median, then standardize. Only the standardized matrix
# is kept; the other two values returned by standardize() are discarded.
X, _, _ = standardize(replace_NaN_by_median(remove_NaN_features(tX, 0.2)))

In [None]:
# Baseline: least_squares on the preprocessed matrix, scored on the training
# data. Only the first returned value (the weights) is used.
w_lse = least_squares(y, X)[0]
pred_lse = predict_labels(w_lse, X)
accuracy_lse = compute_accuracy(pred_lse, y)
print(accuracy_lse)

In [None]:
# Baseline: least_squares_GD started from an all-zero weight vector, scored on
# the training data. Only the first returned value (the weights) is used.
w_lse_gd = least_squares_GD(y, X, np.zeros(X.shape[1]), 600, 0.005)[0]
pred_lse_gd = predict_labels(w_lse_gd, X)
accuracy_lse_gd = compute_accuracy(pred_lse_gd, y)
print(accuracy_lse_gd)

In [None]:
# Baseline: least_squares_SGD started from an all-zero weight vector, scored on
# the training data. Only the first returned value (the weights) is used.
# NOTE(review): no random seed is set anywhere in this notebook -- if the
# helper samples randomly, this number will vary between runs; confirm.
w_lse_sgd = least_squares_SGD(y, X, np.zeros(X.shape[1]), 40000, 0.0001)[0]
pred_lse_sgd = predict_labels(w_lse_sgd, X)
accuracy_lse_sgd = compute_accuracy(pred_lse_sgd, y)
print(accuracy_lse_sgd)

In [None]:
# Baseline: ridge_regression with third argument 0.001, scored on the training
# data. Only the first returned value (the weights) is used.
w_ridge = ridge_regression(y, X, 0.001)[0]
pred_ridge = predict_labels(w_ridge, X)
accuracy_ridge = compute_accuracy(pred_ridge, y)
print(accuracy_ridge)

In [None]:
# Baseline: logistic_regression started from an all-zero weight vector, scored
# on the training data. Only the first returned value (the weights) is used.
# NOTE(review): confirm that the label encoding of `y` as loaded matches what
# logistic_regression and predict_labels expect.
w_log_reg = logistic_regression(y, X, np.zeros(X.shape[1]), 1000, 0.01)[0]
pred_log_reg = predict_labels(w_log_reg, X)
accuracy_log_reg = compute_accuracy(pred_log_reg, y)
print(accuracy_log_reg)

In [None]:
# Baseline: reg_logistic_regression started from an all-zero weight vector,
# scored on the training data. Only the first returned value (the weights) is
# used.
w_reg_log_reg = reg_logistic_regression(y, X, 0.01, np.zeros(X.shape[1]), 1000, 0.01)[0]
pred_reg_log_reg = predict_labels(w_reg_log_reg, X)
accuracy_reg_log_reg = compute_accuracy(pred_reg_log_reg, y)
print(accuracy_reg_log_reg)

## Generate predictions and save output in CSV format for submission:

In [None]:
# Test CSV location, relative to the notebook's working directory.
DATA_TEST_PATH = '../data/test.csv'
# Load the test set. Only the feature matrix and the event ids are kept; the
# first and last returned values are discarded.
_, tX_test, ids_test, _ = load_csv_data(DATA_TEST_PATH)

In [None]:
# Split the test data with the same split_data helper used for training.
# `features` is the value loaded together with the TRAINING data; no labels are
# passed, and the third returned value is discarded.
test_split_indices, X_test_split, _ = split_data(features, tX_test)

In [None]:
# Preprocess every test subset like its training counterpart: drop features
# above the 0.2 NaN threshold, then standardize with the mean / std recorded for
# the matching TRAINING subset. Only the standardized matrix is kept.
# NOTE(review): the NaN filter is evaluated on the test subset itself -- confirm
# that it always removes the same columns as it did for the training subset.
X_test_split_std = [
    standardize(remove_NaN_features(subset, 0.2), train_mean, train_std)[0]
    for subset, train_mean, train_std in zip(X_test_split, mean_split, std_split)
]

In [None]:
# Predict the test labels with the final sub-models: expand each standardized
# test subset exactly like the training data, then let the sub-model of the
# matching subset predict its rows.
X_test_split_poly = list(map(model_split_data, X_test_split_std))
# rows that no subset writes to keep the initial prediction 1
y_pred = np.ones(tX_test.shape[0])

for submodel, X_poly, rows in zip(models, X_test_split_poly, test_split_indices):
    y_pred[rows] = submodel.predict(X_poly)

In [None]:
# Destination of the submission file, relative to the notebook's working
# directory.
OUTPUT_PATH = '../results/predictions.csv'
# Write the test event ids together with their predicted labels to OUTPUT_PATH.
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)