In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import itertools
import matplotlib.pyplot as plt
import csv
import helpers as helpers
import implementations as impl
import cross_validation as cv
import hyperparameter_opti as hpopt

**Data processing**

In [37]:
# Load the training set from 'train.csv'.
# The helper returns three objects, bound here as the feature matrix, the labels and the sample ids.
tx_train, y_train, ids_train = helpers.load_data('train.csv')

In [38]:
# Step 1: run the cleaning helper on the raw training features.
tx_train = helpers.clean_data(tx_train)

# Step 2: standardize the cleaned features; the returned mean and std are kept
# so that the test set can later be scaled with the training statistics.
tx_train, mean_train, std_train = helpers.standardize(tx_train)

In [39]:
# Read the test set (train=False is forwarded to the loader).
tx_test, y_test, ids_test = helpers.load_data('test.csv', train=False)

# Clean the test features, then scale them with the statistics computed on the
# training set (mean_train / std_train) rather than with their own.
tx_test = helpers.clean_data(tx_test)
tx_test, _, _ = helpers.standardize(tx_test, mean_train, std_train)

**Least Squares:**

In [None]:
# Prepend a column of ones (bias / intercept term) to the training features.
tx_train_ls = np.column_stack((np.ones(len(tx_train)), tx_train))

In [6]:
# Fit least squares on the bias-augmented training features.
# The call returns the weight vector and the associated loss.
w_ls, loss_ls = impl.least_squares(y_train, tx_train_ls)

In [7]:
# Predict labels for the training samples using the least-squares weights.
y_predict_ls = helpers.predict(tx_train_ls, w_ls)

In [8]:
# Compute the training accuracy: predicted labels compared against the true training labels.
training_accuracy_ls = helpers.accuracy(y_predict_ls, y_train)

In [9]:
# Display the training accuracy of the least-squares model.
print(training_accuracy_ls)

0.682832


In [None]:
# Prepend a column of ones (bias / intercept term) to the test features,
# mirroring what was done for the training matrix.
tx_test_ls = np.column_stack((np.ones(len(tx_test)), tx_test))

In [10]:
# Predict test labels with the least-squares weights fitted on the training set.
y_test_ls = helpers.predict(tx_test_ls, w_ls)

In [None]:
# Write the least-squares test predictions, keyed by test sample id, to 'Predictions_LS.csv'.
helpers.create_csv_submission(ids_test, y_test_ls, 'Predictions_LS.csv')

**Ridge regression:**

In [14]:
# Hyperparameters of the ridge model: the polynomial degree passed to
# cv.build_poly, and the third argument of impl.ridge_regression
# (presumably the regularization strength - confirm against impl).
degree_ridge = 2
lambda_ridge = 0.001

# Expand the whole training set into the polynomial basis, then prepend the bias column.
tx_train_ridge = cv.build_poly(tx_train, degree_ridge)
tx_train_ridge = np.column_stack((np.ones(len(tx_train_ridge)), tx_train_ridge))

# Fit ridge regression on the expanded features; returns weights and loss.
w_ridge, loss_ridge = impl.ridge_regression(y_train, tx_train_ridge, lambda_ridge)

In [17]:
# Predict training labels with the ridge weights (same polynomial + bias features used for fitting).
y_predict_ridge = helpers.predict(tx_train_ridge, w_ridge)

# Compute training accuracy: predictions compared against the true training labels.
training_accuracy_ridge = helpers.accuracy(y_predict_ridge, y_train)

# Display the ridge training accuracy.
print(training_accuracy_ridge)

0.701296


In [19]:
# Apply the same degree-2 polynomial expansion to the test features...
tx_test_ridge = cv.build_poly(tx_test, 2)

# ...and prepend the bias column so the layout matches the training matrix.
tx_test_ridge = np.column_stack((np.ones(len(tx_test_ridge)), tx_test_ridge))

# Predict the test labels with the ridge weights.
y_test_ridge = helpers.predict(tx_test_ridge, w_ridge)

In [None]:
# Write the ridge test predictions, keyed by test sample id, to 'Predictions_RG.csv'.
helpers.create_csv_submission(ids_test, y_test_ridge, 'Predictions_RG.csv')

**Logistic Regression:**

In [20]:
# Recode the labels for logistic regression: -1 becomes 0, every other value is kept.
# np.where builds a new array, so y_train itself is left untouched.
y_train_log = np.where(y_train == -1, 0, y_train)

In [21]:
# Shuffle labels and features together before the train/validation split.
#
# Fix: the original passed `tx_train_log`, which is not defined anywhere before
# this cell, so a fresh kernel raised NameError. Start from the standardized
# training features instead. A copy is passed (like `y_train.copy()` for the
# labels) so that `tx_train` stays row-aligned with `y_train` for later
# sections even if the helper shuffles in place.
#
# Seed NumPy's global RNG so the split is reproducible.
# NOTE(review): assumes helpers.shuffle_data draws from np.random - confirm.
np.random.seed(1)
y_train_log, tx_train_log = helpers.shuffle_data(y_train_log, tx_train.copy())

In [22]:
# Slice the shuffled data into validation and training sets.
# The helper returns (validation labels, training labels, validation features, training features);
# 0.25 is presumably the fraction held out for validation - TODO confirm in helpers.slice_data.
y_validation_log, y_train_log, tx_validation_log, tx_train_log = helpers.slice_data(y_train_log, tx_train_log, 0.25)

In [None]:
# Prepend a column of ones (bias / intercept term) to both splits.
tx_train_log = np.column_stack((np.ones(len(tx_train_log)), tx_train_log))
tx_validation_log = np.column_stack((np.ones(len(tx_validation_log)), tx_validation_log))

In [23]:
# Initialize the weights randomly according to a Gaussian distribution
# (mean 0, standard deviation 0.1), one weight per column of the
# bias-augmented training matrix.
#
# Seed NumPy's global RNG first: the draw was previously unseeded, so a
# Restart & Run All could not reproduce the same starting point.
np.random.seed(1)
initial_w = np.random.normal(0., 0.1, [tx_train_log.shape[1],])

In [25]:
# Train logistic regression on the training split, starting from initial_w.
# max_iters=3000 and gamma=0.003 are passed through to the implementation
# (gamma is presumably the gradient step size - confirm in impl).
# The captured output shows the loss being logged every 100 iterations.
w_log, loss_log = impl.logistic_regression(y_train_log, tx_train_log, initial_w, max_iters=3000, gamma=0.003)

Current iteration=0, loss=0.7178237274451356
Current iteration=100, loss=0.6754602627854734
Current iteration=200, loss=0.6477562918396289
Current iteration=300, loss=0.6278412106289103
Current iteration=400, loss=0.6126128262571284
Current iteration=500, loss=0.6004933816783325
Current iteration=600, loss=0.5905862435316124
Current iteration=700, loss=0.5823326948137376
Current iteration=800, loss=0.5753584090569795
Current iteration=900, loss=0.5693985011413968
Current iteration=1000, loss=0.5642578353080218
Current iteration=1100, loss=0.5597883477005114
Current iteration=1200, loss=0.5558751759430723
Current iteration=1300, loss=0.5524276699169159
Current iteration=1400, loss=0.5493732879272637
Current iteration=1500, loss=0.5466532990540784
Current iteration=1600, loss=0.5442196717414793
Current iteration=1700, loss=0.5420327720351004
Current iteration=1800, loss=0.5400596313267655
Current iteration=1900, loss=0.5382726242123334
Current iteration=2000, loss=0.5366484472171642
Curr

In [26]:
# Hold-out evaluation: score the trained weights on the validation and training splits.
predict_validation_log = helpers.predict_logistic(tx_validation_log, w_log)
predict_train_log = helpers.predict_logistic(tx_train_log, w_log)

# The targets were recoded to {0, 1}, so map every predicted -1 to 0 (in place)
# before comparing predictions with the targets.
for predicted_labels in (predict_validation_log, predict_train_log):
    predicted_labels[predicted_labels == -1] = 0

train_accuracy_log = helpers.accuracy(predict_train_log, y_train_log)
validation_accuracy_log = helpers.accuracy(predict_validation_log, y_validation_log)

print(f"train_accuracy = {train_accuracy_log}")
print(f"validation_accuracy = {validation_accuracy_log}")

train_accuracy = 0.7349653333333334
validation_accuracy = 0.734784


In [None]:
# Prepend a column of ones (bias / intercept term) to the test features
# so they match the layout of the logistic-regression training matrix.
tx_test_log = np.column_stack((np.ones(len(tx_test)), tx_test))

In [28]:
# Predict the test labels with the logistic-regression weights and show them.
# The captured output shows labels of -1 and 1; these are written unchanged to the submission file.
predict_test_log = helpers.predict_logistic(tx_test_log, w_log)
print(predict_test_log)

[-1. -1. -1. ... -1.  1. -1.]


In [None]:
# Write the logistic-regression test predictions, keyed by test sample id, to 'Predictions_Logistics.csv'.
helpers.create_csv_submission(ids_test, predict_test_log, 'Predictions_Logistics.csv')

**Logistic regression with polynomial features:**

In [40]:
# Build the polynomial feature expansion of the standardized training set
# (degree chosen previously). The printed shape shows the result has 495 columns;
# presumably these are degree-2 terms including pairwise products - TODO confirm in helpers.build_poly_deg2.
tx_train_log2 = helpers.build_poly_deg2(tx_train)
print(tx_train_log2.shape)

(250000, 495)


In [41]:
# Prepend a column of ones (bias / intercept term) to the polynomial training features.
tx_train_log2 = np.column_stack((np.ones(len(tx_train_log2)), tx_train_log2))

In [42]:
# Recode the labels for logistic regression: -1 becomes 0, every other value is kept.
# np.where builds a new array, so y_train itself is left untouched.
y_train_log2 = np.where(y_train == -1, 0, y_train)

In [43]:
# Reinitialize the weight vector so that its length matches the number of
# columns of the polynomial feature matrix (bias column included).
# Weights are drawn from a Gaussian with mean 0 and standard deviation 0.1.
#
# Seed NumPy's global RNG first: the draw was previously unseeded, so a
# Restart & Run All could not reproduce the same starting point.
np.random.seed(1)
initial_w = np.random.normal(0., 0.1, [tx_train_log2.shape[1],])
print(initial_w.shape)

(496,)


In [44]:
# Train logistic regression on the full polynomial training set, starting from initial_w.
# max_iters=3000 and gamma=0.003 are passed through to the implementation
# (gamma is presumably the gradient step size - confirm in impl).
# NOTE(review): the captured output contains a fragment of the sigmoid expression
# `1 / (1 + np.exp(-t))`, which looks like a NumPy runtime warning (likely overflow in exp) - confirm.
w_log2, loss_log2 = impl.logistic_regression(y_train_log2, tx_train_log2, initial_w, max_iters=3000, gamma=0.003)

  return 1 / (1 + np.exp(-t))


Current iteration=0, loss=1.1767571947949713
Current iteration=100, loss=0.8252632049159756
Current iteration=200, loss=0.737520893109982
Current iteration=300, loss=0.6867324648049475
Current iteration=400, loss=0.6519676421654641
Current iteration=500, loss=0.6248088751731362
Current iteration=600, loss=0.6026980585469703
Current iteration=700, loss=0.5842254315378697
Current iteration=800, loss=0.5685323050621113
Current iteration=900, loss=0.5550148400626531
Current iteration=1000, loss=0.5432608119181062
Current iteration=1100, loss=0.5329644183554313
Current iteration=1200, loss=0.523890642773351
Current iteration=1300, loss=0.5158511692693404
Current iteration=1400, loss=0.5086934819979531
Current iteration=1500, loss=0.5022918980825637
Current iteration=1600, loss=0.49654196316337434
Current iteration=1700, loss=0.491357207667324
Current iteration=1800, loss=0.4866657856151398
Current iteration=1900, loss=0.4824071081971684
Current iteration=2000, loss=0.4785295010161574
Curren

In [48]:
# Predict train labels with the polynomial logistic-regression weights.
predict_train_log2 = helpers.predict_logistic(tx_train_log2, w_log2)
# The targets were recoded to {0, 1}, so map every predicted -1 to 0 before comparing.
predict_train_log2[predict_train_log2 == -1] = 0

# Compute training accuracy.
# Fix: pass (predictions, labels) in the same order as every other
# helpers.accuracy call in this notebook; the original had them swapped.
# Note this is accuracy on the data the model was trained on, not a held-out estimate.
training_accuracy_log2 = helpers.accuracy(predict_train_log2, y_train_log2)

print(training_accuracy_log2)

0.794816


In [51]:
# Apply the same polynomial expansion to the standardized test features...
tx_test_log2 = helpers.build_poly_deg2(tx_test)

# ...and prepend the bias column so the layout matches the training matrix.
tx_test_log2 = np.column_stack((np.ones(len(tx_test_log2)), tx_test_log2))

# Predict the test labels with the polynomial logistic-regression weights.
y_predict_log2 = helpers.predict_logistic(tx_test_log2, w_log2)

In [None]:
# Write the polynomial logistic-regression test predictions, keyed by test sample id,
# to 'Predictions_Logistics_Polynomial.csv'.
helpers.create_csv_submission(ids_test, y_predict_log2, 'Predictions_Logistics_Polynomial.csv')