In [32]:
# Useful starting lines
%matplotlib inline
import numpy as np
import itertools
import matplotlib.pyplot as plt
import csv
import helpers as helpers
import implementations as impl
import cross_validation as cv
import hyperparameter_opti as hpopt

In [2]:
# Load data
train_data, train_labels, ids = helpers.load_data('train.csv')

In [4]:
# Clean and standardize data
new_data = helpers.standardize(helpers.clean_data(train_data))
train_labels[train_labels==-1]=0

In [5]:
#Add bias to data:
new_data = np.c_[np.ones((new_data.shape[0], 1)), new_data]

In [6]:
#Load test data:
tx_test, y_test, ids_test = helpers.load_data('test.csv', train=False)
#Standardize test data:
tx_test = helpers.standardize(helpers.clean_data(tx_test))
#Add bias to test data:
tx_test = np.c_[np.ones((tx_test.shape[0], 1)), tx_test]

**Least Squares:**

In [6]:
weights_ls, loss_ls = impl.least_squares(train_labels, new_data)

In [8]:
y_test = impl.test_data(weights_ls, tx_test)

In [9]:
helpers.create_csv_submission(ids_test, y_test, 'Predictions_LS.csv')

**Ridge regression:**

In [19]:
#find the best degree to build a polynomial basis from the training data:
degree_opt, lambda_opt = cv.best_degree_selection_ridge(train_labels, new_data, np.arange(2,5), 4, np.logspace(-4,0,4))

In [20]:
print(degree_opt)
print(lambda_opt)

2
0.0001


In [21]:
print(new_data.shape)
print(tx_test.shape)

(250000, 31)
(568238, 31)


In [24]:
#build polynomial basis from the entire training set using the optimal degree:
xpoly = cv.build_poly(new_data, degree_opt)
print(xpoly.shape)
#compute weights and loss for the optimal lambda:
w_ridge, loss_ridge = impl.ridge_regression(train_labels, xpoly, lambda_opt)
print(w_ridge.shape)

(250000, 63)
(63,)


In [25]:
#build polynomial basis from the test set:
xpoly_test = cv.build_poly(tx_test, degree_opt)
print(xpoly_test.shape)
#Compute predicted labels:
y_test = impl.test_data(w_ridge, xpoly_test)

(568238, 63)


In [26]:
helpers.create_csv_submission(ids_test, y_test, 'Predictions_RG.csv')

**Logistic Regression:**

In [13]:
# Shuffle data
train_labels, new_data = helpers.shuffle_data(train_labels, new_data)

In [14]:
# Slice into training and validation sets
y_validation, y_train, tx_validation, tx_train = helpers.slice_data(train_labels, new_data, 0.25)

In [15]:
# Initialize the weights randomly according to a Gaussian distribution
initial_w = np.random.normal(0., 0.1, [tx_train.shape[1],])

In [16]:
#Find the value of gamma that minimizes the loss:
gamma_opt = hpopt.best_gamma_selection(y_train, tx_train, 1000)
# Train model
trained_weights, train_loss = impl.logistic_regression(y_train, tx_train, initial_w, max_iters=1000, gamma=gamma_opt)

Current gamma=1e-06
Current iteration=0, loss=0.7473898389524194
Current iteration=100, loss=0.747363250564084
Current iteration=200, loss=0.7473366665161033
Current iteration=300, loss=0.7473100868077831
Current iteration=400, loss=0.7472835114384289
Current iteration=500, loss=0.7472569404073467
Current iteration=600, loss=0.7472303737138422
Current iteration=700, loss=0.7472038113572215
Current iteration=800, loss=0.7471772533367894
Current iteration=900, loss=0.7471506996518528
training_loss = 0.7471241503017174, validation_loss = 0.51774464810816
Current gamma=7.498942093324558e-06
Current iteration=0, loss=0.7473898389524194
Current iteration=100, loss=0.7471905593727567
Current iteration=200, loss=0.746991523616922
Current iteration=300, loss=0.7467927313922089
Current iteration=400, loss=0.7465941824059665
Current iteration=500, loss=0.7463958763655983
Current iteration=600, loss=0.7461978129785657
Current iteration=700, loss=0.745999991952388
Current iteration=800, loss=0.7458

In [17]:
# Cross validation
predict_validation = helpers.predict_logistic(tx_validation, trained_weights)
predict_train = helpers.predict_logistic(tx_train, trained_weights)

predict_validation[predict_validation == -1] = 0
predict_train[predict_train == -1] = 0

train_accuracy = helpers.accuracy(predict_train, y_train)
validation_accuracy = helpers.accuracy(predict_validation, y_validation)

print(f"train_accuracy = {train_accuracy}")
print(f"validation_accuracy = {validation_accuracy}")

train_accuracy = 0.7263573333333333
validation_accuracy = 0.727312


In [18]:
predict_test = helpers.predict_logistic(tx_test, trained_weights)
print(predict_test)

[-1. -1. -1. ... -1.  1. -1.]


In [19]:
helpers.create_csv_submission(ids_test, predict_test, 'Predictions_Logistics.csv')

**Logistic regression with polynomial features:**

In [10]:
#Use cross validation to find the best degree (prevent overfitting):
degree_opt = cv.best_degree_selection_logistic(train_labels, new_data, 1000, 0.003, np.arange(0,5,1), 4)

Current iteration=0, loss=0.7134303278406444
Current iteration=100, loss=0.7038088924337598
Current iteration=200, loss=0.6955266212239213
Current iteration=300, loss=0.6883982334777878
Current iteration=400, loss=0.6822628318628193
Current iteration=500, loss=0.6769812224565813
Current iteration=600, loss=0.6724333364069965
Current iteration=700, loss=0.6685158271684682
Current iteration=800, loss=0.6651398817597853
Current iteration=900, loss=0.6622292601166271
Current iteration=0, loss=0.6821858949373834
Current iteration=100, loss=0.6767444908119747
Current iteration=200, loss=0.6720590033966609
Current iteration=300, loss=0.6680229347964532
Current iteration=400, loss=0.6645447631095672
Current iteration=500, loss=0.6615459254931555
Current iteration=600, loss=0.6589590320819722
Current iteration=700, loss=0.656726299868922
Current iteration=800, loss=0.6547981897204165
Current iteration=900, loss=0.6531322269199542
Current iteration=0, loss=0.6901601098315059
Current iteration=10

  return 1 / (1 + np.exp(-t))


Current iteration=100, loss=0.6432669368537081
Current iteration=200, loss=0.5807195350196399
Current iteration=300, loss=0.5575282307207049
Current iteration=400, loss=0.542708086681093
Current iteration=500, loss=0.5315256148032612
Current iteration=600, loss=0.5225995836671046
Current iteration=700, loss=0.5152607414736856
Current iteration=800, loss=0.509107187978798
Current iteration=900, loss=0.5038687979692179
Current iteration=0, loss=2.2759750033948687
Current iteration=100, loss=0.6752075428394545
Current iteration=200, loss=0.5949050736157269
Current iteration=300, loss=0.5542592376956457
Current iteration=400, loss=0.5384122151849594
Current iteration=500, loss=0.5268864816771814
Current iteration=600, loss=0.517736730629972
Current iteration=700, loss=0.5101829997370434
Current iteration=800, loss=0.5037439181019883
Current iteration=900, loss=0.4982305385945275
Current iteration=0, loss=2.274976829081178
Current iteration=100, loss=0.6983884206805918
Current iteration=200

In [27]:
#Build polynomial with the optimal degree found previously:
train_poly = cv.build_poly(new_data, 4)
print(train_poly.shape)

(250000, 125)


In [28]:
#Reinitialize weight vector such that the dimensions are adapted to the polynomial feature matrix:
initial_w = np.random.normal(0., 0.1, [train_poly.shape[1],])
print(initial_w.shape)

(125,)


In [29]:
#Train the model with polynomial features:
trained_weights_poly, train_loss_poly = impl.logistic_regression(train_labels, train_poly, initial_w, max_iters=1000, gamma=0.003)

Current iteration=0, loss=7.626934296379402
Current iteration=100, loss=4.360285865953201
Current iteration=200, loss=1.3865786653010366
Current iteration=300, loss=2.4064495870187135
Current iteration=400, loss=15.73407291954145
Current iteration=500, loss=1.7192420896345593
Current iteration=600, loss=2.263713930892244
Current iteration=700, loss=1.3609727607266202
Current iteration=800, loss=1.321732455500975
Current iteration=900, loss=3.6631454303775137


In [30]:
#Build polynomial basis from test data:
test_poly = cv.build_poly(tx_test, 4)
print(test_poly.shape)
print(trained_weights_poly.shape)
#Compute predicted labels:
predicted_labels = helpers.predict_logistic(test_poly, trained_weights_poly)

(568238, 125)
(125,)


In [31]:
helpers.create_csv_submission(ids_test, predicted_labels, 'Predictions_Logistics_Polynomial.csv')

**Regularized Logistic Regression:**

In [9]:
# Shuffle data
train_labels, new_data = helpers.shuffle_data(train_labels, new_data)

In [10]:
# Initialize the weights randomly according to a Gaussian distribution
initial_w = np.random.normal(0., 0.1, [new_data.shape[1],])

In [11]:
print(train_labels.shape)
print(new_data.shape)

(250000,)
(250000, 31)


In [12]:
#Find the most optimal values for the regularization term (lambda) and gamma:
gamma_opt = hpopt.best_gamma_selection(train_labels, train_data, 1000)
lambda_opt = hpopt.best_lambda_selection(train_labels, train_data, 1000, gamma=gamma_opt)

Current gamma=1e-06
Current iteration=0, loss=74.99274837100322
Current iteration=100, loss=63.90647630054176
Current iteration=200, loss=53.02343254839472
Current iteration=300, loss=44.368158332960675
Current iteration=400, loss=35.82468729084078
Current iteration=500, loss=27.30095531012157
Current iteration=600, loss=18.81347756457517
Current iteration=700, loss=11.95380385074749
Current iteration=800, loss=9.503195366473074
Current iteration=900, loss=7.751785287349152
training_loss = 6.1669070611112256, validation_loss = 9.739648296571119
Current gamma=7.498942093324558e-06
Current iteration=0, loss=74.99274837100322
Current iteration=100, loss=10.509610190754243
Current iteration=200, loss=3.701115962311486
Current iteration=300, loss=3.7455438017163165
Current iteration=400, loss=3.670723311723105
Current iteration=500, loss=3.653082424181691
Current iteration=600, loss=3.652305190781663
Current iteration=700, loss=3.3775144486288284
Current iteration=800, loss=2.50751200050300

  return 1 / (1 + np.exp(-t))


Current iteration=100, loss=288.5998504558666
Current iteration=200, loss=277.4259364801881
Current iteration=300, loss=311.6132322804477
Current iteration=400, loss=233.80921590042038
Current iteration=500, loss=130.36664938852536
Current iteration=600, loss=340.08096962468056
Current iteration=700, loss=352.2575230792863
Current iteration=800, loss=257.32160121572747
Current iteration=900, loss=230.92136708567338
training_loss = 133.61811692541337, validation_loss = -132.94876036141915
Current gamma=0.0031622776601683794
Current iteration=0, loss=74.99274837100322
Current iteration=100, loss=1803.5479482008486
Current iteration=200, loss=2698.8193117157975
Current iteration=300, loss=1850.6770403982462
Current iteration=400, loss=2333.647888915868
Current iteration=500, loss=878.2525509373368
Current iteration=600, loss=1606.302304471584
Current iteration=700, loss=1617.6304754495534
Current iteration=800, loss=908.8648642171975
Current iteration=900, loss=120.91549526963401
training

  return 1.0 / (1 + np.exp(-t))


Current it = 100
Current it = 200
Current it = 300
Current it = 400
Current it = 500
Current it = 600
Current it = 700
Current it = 800
Current it = 900
training_loss = 297314.1500673294, validation_loss = -295925.23013583047
Current lambda=0.00031622776601683794
Current it = 0
Current it = 100
Current it = 200
Current it = 300
Current it = 400
Current it = 500
Current it = 600
Current it = 700
Current it = 800
Current it = 900
training_loss = 229577.1774475597, validation_loss = -228541.58335607703
Current lambda=0.1
Current it = 0
Current it = 100
Current it = 200
Current it = 300
Current it = 400
Current it = 500
Current it = 600
Current it = 700
Current it = 800
Current it = 900
training_loss = 929082.2731807932, validation_loss = -803379.3659650225
Best lambda = 0.1, training_loss = 929082.2731807932, validation_loss = -803379.3659650225


In [13]:
trained_weights, train_loss = impl.reg_logistic_regression(train_labels, new_data, lambda_opt, initial_w, max_iters=1000, gamma=gamma_opt)

Current it = 0
Current it = 100
Current it = 200
Current it = 300
Current it = 400
Current it = 500
Current it = 600
Current it = 700
Current it = 800
Current it = 900


In [15]:
tx_test, y_test, ids_test = helpers.load_data('test.csv', train=False)
tx_test = helpers.standardize(helpers.clean_data(tx_test))

In [16]:
tx_test = np.c_[np.ones((y_test.shape[0], 1)), tx_test]
predict_test = helpers.predict_logistic(tx_test, trained_weights)
print(predict_test)

[-1. -1. -1. ...  1.  1. -1.]


In [17]:
helpers.create_csv_submission(ids_test, predict_test, 'Predictions_RegLogistics.csv')