In [31]:
# Useful starting lines
%matplotlib inline
import numpy as np
import itertools
import matplotlib.pyplot as plt
import csv
import helpers as helpers
import implementations as impl
import cross_validation as cv
import hyperparameter_opti as hpopt

In [2]:
# Load data
train_data, train_labels, ids = helpers.load_data('train.csv')

In [3]:
# Clean and standardize data
new_data = helpers.standardize(helpers.clean_data(train_data))
train_labels[train_labels==-1]=0

In [4]:
#Add bias to data:
new_data = np.c_[np.ones((new_data.shape[0], 1)), new_data]

**Least Squares:**

In [6]:
weights_ls, loss_ls = impl.least_squares(train_labels, new_data)

In [7]:
tx_test, y_test, ids_test = helpers.load_data('test.csv', train=False)
tx_test = helpers.standardize(helpers.clean_data(tx_test))
tx_test = np.c_[np.ones((tx_test.shape[0], 1)), tx_test]

In [8]:
y_test = impl.test_data(weights_ls, tx_test)

In [9]:
helpers.create_csv_submission(ids_test, y_test, 'Predictions_LS.csv')

**Ridge regression:**

In [19]:
#find the best degree to build a polynomial basis from the training data:
degree_opt, lambda_opt = cv.best_degree_selection(train_labels, new_data, np.arange(2,5), 4, np.logspace(-4,0,4))

In [20]:
print(degree_opt)
print(lambda_opt)

2
0.0001


In [21]:
print(new_data.shape)
print(tx_test.shape)

(250000, 31)
(568238, 31)


In [24]:
#build polynomial basis from the entire training set using the optimal degree:
xpoly = cv.build_poly(new_data, degree_opt)
print(xpoly.shape)
#compute weights and loss for the optimal lambda:
w_ridge, loss_ridge = impl.ridge_regression(train_labels, xpoly, lambda_opt)
print(w_ridge.shape)

(250000, 63)
(63,)


In [25]:
xpoly_test = cv.build_poly(tx_test, degree_opt)
print(xpoly_test.shape)
y_test = impl.test_data(w_ridge, xpoly_test)

(568238, 63)


In [26]:
helpers.create_csv_submission(ids_test, y_test, 'Predictions_RG.csv')

**Logistic Regression:**

In [7]:
# Shuffle data
train_labels, new_data = helpers.shuffle_data(train_labels, new_data)

In [8]:
# Slice into training and validation sets
y_validation, y_train, tx_validation, tx_train = helpers.slice_data(train_labels, new_data, 0.25)

In [9]:
# Add bias to data
tx_train = np.c_[np.ones((y_train.shape[0], 1)), tx_train]
tx_validation = np.c_[np.ones((y_validation.shape[0], 1)), tx_validation]

In [None]:
# Initialize the weights randomly according to a Gaussian distribution
initial_w = np.random.normal(0., 0.1, [tx_train.shape[1],])

In [10]:
#Find the value of gamma that minimizes the loss:
gamma_opt = hpopt.best_gamma_selection(y_train, tx_train, 1000)
# Train model
trained_weights, train_loss = impl.logistic_regression(y_train, tx_train, initial_w, max_iters=1000, gamma=gamma_opt)

Current iteration=0, loss=0.7163567950814637
Current iteration=100, loss=0.6227581517924388
Current iteration=200, loss=0.584904560206431
Current iteration=300, loss=0.5641307100362332
Current iteration=400, loss=0.5512503018068265
Current iteration=500, loss=0.5426562963788711
Current iteration=600, loss=0.5366105979203447
Current iteration=700, loss=0.5321775687371848
Current iteration=800, loss=0.5288137652729488
Current iteration=900, loss=0.526185117200647


In [None]:
# Cross validation
predict_validation = helpers.predict_logistic(tx_validation, trained_weights)
predict_train = helpers.predict_logistic(tx_train, trained_weights)

predict_validation[predict_validation == -1] = 0
predict_train[predict_train == -1] = 0

train_accuracy = helpers.accuracy(predict_train, y_train)
validation_accuracy = helpers.accuracy(predict_validation, y_validation)

print(f"train_accuracy = {train_accuracy}")
print(f"validation_accuracy = {validation_accuracy}")

In [12]:
tx_test, y_test, ids_test = helpers.load_data('test.csv', train=False)
tx_test = helpers.standardize(helpers.clean_data(tx_test))

In [13]:
tx_test = np.c_[np.ones((y_test.shape[0], 1)), tx_test]
predict_test = helpers.predict_logistic(tx_test, trained_weights)
print(predict_test)

[-1. -1. -1. ... -1.  1. -1.]


In [14]:
helpers.create_csv_submission(ids_test, predict_test, 'Predictions_Logistics.csv')

**Regularized Logistic Regression:**

In [48]:
# Shuffle data
train_labels, new_data = helpers.shuffle_data(train_labels, new_data)

In [49]:
# Initialize the weights randomly according to a Gaussian distribution
initial_w = np.random.normal(0., 0.1, [new_data.shape[1],])

In [50]:
print(train_labels.shape)
print(new_data.shape)

(250000,)
(250000, 31)


In [51]:
# Initialize the weights randomly according to a Gaussian distribution
initial_w = np.random.normal(0., 0.1, [train_data.shape[1],])

In [44]:
#Find the most optimal values for the regularization term (lambda) and gamma:
gamma_opt = hpopt.best_gamma_selection(train_labels, train_data, 1000)
lambda_opt = hpopt.best_lambda_selection(train_labels, train_data, 1000, gamma=gamma_opt)

Current gamma=1e-06
Current iteration=0, loss=0.778252425871402
Current iteration=100, loss=0.7782333628015712
Current iteration=200, loss=0.7782143023450276
Current iteration=300, loss=0.7781952445015405
Current iteration=400, loss=0.7781761892708785
Current iteration=500, loss=0.7781571366528102
Current iteration=600, loss=0.7781380866471043
Current iteration=700, loss=0.7781190392535294
Current iteration=800, loss=0.7780999944718548
Current iteration=900, loss=0.7780809523018483
training_loss = 0.7780619127432786, validation_loss = 0.9494525485397384
Current gamma=1e-05
Current iteration=0, loss=0.778252425871402
Current iteration=100, loss=0.7780619121559318
Current iteration=200, loss=0.7778716595623911
Current iteration=300, loss=0.7776816678591929
Current iteration=400, loss=0.7774919368144765
Current iteration=500, loss=0.7773024661961094
Current iteration=600, loss=0.777113255771688
Current iteration=700, loss=0.7769243053085393
Current iteration=800, loss=0.7767356145737221
C

  return 1.0 / (1 + np.exp(-t))


Current it = 100
Current it = 200
Current it = 300
Current it = 400
Current it = 500
Current it = 600
Current it = 700
Current it = 800
Current it = 900
training_loss = 10601.812560597451, validation_loss = 15784.931112396547
Current lambda=1.0
Current it = 0
Current it = 100
Current it = 200
Current it = 300
Current it = 400
Current it = 500
Current it = 600
Current it = 700
Current it = 800
Current it = 900


  loss = np.sum(np.logaddexp(0, tx @ w) - y * tx.dot(w))/tx.shape[0]
  loss_validation = (np.sum(np.logaddexp(0, tx_va.dot(w)) + y_va * tx_va.dot(w)) + lambda_*np.linalg.norm(w)**2)/y_va.shape[0]


training_loss = nan, validation_loss = nan
Current lambda=10.0
Current it = 0
Current it = 100


  grad = (tx.T.dot(sigmoid(tx.dot(w)) - y))/tx.shape[0] + 2*lambda_*w


Current it = 200
Current it = 300
Current it = 400
Current it = 500
Current it = 600
Current it = 700
Current it = 800
Current it = 900
training_loss = nan, validation_loss = nan
Best lambda = 1.0, training_loss = nan, validation_loss = nan


In [52]:
trained_weights, train_loss = impl.reg_logistic_regression(train_labels, new_data, 0.01, initial_w, max_iters=1000, gamma=1.5)

ValueError: shapes (250000,31) and (32,) not aligned: 31 (dim 1) != 32 (dim 0)

In [None]:
#Cross validation:
predict_validation = helpers.predict_logistic(tx_validation, trained_weights)
predict_train = helpers.predict_logistic(tx_train, trained_weights)

predict_validation[predict_validation == -1] = 0
predict_train[predict_train == -1] = 0

train_accuracy = helpers.accuracy(predict_train, y_train)
validation_accuracy = helpers.accuracy(predict_validation, y_validation)

print(f"train_accuracy = {train_accuracy}")
print(f"validation_accuracy = {validation_accuracy}")

In [None]:
tx_test, y_test, ids_test = helpers.load_data('test.csv', train=False)
tx_test = helpers.standardize(helpers.clean_data(tx_test))

In [None]:
tx_test = np.c_[np.ones((y_test.shape[0], 1)), tx_test]
predict_test = helpers.predict_logistic(tx_test, trained_weights)
print(predict_test)

In [None]:
helpers.create_csv_submission(ids_test, predict_test, 'Predictions_RegLogistics.csv')