In [5]:
# Useful starting lines
%matplotlib inline
import numpy as np
import itertools
import matplotlib.pyplot as plt
import csv
import helpers as helpers
import implementations as impl

In [6]:
# Load train data
# helpers.load_data('train.csv') returns three values, unpacked in order as
# tx_train, y_train and ids_train (features, labels and sample ids, judging by
# the names and by how they are used in the cells below).
tx_train, y_train, ids_train = helpers.load_data('train.csv')

In [7]:
# Recode the -1 labels as 0 (in place) for logistic regression
y_train[np.equal(y_train, -1)] = 0

In [8]:
# Shuffle data
# helpers.shuffle_data takes (y, tx) and its two results are bound back to
# y_train and tx_train, replacing the unshuffled arrays.
# NOTE(review): ids_train is not passed to the shuffle, so it presumably no
# longer lines up row-for-row with tx_train / y_train afterwards. The ids
# returned by split_i for the training data are discarded in this notebook,
# so nothing visible here depends on that alignment -- confirm before reusing
# ids_train.
# NOTE(review): no random seed is set anywhere in this notebook; if
# shuffle_data draws from NumPy's global RNG the row order differs on every
# run, and re-running this cell reshuffles again -- TODO confirm.
y_train, tx_train = helpers.shuffle_data(y_train, tx_train)

In [23]:
# Split, clean and standardize the training data into 4 sets (groups 0..3 of
# the 22nd feature). For every group, helpers.split_i returns a 4-tuple; the
# third element (ids) is not needed for training and is discarded, the fourth
# (miss_col_k) is kept so it can be passed back in when splitting the test set.
(
    (tx_train_0, y_0, _, miss_col_0),
    (tx_train_1, y_1, _, miss_col_1),
    (tx_train_2, y_2, _, miss_col_2),
    (tx_train_3, y_3, _, miss_col_3),
) = (
    helpers.split_i(tx_train, y_train, ids_train, group)
    for group in range(4)
)

In [None]:
# Show the shape of each training split, one per line.
print(
    tx_train_0.shape,
    tx_train_1.shape,
    tx_train_2.shape,
    tx_train_3.shape,
    sep="\n",
)

In [29]:
# Initialize the weights randomly according to a Gaussian distribution.
# The draws come from a generator seeded inside this cell, so a fresh
# "Restart & Run All" (or simply re-running this cell) always starts training
# from the same initial weights.
SEED = 42
INIT_STD = 0.1      # standard deviation of the initial weights (mean is 0)
MAX_ITERS = 20000   # max_iters passed to impl.logistic_regression
GAMMA = 0.01        # gamma passed to impl.logistic_regression (presumably the step size)

rng = np.random.default_rng(SEED)
# One weight per column of the corresponding split (the splits may differ in width).
initial_w_0 = rng.normal(0., INIT_STD, tx_train_0.shape[1])
initial_w_1 = rng.normal(0., INIT_STD, tx_train_1.shape[1])
initial_w_2 = rng.normal(0., INIT_STD, tx_train_2.shape[1])
initial_w_3 = rng.normal(0., INIT_STD, tx_train_3.shape[1])

# Train models: one independent logistic regression per split, all with the
# same hyper-parameters. Each call returns the fitted weights and a training loss.
w_0, train_loss_0 = impl.logistic_regression(y_0, tx_train_0, initial_w_0, max_iters=MAX_ITERS, gamma=GAMMA)
w_1, train_loss_1 = impl.logistic_regression(y_1, tx_train_1, initial_w_1, max_iters=MAX_ITERS, gamma=GAMMA)
w_2, train_loss_2 = impl.logistic_regression(y_2, tx_train_2, initial_w_2, max_iters=MAX_ITERS, gamma=GAMMA)
w_3, train_loss_3 = impl.logistic_regression(y_3, tx_train_3, initial_w_3, max_iters=MAX_ITERS, gamma=GAMMA)

Current iteration=0, loss=1.223972875168135
Current iteration=100, loss=0.510475168028821
Current iteration=200, loss=0.453246077889422
Current iteration=300, loss=0.4285402418527632
Current iteration=400, loss=0.41465919547508046
Current iteration=500, loss=0.40558674150795776
Current iteration=600, loss=0.39908576146713154
Current iteration=700, loss=0.39414552570842004
Current iteration=800, loss=0.3902417691006348
Current iteration=900, loss=0.3870726471404292
Current iteration=1000, loss=0.384448585082879
Current iteration=1100, loss=0.38224219253225494
Current iteration=1200, loss=0.3803636134428381
Current iteration=1300, loss=0.3787471520045637
Current iteration=1400, loss=0.37734335638474653
Current iteration=1500, loss=0.37611409169068233
Current iteration=1600, loss=0.37502940102244536
Current iteration=1700, loss=0.3740654414943691
Current iteration=1800, loss=0.3732030551617844
Current iteration=1900, loss=0.37242672971626406
Current iteration=2000, loss=0.3717238191525737

Current iteration=2100, loss=0.43422542639761086
Current iteration=2200, loss=0.43301793040350234
Current iteration=2300, loss=0.4319136040637999
Current iteration=2400, loss=0.43089987880208175
Current iteration=2500, loss=0.4299660683951964
Current iteration=2600, loss=0.42910306015132516
Current iteration=2700, loss=0.4283030420671444
Current iteration=2800, loss=0.4275592816924598
Current iteration=2900, loss=0.42686594843588493
Current iteration=3000, loss=0.42621796977353965
Current iteration=3100, loss=0.4256109139650078
Current iteration=3200, loss=0.4250408937200493
Current iteration=3300, loss=0.4245044865833009
Current iteration=3400, loss=0.4239986687641739
Current iteration=3500, loss=0.42352075985233156
Current iteration=3600, loss=0.42306837639912176
Current iteration=3700, loss=0.42263939275984735
Current iteration=3800, loss=0.422231907913164
Current iteration=3900, loss=0.4218442172251153
Current iteration=4000, loss=0.42147478832297713
Current iteration=4100, loss=0.

In [33]:
# Report the training loss returned for each of the four models, one per line.
print(
    f"train_loss_0 = {train_loss_0}",
    f"train_loss_1 = {train_loss_1}",
    f"train_loss_2 = {train_loss_2}",
    f"train_loss_3 = {train_loss_3}",
    sep="\n",
)

train_loss_0 = 0.3630714840319055
train_loss_1 = 0.4493370750762544
train_loss_2 = 0.3948771461716632
train_loss_3 = 0.41853680541110877


In [34]:
# Compute training accuracies
# Step 1: predict labels on each training split with the model fitted on it.
predict_train_0, predict_train_1, predict_train_2, predict_train_3 = (
    helpers.predict_logistic(features, weights)
    for features, weights in (
        (tx_train_0, w_0),
        (tx_train_1, w_1),
        (tx_train_2, w_2),
        (tx_train_3, w_3),
    )
)

# Step 2: recode -1 predictions as 0 (in place); the training labels had
# their -1 values recoded to 0 earlier, so both sides use the same coding.
predict_train_0[np.equal(predict_train_0, -1)] = 0
predict_train_1[np.equal(predict_train_1, -1)] = 0
predict_train_2[np.equal(predict_train_2, -1)] = 0
predict_train_3[np.equal(predict_train_3, -1)] = 0

# Step 3: score each split's predictions against its labels.
train_accuracy_0, train_accuracy_1, train_accuracy_2, train_accuracy_3 = (
    helpers.accuracy(predicted, actual)
    for predicted, actual in (
        (predict_train_0, y_0),
        (predict_train_1, y_1),
        (predict_train_2, y_2),
        (predict_train_3, y_3),
    )
)

# Step 4: report, one accuracy per line.
print(
    f"train_accuracy_0 = {train_accuracy_0}",
    f"train_accuracy_1 = {train_accuracy_1}",
    f"train_accuracy_2 = {train_accuracy_2}",
    f"train_accuracy_3 = {train_accuracy_3}",
    sep="\n",
)

train_accuracy_0 = 0.8397205568844895
train_accuracy_1 = 0.8004874651810585
train_accuracy_2 = 0.8342364874253161
train_accuracy_3 = 0.8206099981952716


In [35]:
# Load test data
# Same loader as for the training set, called with train=False; its three
# results are unpacked in the same order (tx_test, y_test, ids_test).
# NOTE(review): what y_test contains when train=False is not visible here --
# see helpers.load_data.
tx_test, y_test, ids_test = helpers.load_data('test.csv', train=False)

In [40]:
# Split the test data into the same 4 sets as the training data (groups 0..3
# of the 22nd feature), using the same helper. Each call is additionally given
# the miss_col_k value obtained from the matching training split; the fourth
# return value is discarded.
# NOTE(review): the previous comment here also mentioned "expand degree and add
# bias", which the training-split comment does not -- confirm against
# helpers.split_i which of the two descriptions is accurate.
(
    (tx_test_0, y_test_0, ids_test_0, _),
    (tx_test_1, y_test_1, ids_test_1, _),
    (tx_test_2, y_test_2, ids_test_2, _),
    (tx_test_3, y_test_3, ids_test_3, _),
) = (
    helpers.split_i(tx_test, y_test, ids_test, group, miss_col)
    for group, miss_col in enumerate((miss_col_0, miss_col_1, miss_col_2, miss_col_3))
)

In [44]:
# Predict labels: each test split is scored with the weights trained on the
# training split of the same group.
predict_test_0, predict_test_1, predict_test_2, predict_test_3 = (
    helpers.predict_logistic(features, weights)
    for features, weights in (
        (tx_test_0, w_0),
        (tx_test_1, w_1),
        (tx_test_2, w_2),
        (tx_test_3, w_3),
    )
)

In [47]:
# Quick look at the predicted labels of every test split, one array per line.
print(
    predict_test_0,
    predict_test_1,
    predict_test_2,
    predict_test_3,
    sep="\n",
)

[-1. -1. -1. ... -1. -1. -1.]
[-1. -1. -1. ... -1.  1.  1.]
[-1.  1. -1. ... -1. -1. -1.]
[-1.  1. -1. ... -1. -1. -1.]


In [45]:
# Concatenate sets
predict_test = np.concatenate((predict_test_0, predict_test_1, predict_test_2, predict_test_3))
ids_test = np.concatenate((ids_test_0, ids_test_1, ids_test_2, ids_test_3))

In [None]:
print(predict_test.shape)

In [50]:
# Create csv file
helpers.create_csv_submission(ids_test, predict_test, 'Predictions_Logistics_degree2_split4.csv')