In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import itertools
import matplotlib.pyplot as plt
import csv
import helpers as helpers
import implementations as impl

In [2]:
# Load train data
# load_data returns three values, bound here as features (tx_train),
# labels (y_train) and row ids (ids_train) — meaning inferred from the
# names; TODO confirm against helpers.load_data.
tx_train, y_train, ids_train = helpers.load_data('train.csv')

In [3]:
# Recode the label -1 as 0 (the encoding used for logistic regression here).
# The assignment happens in place on y_train.
is_negative = y_train == -1
y_train[is_negative] = 0

In [4]:
# Shuffle data
# NOTE(review): only y_train and tx_train are passed through shuffle_data;
# ids_train is left as loaded, so it presumably no longer lines up row-for-row
# with the shuffled arrays. The ids returned by the training split further
# down are discarded, so this looks harmless — confirm before reusing ids_train.
y_train, tx_train = helpers.shuffle_data(y_train, tx_train)

In [5]:
# Split and clean the training data into 4 subsets, one per value (0..3) of
# the 22nd feature. For each subset split_i returns four values; the third
# one (bound to `_`) is discarded, the fourth is kept as miss_col_<k> and is
# passed back to split_i when the test set is split.
(
    (tx_train_0, y_0, _, miss_col_0),
    (tx_train_1, y_1, _, miss_col_1),
    (tx_train_2, y_2, _, miss_col_2),
    (tx_train_3, y_3, _, miss_col_3),
) = (
    helpers.split_i(tx_train, y_train, ids_train, group)
    for group in range(4)
)

In [6]:
# Standardize each training subset on its own. The returned mean_<k> / std_<k>
# are kept because the matching test subset is standardized with them later.
(
    (tx_train_0, mean_0, std_0),
    (tx_train_1, mean_1, std_1),
    (tx_train_2, mean_2, std_2),
    (tx_train_3, mean_3, std_3),
) = (
    helpers.standardize(subset)
    for subset in (tx_train_0, tx_train_1, tx_train_2, tx_train_3)
)

In [7]:
# Expand every training subset to degree 2 (feature construction is done by
# helpers.build_poly_deg2); each name is rebound to its expanded matrix.
tx_train_0, tx_train_1, tx_train_2, tx_train_3 = (
    helpers.build_poly_deg2(subset)
    for subset in (tx_train_0, tx_train_1, tx_train_2, tx_train_3)
)

In [8]:
# Add bias to data: prepend a column of ones (one entry per row) to each
# training design matrix.
tx_train_0, tx_train_1, tx_train_2, tx_train_3 = (
    np.c_[np.ones((features.shape[0], 1)), features]
    for features in (tx_train_0, tx_train_1, tx_train_2, tx_train_3)
)

In [9]:
# Show the shape of each training design matrix, one per line
print(
    tx_train_0.shape,
    tx_train_1.shape,
    tx_train_2.shape,
    tx_train_3.shape,
    sep="\n",
)

(99913, 190)
(77544, 276)
(50379, 465)
(22164, 465)


In [30]:
# Show the column indices recorded by split_i for each training subset
print(miss_col_0, miss_col_1, miss_col_2, miss_col_3, sep="\n")

[ 4  5  6 12 23 24 25 26 27 28 22 29]
[ 4  5  6 12 26 27 28 22]
[22]
[22]


In [9]:
# Initialize the weights randomly according to a Gaussian distribution
# (mean 0, standard deviation 0.1), one weight per column of each design matrix.
#
# The draws come from a dedicated, explicitly seeded generator instead of the
# global NumPy random state: without a seed the starting point — and therefore
# the trained weights and the submission — changed on every kernel restart.
# Re-creating the generator inside this cell also makes the cell idempotent:
# re-running it yields the same initial weights.
INIT_SEED = 42
init_rng = np.random.default_rng(INIT_SEED)

initial_w_0 = init_rng.normal(0., 0.1, size=(tx_train_0.shape[1],))
initial_w_1 = init_rng.normal(0., 0.1, size=(tx_train_1.shape[1],))
initial_w_2 = init_rng.normal(0., 0.1, size=(tx_train_2.shape[1],))
initial_w_3 = init_rng.normal(0., 0.1, size=(tx_train_3.shape[1],))

# Train models: one regularized logistic regression per subset.
# The third positional argument and max_iters are shared by all four fits;
# the third argument is presumably the regularization strength — TODO confirm
# against impl.reg_logistic_regression. gamma differs per subset (0.1 for
# subsets 0 and 3, 0.01 for subsets 1 and 2).
# The calls are kept as separate statements so that an interrupted run still
# leaves the already-finished fits bound.
lambda_ = 0.001
n_iters = 15000

w_0, train_loss_0 = impl.reg_logistic_regression(
    y_0, tx_train_0, lambda_, initial_w_0, max_iters=n_iters, gamma=0.1
)
w_1, train_loss_1 = impl.reg_logistic_regression(
    y_1, tx_train_1, lambda_, initial_w_1, max_iters=n_iters, gamma=0.01
)
w_2, train_loss_2 = impl.reg_logistic_regression(
    y_2, tx_train_2, lambda_, initial_w_2, max_iters=n_iters, gamma=0.01
)
w_3, train_loss_3 = impl.reg_logistic_regression(
    y_3, tx_train_3, lambda_, initial_w_3, max_iters=n_iters, gamma=0.1
)

Current iteration=0, loss=0.930253206858558
Current iteration=100, loss=0.382231648815959
Current iteration=200, loss=0.37158384357855984
Current iteration=300, loss=0.3675310408997164
Current iteration=400, loss=0.36522891670223256
Current iteration=500, loss=0.3636809580723967
Current iteration=600, loss=0.36259163333904465
Current iteration=700, loss=0.361790686094909
Current iteration=800, loss=0.3611685286083176
Current iteration=900, loss=0.3607287349906466
Current iteration=1000, loss=0.3602406415502376
Current iteration=1100, loss=0.3598907625569575
Current iteration=1200, loss=0.3596794877141411
Current iteration=1300, loss=0.3593247356949532
Current iteration=1400, loss=0.35910474867285186
Current iteration=1500, loss=0.3590077399336714
Current iteration=1600, loss=0.3587252034891172
Current iteration=1700, loss=0.3585795133420588
Current iteration=1800, loss=0.35854627005205375
Current iteration=1900, loss=0.3583070581577359
Current iteration=2000, loss=0.3582075242080674
Cu

Current iteration=2000, loss=0.45590303776235325
Current iteration=2100, loss=0.4553047625405546
Current iteration=2200, loss=0.45476778781629074
Current iteration=2300, loss=0.45428306718982636
Current iteration=2400, loss=0.45384318106739274
Current iteration=2500, loss=0.45344200049436206
Current iteration=2600, loss=0.45307442906011614
Current iteration=2700, loss=0.4527362028150259
Current iteration=2800, loss=0.45242373370720596
Current iteration=2900, loss=0.45213398600724947
Current iteration=3000, loss=0.4518643780000884
Current iteration=3100, loss=0.4516127032222897
Current iteration=3200, loss=0.45137706695507634
Current iteration=3300, loss=0.45115583472038906
Current iteration=3400, loss=0.45094759028801
Current iteration=3500, loss=0.45075110126704265
Current iteration=3600, loss=0.45056529078014906
Current iteration=3700, loss=0.4503892140420168
Current iteration=3800, loss=0.4502220389112773
Current iteration=3900, loss=0.4500630296766027
Current iteration=4000, loss=0

Current iteration=3900, loss=0.3967613582477023
Current iteration=4000, loss=0.39643593135831745
Current iteration=4100, loss=0.3961270174027704
Current iteration=4200, loss=0.39583340617028484
Current iteration=4300, loss=0.39555400060010565
Current iteration=4400, loss=0.39528780404831976
Current iteration=4500, loss=0.3950339092136641
Current iteration=4600, loss=0.3947914884804651
Current iteration=4700, loss=0.394559785474835
Current iteration=4800, loss=0.3943381076621795
Current iteration=4900, loss=0.3941258198407918
Current iteration=5000, loss=0.39392233840863355
Current iteration=5100, loss=0.3937271262990464
Current iteration=5200, loss=0.39353968849672677
Current iteration=5300, loss=0.393359568058357
Current iteration=5400, loss=0.3931863425732409
Current iteration=5500, loss=0.3930196210085223
Current iteration=5600, loss=0.3928590408913437
Current iteration=5700, loss=0.39270426578689177
Current iteration=5800, loss=0.3925549830368618
Current iteration=5900, loss=0.3924

Current iteration=5800, loss=0.41833832367840607
Current iteration=5900, loss=0.4181445751152472
Current iteration=6000, loss=0.41795630821398705
Current iteration=6100, loss=0.41777326411965177
Current iteration=6200, loss=0.41759520087261565
Current iteration=6300, loss=0.41742189204174346
Current iteration=6400, loss=0.4172531254873546
Current iteration=6500, loss=0.4170887022399245
Current iteration=6600, loss=0.4169284354821603
Current iteration=6700, loss=0.41677214962356374
Current iteration=6800, loss=0.4166196794578858
Current iteration=6900, loss=0.41647086939499317
Current iteration=7000, loss=0.41632557275964016
Current iteration=7100, loss=0.41618365115048855
Current iteration=7200, loss=0.4160449738534642
Current iteration=7300, loss=0.4159094173041844
Current iteration=7400, loss=0.41577686459476965
Current iteration=7500, loss=0.41564720502084845
Current iteration=7600, loss=0.41552033366501295
Current iteration=7700, loss=0.4153961510133714
Current iteration=7800, loss

In [10]:
# Report the loss value returned by each of the four training runs
print(
    f"train_loss_0 = {train_loss_0}",
    f"train_loss_1 = {train_loss_1}",
    f"train_loss_2 = {train_loss_2}",
    f"train_loss_3 = {train_loss_3}",
    sep="\n",
)

train_loss_0 = 0.35681458766582386
train_loss_1 = 0.4448985635255296
train_loss_2 = 0.38733970437420373
train_loss_3 = 0.41021735934726467


In [11]:
# Compute training accuracies
# 1) Predict on each training subset with its own weight vector.
predict_train_0, predict_train_1, predict_train_2, predict_train_3 = (
    helpers.predict_logistic(features, weights)
    for features, weights in (
        (tx_train_0, w_0),
        (tx_train_1, w_1),
        (tx_train_2, w_2),
        (tx_train_3, w_3),
    )
)

# 2) The training labels were recoded so that -1 became 0; apply the same
#    recoding (in place) to the predictions before comparing them.
for subset_predictions in (predict_train_0, predict_train_1, predict_train_2, predict_train_3):
    subset_predictions[subset_predictions == -1] = 0

# 3) Stack the predictions in subset order 0..3 for the overall score.
predict_train = np.concatenate([predict_train_0, predict_train_1, predict_train_2, predict_train_3])

# 4) Accuracy per subset, then over all subsets (labels stacked in the same order).
train_accuracy_0, train_accuracy_1, train_accuracy_2, train_accuracy_3 = (
    helpers.accuracy(predicted, expected)
    for predicted, expected in (
        (predict_train_0, y_0),
        (predict_train_1, y_1),
        (predict_train_2, y_2),
        (predict_train_3, y_3),
    )
)
train_accuracy = helpers.accuracy(predict_train, np.concatenate([y_0, y_1, y_2, y_3]))

print(
    f"train_accuracy_0 = {train_accuracy_0}",
    f"train_accuracy_1 = {train_accuracy_1}",
    f"train_accuracy_2 = {train_accuracy_2}",
    f"train_accuracy_3 = {train_accuracy_3}",
    f"train_accuracy = {train_accuracy}",
    sep="\n",
)

train_accuracy_0 = 0.8421226467026313
train_accuracy_1 = 0.800448777468276
train_accuracy_2 = 0.8378888028742134
train_accuracy_3 = 0.8253474102147627
train_accuracy = 0.826856


In [12]:
# Load test data
# Same loader as for the training file, called with train=False; what that
# flag changes is defined in helpers.load_data — TODO confirm.
tx_test, y_test, ids_test = helpers.load_data('test.csv', train=False)

In [13]:
# Split and clean the test data into 4 subsets, one per value (0..3) of the
# 22nd feature. Unlike the training split, each call also receives the
# miss_col_<k> array returned by the training split of the same subset, and
# the ids of each subset are kept for the submission file.
(
    (tx_test_0, y_test_0, ids_test_0, _),
    (tx_test_1, y_test_1, ids_test_1, _),
    (tx_test_2, y_test_2, ids_test_2, _),
    (tx_test_3, y_test_3, ids_test_3, _),
) = (
    helpers.split_i(tx_test, y_test, ids_test, group, train_cols)
    for group, train_cols in enumerate((miss_col_0, miss_col_1, miss_col_2, miss_col_3))
)

In [14]:
# Standardize each test subset, passing in the mean/std obtained from the
# matching training subset; the statistics returned by these calls are discarded.
(
    (tx_test_0, _, _),
    (tx_test_1, _, _),
    (tx_test_2, _, _),
    (tx_test_3, _, _),
) = (
    helpers.standardize(subset, train_mean, train_std)
    for subset, train_mean, train_std in (
        (tx_test_0, mean_0, std_0),
        (tx_test_1, mean_1, std_1),
        (tx_test_2, mean_2, std_2),
        (tx_test_3, mean_3, std_3),
    )
)

In [15]:
# Expand every test subset to degree 2 with the same helper that was applied
# to the training subsets.
tx_test_0, tx_test_1, tx_test_2, tx_test_3 = (
    helpers.build_poly_deg2(subset)
    for subset in (tx_test_0, tx_test_1, tx_test_2, tx_test_3)
)

In [16]:
# Add bias to data: prepend a column of ones (one entry per row) to each
# test design matrix, mirroring the training matrices.
tx_test_0, tx_test_1, tx_test_2, tx_test_3 = (
    np.c_[np.ones((features.shape[0], 1)), features]
    for features in (tx_test_0, tx_test_1, tx_test_2, tx_test_3)
)

In [17]:
# Predict labels for each test subset with the weights trained on the
# matching training subset.
predict_test_0, predict_test_1, predict_test_2, predict_test_3 = (
    helpers.predict_logistic(features, weights)
    for features, weights in (
        (tx_test_0, w_0),
        (tx_test_1, w_1),
        (tx_test_2, w_2),
        (tx_test_3, w_3),
    )
)

In [18]:
# Peek at the predicted labels of each test subset, one array per line
print(predict_test_0, predict_test_1, predict_test_2, predict_test_3, sep="\n")

[-1. -1.  1. ... -1.  1. -1.]
[-1. -1. -1. ...  1.  1. -1.]
[-1.  1. -1. ... -1.  1.  1.]
[-1. -1. -1. ...  1. -1. -1.]


In [19]:
# Concatenate sets: predictions and ids are stacked in the same subset order
# (0, 1, 2, 3) so that entry i of one matches entry i of the other.
# NOTE(review): this rebinds ids_test to the regrouped ids. Re-running the
# test-split cell after this one would pass the regrouped ids together with
# the original tx_test — restart the kernel and run all cells instead.
predict_test = np.concatenate(
    [predict_test_0, predict_test_1, predict_test_2, predict_test_3]
)
ids_test = np.concatenate(
    [ids_test_0, ids_test_1, ids_test_2, ids_test_3]
)

In [20]:
# Sanity check: shape of the concatenated test predictions
print(predict_test.shape)

(568238,)


In [21]:
# Create csv file
# predict_test still holds the labels exactly as returned by predict_logistic
# (the -1 -> 0 recoding was only applied to the training predictions).
helpers.create_csv_submission(ids_test, predict_test, 'Predictions_Reg_Logistic.csv')