In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import itertools
import matplotlib.pyplot as plt
import csv
import helpers as helpers
import implementations as impl

In [2]:
# Load data
# helpers.load_data returns three values; they are unpacked here as the raw
# training features, the labels and the sample ids (names follow how each one
# is used in the cells below).
train_data, train_labels, ids = helpers.load_data('train.csv')

In [3]:
# Preprocess the training features in two steps: clean first, then standardize
# the cleaned result.
new_data = helpers.clean_data(train_data)
new_data = helpers.standardize(new_data)
# Recode the labels in place: every -1 becomes 0, all other values are untouched.
train_labels[train_labels == -1] = 0

In [4]:
# Shuffle data
# Labels and features are passed to the helper together and come back in the
# same (labels, features) order; both names are rebound to the shuffled result.
# Presumably the same permutation is applied to both so rows stay aligned --
# confirm in helpers.shuffle_data.
train_labels, new_data = helpers.shuffle_data(train_labels, new_data)

In [5]:
# Slice into training and validation sets
# The helper's four return values are unpacked as validation labels, training
# labels, validation features, training features (in that order).
# NOTE(review): 0.25 is presumably the split ratio; which side it applies to is
# decided inside helpers.slice_data -- confirm there.
y_validation, y_train, tx_validation, tx_train = helpers.slice_data(train_labels, new_data, 0.25)

In [6]:
# Prepend a constant column of ones to each feature matrix (one entry per
# label), so the model gets a bias term as its first input column.
# NOTE: tx_train / tx_validation are rebound here, so running this cell a second
# time without re-running the split cell stacks another column of ones.
tx_train = np.column_stack((np.ones((y_train.shape[0], 1)), tx_train))
tx_validation = np.column_stack((np.ones((y_validation.shape[0], 1)), tx_validation))

In [8]:
# Initialize the weights randomly according to a Gaussian distribution
# (mean 0, standard deviation 0.1, one weight per column of tx_train).
# A dedicated, seeded generator is used so the starting point is the same on
# every "Restart & Run All"; it does not touch NumPy's global random state.
WEIGHT_INIT_SEED = 42
weight_rng = np.random.RandomState(WEIGHT_INIT_SEED)
initial_w = weight_rng.normal(0., 0.1, [tx_train.shape[1],])

# Train model
# max_iters is the iteration budget and gamma the step size handed to the
# project's logistic_regression; it returns the final weights and a loss value.
# NOTE(review): in the recorded run the loss was still decreasing at iteration
# 900, so this budget / step size had not fully converged.
trained_weights, train_loss = impl.logistic_regression(y_train, tx_train, initial_w, max_iters=1000, gamma=0.01)

Current iteration=0, loss=0.7163567950814637
Current iteration=100, loss=0.6227581517924388
Current iteration=200, loss=0.584904560206431
Current iteration=300, loss=0.5641307100362333
Current iteration=400, loss=0.5512503018068267
Current iteration=500, loss=0.5426562963788715
Current iteration=600, loss=0.5366105979203447
Current iteration=700, loss=0.5321775687371848
Current iteration=800, loss=0.5288137652729488
Current iteration=900, loss=0.526185117200647


In [9]:
# Inspect the learned weight vector. Index 0 presumably corresponds to the
# column of ones that was prepended to tx_train (i.e. the bias) -- confirm
# against impl.logistic_regression.
print(trained_weights)

[-0.63052585 -0.04999879 -0.5173932  -0.08092778  0.00691948  0.09996688
  0.14235612 -0.07642688  0.26022932 -0.12728135  0.23888175 -0.23496718
  0.31161957  0.15073612  0.29079516  0.01716182 -0.00413932  0.10489579
 -0.02860195  0.01493111  0.05025261 -0.01344014 -0.03807062 -0.09740716
 -0.05989901  0.015122   -0.00314341 -0.15312257 -0.00116578 -0.00306518
  0.09697944]


In [13]:
# Hold-out validation: score the trained weights on the training split and on
# the held-out validation split.
# Any -1 prediction is mapped to 0 before scoring, because the labels were
# recoded from -1 to 0 earlier in the notebook.

# Training split
predict_train = helpers.predict_logistic(tx_train, trained_weights)
predict_train[predict_train == -1] = 0
train_accuracy = helpers.accuracy(predict_train, y_train)

# Validation split
predict_validation = helpers.predict_logistic(tx_validation, trained_weights)
predict_validation[predict_validation == -1] = 0
validation_accuracy = helpers.accuracy(predict_validation, y_validation)

print(f"train_accuracy = {train_accuracy}")
print(f"validation_accuracy = {validation_accuracy}")

train_accuracy = 0.7350986666666667
validation_accuracy = 0.735232


In [15]:
# Eyeball check: validation predictions on the first line, true validation
# labels on the second.
print(predict_validation, y_validation, sep="\n")

[0. 0. 0. ... 1. 0. 0.]
[1. 0. 0. ... 1. 0. 0.]


In [16]:
# Load the test file (train=False) and run the features through the same two
# preprocessing helpers as the training data: clean, then standardize.
# NOTE(review): standardize is called on the test features on their own;
# whether it reuses the training statistics depends on helpers.standardize --
# confirm.
tx_test, y_test, ids_test = helpers.load_data('test.csv', train=False)
tx_test = helpers.clean_data(tx_test)
tx_test = helpers.standardize(tx_test)

KeyboardInterrupt: 

In [None]:
# Prepend the column of ones to the test features and predict with the trained
# weights.
# The row count is taken from tx_test itself (not from y_test), and the column
# is only added while tx_test is still one column short of the weight vector,
# so re-running this cell does not stack a second column of ones.
if tx_test.shape[1] == trained_weights.shape[0] - 1:
    tx_test = np.c_[np.ones((tx_test.shape[0], 1)), tx_test]

# Fail with a readable message instead of a shape error inside the helper.
if tx_test.shape[1] != trained_weights.shape[0]:
    raise ValueError(
        f"tx_test has {tx_test.shape[1]} columns but trained_weights has "
        f"{trained_weights.shape[0]} entries"
    )

predict_test = helpers.predict_logistic(tx_test, trained_weights)
print(predict_test)

In [None]:
# Hand the test ids and predictions to the submission helper, targeting
# 'Predictions_Logistics.csv'.
# NOTE(review): predict_test is passed exactly as returned by predict_logistic
# (no -1 -> 0 remap as in the validation cell); confirm this is the label
# encoding the submission format expects.
helpers.create_csv_submission(ids_test, predict_test, 'Predictions_Logistics.csv')