In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import itertools
import matplotlib.pyplot as plt
import csv
import helpers as helpers
import implementations as impl

In [2]:
# Load train data
tx_train, y_train, ids_train = helpers.load_data('train.csv')

In [3]:
# Refactor the -1 in 0 value for logistic regression
y_train[y_train==-1]=0

In [4]:
# Shuffle data
y_train, tx_train = helpers.shuffle_data(y_train, tx_train)

In [5]:
# Split and clean data into 4 sets according to 22nd feature
tx_train_0, y_0, _, miss_col_0 = helpers.split_i(tx_train, y_train, ids_train, 0)
tx_train_1, y_1, _, miss_col_1 = helpers.split_i(tx_train, y_train, ids_train, 1)
tx_train_2, y_2, _, miss_col_2 = helpers.split_i(tx_train, y_train, ids_train, 2)
tx_train_3, y_3, _, miss_col_3 = helpers.split_i(tx_train, y_train, ids_train, 3)

In [6]:
#Standardize the data
tx_train_0, mean_0, std_0 = helpers.standardize(tx_train_0)
tx_train_1, mean_1, std_1 = helpers.standardize(tx_train_1)
tx_train_2, mean_2, std_2 = helpers.standardize(tx_train_2)
tx_train_3, mean_3, std_3 = helpers.standardize(tx_train_3)

In [7]:
# Expand to degree 2
tx_train_0 = helpers.build_poly_deg2(tx_train_0)
tx_train_1 = helpers.build_poly_deg2(tx_train_1)
tx_train_2 = helpers.build_poly_deg2(tx_train_2)
tx_train_3 = helpers.build_poly_deg2(tx_train_3)

In [8]:
# Add bias to data
tx_train_0 = np.c_[np.ones((tx_train_0.shape[0], 1)), tx_train_0]
tx_train_1 = np.c_[np.ones((tx_train_1.shape[0], 1)), tx_train_1]
tx_train_2 = np.c_[np.ones((tx_train_2.shape[0], 1)), tx_train_2]
tx_train_3 = np.c_[np.ones((tx_train_3.shape[0], 1)), tx_train_3]

In [9]:
print(tx_train_0.shape)
print(tx_train_1.shape)
print(tx_train_2.shape)
print(tx_train_3.shape)

(99913, 190)
(77544, 276)
(50379, 465)
(22164, 465)


In [21]:
def mean_squared_error_gd(y, tx, w_init, max_iters, gamma):
    """Training function to compute mean squared error solution using gradient descent
    
    Args:
        y (np.array): Labels of shape (N,), N is the number of samples.
        tx (np.array): Dataset of shape (N,D), D is the number of features.
        w_init (np.array): Initial weights of shape (D,)
        max_iters (integer): Maximum number of iterations.
        gamma (float): Step size
        
    Returns:
        w (np.array): optimal weights, numpy array of shape(D,), D is the number of features.
        loss (float): Final value of the cost function.
    """
    w = w_init.copy()
    for n in range (max_iters):
        loss = np.sum(np.square(tx.dot(w) - y, dtype=float), dtype=float)/(2*tx.shape[0])
        gradient = np.dot(tx.dot(w) - y, tx)/tx.shape[0]
        print(gradient.mean())
        w += gamma * gradient
        # Log info
        if n % 100 == 0:
            print(f"Current iteration={n}, loss={loss}")
    loss = np.sum(np.square(tx.dot(w) - y))/(2*tx.shape[0])

    return w, loss

In [22]:
# Initialize the weights randomly according to a Gaussian distribution
initial_w_0 = np.random.normal(0., 0.1, [tx_train_0.shape[1],])
initial_w_1 = np.random.normal(0., 0.1, [tx_train_1.shape[1],])
initial_w_2 = np.random.normal(0., 0.1, [tx_train_2.shape[1],])
initial_w_3 = np.random.normal(0., 0.1, [tx_train_3.shape[1],])

# Train models
#w_0, train_loss_0 = impl.logistic_regression(y_0, tx_train_0, initial_w_0, max_iters=10000, gamma=0.1)
#w_1, train_loss_1 = impl.logistic_regression(y_1, tx_train_1, initial_w_1, max_iters=10000, gamma=0.01)
#w_2, train_loss_2 = impl.logistic_regression(y_2, tx_train_2, initial_w_2, max_iters=10000, gamma=0.01)
#w_3, train_loss_3 = impl.logistic_regression(y_3, tx_train_3, initial_w_3, max_iters=10000, gamma=0.01)
w_0, train_loss_0 = mean_squared_error_gd(y_0, tx_train_0, initial_w_0, max_iters=15000, gamma=0.1)
w_1, train_loss_1 = impl.mean_squared_error_gd(y_1, tx_train_1, initial_w_1, max_iters=15000, gamma=0.01)
w_2, train_loss_2 = impl.mean_squared_error_gd(y_2, tx_train_2, initial_w_2, max_iters=15000, gamma=0.01)
w_3, train_loss_3 = impl.mean_squared_error_gd(y_3, tx_train_3, initial_w_3, max_iters=15000, gamma=0.01)

-74.05522338571619
Current iteration=0, loss=411.76001092343887
293399.04547187936
-1156020536.8112617
4554374293578.085
-1.7942842337294204e+16
7.068931171597252e+19
-2.784942706038226e+23
1.0971822596098242e+27
-4.3225625726244665e+30
1.7029574649610666e+34
-6.709131629078555e+37
2.643192689333022e+41
-1.0413370878972057e+45
4.102549674136178e+48
-1.616279111189807e+52
6.367645422400747e+55
-2.5086575669206566e+59
9.88334363896714e+62
-3.8937351503822215e+66
1.534012574605415e+70
-6.043540426257617e+73
2.380970109922683e+77
-9.380294106604876e+80
3.6955490184319067e+84
-1.455933299363916e+88
5.735931959295529e+91
-2.2597817809402506e+95
8.90284915111973e+98
-3.507450306755539e+102
1.3818281592261122e+106
-5.443980369308496e+109
2.1447617826817303e+113
-8.449705532344571e+116
3.3289255785815994e+120
-1.3114948758054445e+124
5.166888741312101e+127
-2.035596155013748e+131
8.019626343365866e+134
-3.1594875304121145e+138
1.244741466425984e+142
-4.9039007222738717e+145
1.9319869179716216e+

  loss = np.sum(np.square(tx.dot(w) - y, dtype=float), dtype=float)/(2*tx.shape[0])


-4.417400626478098e+181
1.7403207579288123e+185
-6.85633157727108e+188
2.70118496738678e+192
-1.064184271983588e+196
4.192560592519713e+199
-1.6517406604013806e+203
6.507353081767882e+206
-2.5636981122995275e+210
1.0100186555764188e+214
-3.979164628308659e+217
1.567669176382308e+221
-6.1761371447046535e+224
2.4332091620392877e+228
-9.586099996675264e+231
3.7766302453522244e+235
-1.487876823218623e+239
5.861779674607035e+242
-2.309361932213356e+246
9.098179784987645e+249
-3.584404603076721e+253
1.412145798630761e+257
-5.563422597099737e+260
2.191818367758573e+264
-8.635094087133068e+267
3.401962999785103e+271
-1.3402693861960294e+275
5.280251512693626e+278
-2.0802576201815176e+282
8.195578858166407e+285
-3.2288074404247763e+289
1.2720513910929325e+293
-5.011493473790351e+296
1.974375171766036e+300
nan
nan
nan


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
Current iteration=100, loss=nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
Current iteration=200, loss=nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
Current iteration=300, loss=nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan


KeyboardInterrupt: 

In [None]:
print(f"train_loss_0 = {train_loss_0}")
print(f"train_loss_1 = {train_loss_1}")
print(f"train_loss_2 = {train_loss_2}")
print(f"train_loss_3 = {train_loss_3}")

In [None]:
def predict(tx, w):
    y = tx.dot(w)
    y[y < 0.5] = -1
    y[y >= 0.5] = 1
    return y

In [None]:
# Compute training accuracies
#predict_train_0 = helpers.predict_logistic(tx_train_0, w_0)
#predict_train_1 = helpers.predict_logistic(tx_train_1, w_1)
#predict_train_2 = helpers.predict_logistic(tx_train_2, w_2)
#predict_train_3 = helpers.predict_logistic(tx_train_3, w_3)
predict_train_0 = predict(tx_train_0, w_0)
predict_train_1 = predict(tx_train_1, w_1)
predict_train_2 = predict(tx_train_2, w_2)
predict_train_3 = predict(tx_train_3, w_3)

predict_train_0[predict_train_0 == -1] = 0
predict_train_1[predict_train_1 == -1] = 0
predict_train_2[predict_train_2 == -1] = 0
predict_train_3[predict_train_3 == -1] = 0

predict_train = np.concatenate((predict_train_0, predict_train_1, predict_train_2, predict_train_3))

train_accuracy_0 = helpers.accuracy(predict_train_0, y_0)
train_accuracy_1 = helpers.accuracy(predict_train_1, y_1)
train_accuracy_2 = helpers.accuracy(predict_train_2, y_2)
train_accuracy_3 = helpers.accuracy(predict_train_3, y_3)
train_accuracy = helpers.accuracy(predict_train, np.concatenate((y_0, y_1, y_2, y_3)))

print(f"train_accuracy_0 = {train_accuracy_0}")
print(f"train_accuracy_1 = {train_accuracy_1}")
print(f"train_accuracy_2 = {train_accuracy_2}")
print(f"train_accuracy_3 = {train_accuracy_3}")
print(f"train_accuracy = {train_accuracy}")

In [None]:
# Load test data
tx_test, y_test, ids_test = helpers.load_data('test.csv', train=False)

In [None]:
# Split and clean data into 4 sets according to 22nd feature
tx_test_0, y_test_0, ids_test_0, _ = helpers.split_i(tx_test, y_test, ids_test, 0, miss_col_0)
tx_test_1, y_test_1, ids_test_1, _ = helpers.split_i(tx_test, y_test, ids_test, 1, miss_col_1)
tx_test_2, y_test_2, ids_test_2, _ = helpers.split_i(tx_test, y_test, ids_test, 2, miss_col_2)
tx_test_3, y_test_3, ids_test_3, _ = helpers.split_i(tx_test, y_test, ids_test, 3, miss_col_3)

In [None]:
#Standardize the data
tx_test_0, _, _ = helpers.standardize(tx_test_0, mean_0, std_0)
tx_test_1, _, _ = helpers.standardize(tx_test_1, mean_1, std_1)
tx_test_2, _, _ = helpers.standardize(tx_test_2, mean_2, std_2)
tx_test_3, _, _ = helpers.standardize(tx_test_3, mean_3, std_3)

In [None]:
# Expand to degree 2
tx_test_0 = helpers.build_poly_deg2(tx_test_0)
tx_test_1 = helpers.build_poly_deg2(tx_test_1)
tx_test_2 = helpers.build_poly_deg2(tx_test_2)
tx_test_3 = helpers.build_poly_deg2(tx_test_3)

In [None]:
# Add bias to data
tx_test_0 = np.c_[np.ones((tx_test_0.shape[0], 1)), tx_test_0]
tx_test_1 = np.c_[np.ones((tx_test_1.shape[0], 1)), tx_test_1]
tx_test_2 = np.c_[np.ones((tx_test_2.shape[0], 1)), tx_test_2]
tx_test_3 = np.c_[np.ones((tx_test_3.shape[0], 1)), tx_test_3]

In [None]:
# Predict labels
#predict_test_0 = helpers.predict_logistic(tx_test_0, w_0)
#predict_test_1 = helpers.predict_logistic(tx_test_1, w_1)
#predict_test_2 = helpers.predict_logistic(tx_test_2, w_2)
#predict_test_3 = helpers.predict_logistic(tx_test_3, w_3)
predict_test_0 = predict(tx_test_0, w_0)
predict_test_1 = predict(tx_test_1, w_1)
predict_test_2 = predict(tx_test_2, w_2)
predict_test_3 = predict(tx_test_3, w_3)

In [None]:
print(predict_test_0)
print(predict_test_1)
print(predict_test_2)
print(predict_test_3)

In [None]:
# Concatenate sets
predict_test = np.concatenate((predict_test_0, predict_test_1, predict_test_2, predict_test_3))
ids_test = np.concatenate((ids_test_0, ids_test_1, ids_test_2, ids_test_3))

In [None]:
print(predict_test.shape)

In [None]:
# Create csv file
helpers.create_csv_submission(ids_test, predict_test, 'Predictions_Ridge_Regression.csv')