# 003: Preprocessing the data

In [1]:
import os
import sys

import numpy as np

sys.path.append("../")
import helpers

%load_ext autoreload
%autoreload 2

In [32]:
# Load the feature matrices, the training labels and the sample ids.
# NOTE(review): sub_sample=False presumably loads every row rather than a
# subset - confirm against helpers.load_csv_data.
(x_train, x_test, y_train, train_ids, test_ids) = helpers.load_csv_data(
    "../data/dataset",
    sub_sample=False,
)

In [34]:
# Read the per-column missing-value codes: line i of the file holds the
# comma-separated integer codes that mark a missing entry in feature column i.
# Surrounding whitespace and double quotes are dropped; a blank line means the
# column has no missing-value codes.
with open("../data/missing_values.txt", "r") as codes_file:
    cleaned_rows = (raw.strip().strip('"') for raw in codes_file)
    missing_values = [
        [int(token) for token in row.split(",")] if row else []
        for row in cleaned_rows
    ]

In [35]:
# Overwrite every missing-value code with NaN, column by column, in both the
# training and the test matrix (the arrays are modified in place).
for col, codes in enumerate(missing_values):
    if not codes:
        # No codes for this column: nothing to replace, and the column is
        # never indexed.
        continue
    x_train[np.isin(x_train[:, col], codes), col] = np.nan
    x_test[np.isin(x_test[:, col], codes), col] = np.nan

In [36]:
# Kick out features with no variance.
# The mask is computed on the training set only and applied to both matrices,
# so train and test always keep exactly the same columns.
# np.nanstd ignores NaN entries; np.std would return NaN for every column that
# contains a missing value, and `NaN > 0` is False, so all of those columns
# would be dropped before they could be imputed.
stds = np.nanstd(x_train, axis=0)
informative_cols = stds > 0  # an all-NaN column yields NaN here and is dropped
x_train = x_train[:, informative_cols]
x_test = x_test[:, informative_cols]

In [37]:
# Missing value imputation
def impute_missing_values(x, col_means=None):
    """Replace every NaN in ``x`` with a per-column fill value, in place.

    Args:
        x: 2-D float array. It is modified in place.
        col_means: Optional 1-D sequence with one fill value per column of
            ``x``. When omitted, the NaN-ignoring mean of each column of ``x``
            itself is used. Pass the means of the training set to fill a
            held-out set with training statistics.

    Returns:
        The same array object ``x`` with its NaN entries filled.

    Raises:
        ValueError: If ``col_means`` is given but does not hold exactly one
            value per column of ``x``.
    """
    if col_means is None:
        col_means = np.nanmean(x, axis=0)
    else:
        col_means = np.asarray(col_means)
        if col_means.shape != (x.shape[1],):
            raise ValueError(
                f"col_means must have shape ({x.shape[1]},), got {col_means.shape}"
            )
    nan_rows, nan_cols = np.where(np.isnan(x))
    # Each NaN receives the fill value of the column it sits in.
    x[nan_rows, nan_cols] = col_means[nan_cols]
    return x
# The fill values are learned on the training set only and reused for the test
# set, so both matrices are imputed with the same statistics and nothing is
# estimated from the test data. The means are taken before x_train is imputed.
train_col_means = np.nanmean(x_train, axis=0)
x_train = impute_missing_values(x_train)
test_nan_rows, test_nan_cols = np.nonzero(np.isnan(x_test))
x_test[test_nan_rows, test_nan_cols] = train_col_means[test_nan_cols]

In [None]:
# Standardise both matrices with the mean and standard deviation of the
# training set. Scaling the test set with its own statistics would put the two
# matrices on different scales and use information from the test data.
train_mean = x_train.mean(axis=0)
train_std = x_train.std(axis=0)
x_train = (x_train - train_mean) / train_std
x_test = (x_test - train_mean) / train_std

In [27]:
# Convert labels to 0/1.
# For -1/+1 labels this gives the same result as (y_train + 1) / 2, but it is
# safe to re-run: labels that are already 0/1 stay 0/1 instead of turning into
# 0.5/1.
y_train = np.where(y_train > 0, 1.0, 0.0)

In [42]:
# Write the preprocessed arrays to disk. The output directory is created first
# because np.savez raises if the target directory does not exist.
os.makedirs("../data/dataset_prep", exist_ok=True)
np.savez("../data/dataset_prep/train.npz", x_train=x_train, y_train=y_train)
np.savez("../data/dataset_prep/test.npz", x_test=x_test)