# 003: Preprocessing the data

In [10]:
import sys

import numpy as np

sys.path.append("../")
import helpers

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
# Load the dataset through the project helper. load_csv_data returns five
# values, which are bound here in the order the helper yields them.
# NOTE(review): sub_sample=False presumably loads every row instead of a
# subsample -- confirm against helpers.load_csv_data.
(
    x_train,
    x_test,
    y_train,
    train_ids,
    test_ids,
) = helpers.load_csv_data("../data/dataset", sub_sample=False)

In [18]:
# Read the missing-value codes. Every line of the file is stripped of
# surrounding whitespace and double quotes; what remains is a comma-separated
# list of integers. The line index is later used as the column index, and an
# empty line yields an empty list (no codes for that column).
with open("../data/missing_values.txt", "r") as fh:
    cleaned_entries = [raw.strip().strip('"') for raw in fh]

missing_values = [
    [int(token.strip()) for token in entry.split(",")] if entry != "" else []
    for entry in cleaned_entries
]

# Overwrite every occurrence of a missing-value code with np.nan, column by
# column, in both splits. The arrays are modified in place.
# NOTE(review): assumes the i-th line of the file describes column i of
# x_train / x_test -- confirm against how helpers.load_csv_data orders columns.
for col, codes in enumerate(missing_values):
    for code in codes:
        for split in (x_train, x_test):
            split[split[:, col] == code, col] = np.nan

In [23]:
# Missing value imputation: column-wise mean imputation
def impute_missing_values(x):
    """Fill every NaN in ``x`` with the mean of the column it sits in.

    The column means are computed with ``np.nanmean`` over ``x`` itself, so
    NaN entries do not contribute to the mean. ``x`` is modified in place and
    the same array object is returned.

    Parameters
    ----------
    x : np.ndarray
        Array whose NaN entries are replaced; the second index is treated as
        the column index.

    Returns
    -------
    np.ndarray
        ``x`` itself, after filling.

    Notes
    -----
    ``np.nanmean`` returns NaN (and emits a RuntimeWarning) for a column that
    contains only NaN, so such a column is left as NaN.
    """
    nan_mask = np.isnan(x)
    column_means = np.nanmean(x, axis=0)
    # Column index of every NaN entry, in the same row-major order in which
    # the boolean-mask assignment below visits them.
    nan_columns = np.nonzero(nan_mask)[1]
    x[nan_mask] = column_means.take(nan_columns)
    return x
# Fill values are learned on the training set only and reused for the test
# set. Imputing the test set with its own column means would give the two
# splits different fill values, use test-set statistics during preprocessing,
# and leave a test column that is entirely NaN unfilled.
train_col_means = np.nanmean(x_train, axis=0)
x_train = impute_missing_values(x_train)

# Fill the NaN entries of x_test in place with the training mean of their column.
test_nan_rows, test_nan_cols = np.where(np.isnan(x_test))
x_test[test_nan_rows, test_nan_cols] = train_col_means[test_nan_cols]

In [24]:
# Drop features that are constant in the training set. The keep-mask is
# derived from x_train only and then applied to both splits, so train and test
# retain exactly the same columns.
stds = x_train.std(axis=0)
has_variance = stds > 0
x_train = x_train[:, has_variance]
x_test = x_test[:, has_variance]

In [27]:
# Feature scaling by standardization.
# The mean and standard deviation are estimated on the training set only and
# applied to both splits. Scaling the test set with its own statistics would
# put its features on a different scale from the one the model is fitted on,
# and would divide by zero for any column that is constant only in the test set.
# Both statistics are captured before x_train is overwritten.
train_mean = x_train.mean(axis=0)
train_std = x_train.std(axis=0)  # the zero-variance filter above drops columns with std == 0
x_train = (x_train - train_mean) / train_std
x_test = (x_test - train_mean) / train_std

In [None]:
# TODO: recode nominal features, and possibly other feature types as well
# (not implemented yet)

In [28]:
# Map the labels onto 0/1 by shifting by one and halving.
# NOTE(review): assumes y_train is encoded as -1/+1 -- TODO confirm. The cell
# is not idempotent: running it again on 0/1 labels would yield 0.5/1.
y_train = 0.5 * (y_train + 1)

In [30]:
# Prepend a column of ones (bias / intercept term) to both feature matrices.
# The training column is sized from y_train, so a mismatch between the number
# of labels and the number of training rows raises here.
# NOTE(review): running this cell twice prepends a second column of ones.
bias_train = np.ones((y_train.shape[0], 1))
bias_test = np.ones((x_test.shape[0], 1))
x_train = np.hstack((bias_train, x_train))
x_test = np.hstack((bias_test, x_test))

In [32]:
# Persist the preprocessed splits as uncompressed .npz archives. Each array is
# stored under the keyword name given here (x_train / y_train, and
# x_test / test_ids).
train_path = "../data/dataset_prep/train.npz"
test_path = "../data/dataset_prep/test.npz"
np.savez(train_path, x_train=x_train, y_train=y_train)
np.savez(test_path, x_test=x_test, test_ids=test_ids)