# 003: Preprocessing the data

In [6]:
import sys

import numpy as np

sys.path.append("../")
import helpers
from preprocessing import preprocess

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
x_train, x_test, y_train, train_ids, test_ids = helpers.load_csv_data("../data/dataset", sub_sample=False)

In [3]:
x_train_orig = x_train.copy()
x_test_orig = x_test.copy()
y_train_orig = y_train.copy()

In [4]:
x_train = x_train_orig.copy()
x_test = x_test_orig.copy()
y_train = y_train_orig.copy()

In [5]:
# load and parse missing values
missing_values = []
with open("../data/missing_values.txt", "r") as f:
    for line in f:
        line = line.strip().strip('"')  # remove whitespace and surrounding quotes
        # split by comma and convert to int
        numbers = [int(x.strip()) for x in line.split(",") if x.strip() != ""]
        missing_values.append(numbers)

assert x_train.shape[1] == len(missing_values), "Mismatch between features and missing values"

# replace missing values with np.nan
for col, miss_vals in enumerate(missing_values):
    for miss_val in miss_vals:
        x_train[x_train[:, col] == miss_val, col] = np.nan
        x_test[x_test[:, col] == miss_val, col] = np.nan


In [6]:
# recoding of nominal and maybe others -> one-hot encoding
variable_type = []
with open("../data/variable_type.txt", "r") as f:
    for line in f:
        variable_type.append(line.strip().strip('"')) # remove whitespace and surrounding quotes

nominal_features = np.where(np.array(variable_type)=="nominal")[0]
one_hot_encoded = []
for idx in nominal_features:
    unique_vals = np.unique(x_train[:, idx])
    print(f"Feature {idx}: {len(unique_vals)} unique values")
    if len(unique_vals) < 50:
        train_cols = [(x_train[:, idx] == val).astype(float) for val in unique_vals if val != np.nan]
        test_cols = [(x_test[:, idx] == val).astype(float) for val in unique_vals if val != np.nan]
        x_train = np.column_stack([x_train, *train_cols])
        x_test = np.column_stack([x_test, *test_cols])
        one_hot_encoded.append(idx)
# Delete the original columns
#x_train = np.delete(x_train, np.array(one_hot_encoded), axis=1)
#x_test = np.delete(x_test, np.array(one_hot_encoded), axis=1)

Feature 0: 53 unique values
Feature 34: 5 unique values
Feature 51: 7 unique values
Feature 53: 4 unique values
Feature 58: 9 unique values
Feature 88: 76 unique values
Feature 91: 77 unique values
Feature 102: 12 unique values
Feature 106: 10 unique values
Feature 119: 16 unique values
Feature 122: 14 unique values
Feature 125: 7 unique values
Feature 130: 9 unique values
Feature 182: 3 unique values
Feature 189: 6 unique values
Feature 190: 5 unique values
Feature 191: 7 unique values
Feature 194: 5 unique values
Feature 196: 5 unique values
Feature 198: 5 unique values
Feature 201: 7 unique values
Feature 216: 8 unique values
Feature 217: 4 unique values
Feature 218: 5 unique values
Feature 224: 8 unique values
Feature 225: 8 unique values
Feature 227: 3 unique values
Feature 239: 9 unique values
Feature 240: 8 unique values
Feature 242: 9 unique values
Feature 243: 3 unique values
Feature 244: 6 unique values
Feature 245: 6 unique values
Feature 282: 3 unique values
Feature 283: 3 

In [107]:
# Missing value imputation
# TODO: try other strategies
# Mean imputation for each column in x_train
def impute_missing_values(x):
    col_means = np.nanmean(x, axis=0)
    inds = np.where(np.isnan(x))
    x[inds] = np.take(col_means, inds[1])
    return x
x_train = impute_missing_values(x_train)
x_test = impute_missing_values(x_test)

In [108]:
# kick out features with no variance in training or test set
stds_train = np.std(x_train, axis=0)
stds_test = np.std(x_test, axis=0)
x_train = x_train[:, (stds_train > 0) & (stds_test > 0)]
x_test = x_test[:, (stds_train > 0) & (stds_test > 0)]

In [None]:
# feature scaling by standardization
x_train = (x_train - x_train.mean(axis=0)) / x_train.std(axis=0)
x_test = (x_test - x_train.mean(axis=0)) / x_train.std(axis=0)

In [110]:
y_train = (y_train + 1) / 2  # convert to 0/1

In [112]:
x_train = np.c_[np.ones((y_train.shape[0], 1)), x_train] # add bias term
x_test = np.c_[np.ones((x_test.shape[0], 1)), x_test] # add bias term

In [113]:
np.savez("../data/dataset_prep/train.npz", x_train=x_train, y_train=y_train)
np.savez("../data/dataset_prep/test.npz", x_test=x_test, test_ids=test_ids)

In [7]:
preprocess(one_hot_encoding=False, save_dir="../data/dataset_prep/")

Loading raw data...
Replacing missing value codes with np.nan...
Saving preprocessed data to ../data/dataset_prep/...


(array([[5.3000000e+01, 1.1000000e+01, 1.1162015e+07, ...,           nan,
                   nan, 2.0000000e+00],
        [3.3000000e+01, 1.2000000e+01, 1.2152015e+07, ...,           nan,
                   nan,           nan],
        [2.0000000e+01, 1.0000000e+01, 1.0202015e+07, ..., 1.0000000e+00,
         2.0000000e+00, 2.0000000e+00],
        ...,
        [3.9000000e+01, 1.0000000e+01, 1.0202015e+07, ..., 2.0000000e+00,
         2.0000000e+00, 2.0000000e+00],
        [3.3000000e+01, 1.2000000e+01, 1.2302015e+07, ...,           nan,
                   nan, 2.0000000e+00],
        [3.2000000e+01, 9.0000000e+00, 9.1220150e+06, ...,           nan,
                   nan, 2.0000000e+00]]),
 array([[4.4000000e+01, 2.0000000e+00, 2.0820150e+06, ..., 1.0000000e+00,
         1.0000000e+00, 2.0000000e+00],
        [2.7000000e+01, 1.0000000e+00, 1.1920150e+06, ...,           nan,
                   nan, 2.0000000e+00],
        [3.5000000e+01, 5.0000000e+00, 5.2620150e+06, ..., 1.0000000e+00,