In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt

# Seed NumPy's legacy global random state so any np.random-based draws are repeatable.
np.random.seed(69)

In [2]:
# Read the training table and map +/-infinity to NaN in one pass, so infinite
# entries are treated as missing values from here on.
train_path = 'train.csv'
data = pd.read_csv(train_path).replace([np.inf, -np.inf], np.nan)

In [4]:
columns = list(data.columns)
# Everything except the first and the last column is kept as a feature.
# NOTE(review): presumably the first column is a row id and the last one is 'y' -- confirm against train.csv.
feature_names = columns[1:-1]

# Store the target vector on its own; np.save does not create the 'Data/' directory.
y = np.array(data['y'])
np.save('Data/y.npy', y)

data = data[feature_names]

In [5]:
columns = data.columns

# Count missing entries per column and retain only the columns with none.
nan_counts = data.isna().sum()
columns_without_nan = data.columns[nan_counts == 0]
data_without_nan = data[columns_without_nan]

#### Drop columns with constant values

In [6]:
# A column whose standard deviation is exactly zero holds a single repeated value; keep the others.
column_std = data_without_nan.std(axis=0)
varying_columns = data_without_nan.columns[column_std != 0]
data_without_nan = data_without_nan[varying_columns]

#### Split columns into two groups: categorical and real

In [8]:
def is_cat(column):
    """Return True when every value in ``column`` is integer-valued.

    The column is cast to integers and compared element-wise with the
    original values; it counts as categorical only if no element changes
    under the cast (``3.0`` passes, ``3.5`` does not).

    Parameters
    ----------
    column : pandas.Series or numpy.ndarray
        Numeric values to test. Values are expected to be finite: pandas
        raises when asked to cast NaN or inf to an integer dtype.

    Returns
    -------
    bool
        True if all values survive the integer cast unchanged (an empty
        column also yields True), False otherwise.
    """
    # `np.int` was only an alias for the builtin `int` and was removed in
    # NumPy 1.24, so the builtin is used directly.
    return not (column.astype(int) != column).any()

In [9]:
# Classify each NaN-free column once with is_cat, then split the names by that verdict
# (dict insertion order keeps the original column order inside each group).
is_cat_by_column = {
    name: is_cat(data_without_nan[name]) for name in data_without_nan.columns
}

cat_columns = [name for name, flagged in is_cat_by_column.items() if flagged]
real_columns = [name for name, flagged in is_cat_by_column.items() if not flagged]

In [10]:
# Show how many columns landed in each group, as (real, categorical).
len(real_columns), len(cat_columns)

(826, 221)

#### Drop highly correlated real columns

In [11]:
# Among the real-valued columns, drop every column whose absolute correlation
# with an earlier column exceeds this threshold.
CORR_THRESHOLD = 0.95

temp = data_without_nan[real_columns]
corr_matrix = temp.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so each pair is
# examined once and a column is never compared with itself.
# `np.bool` was removed in NumPy 1.24; the builtin `bool` is the equivalent dtype.
upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper = corr_matrix.where(upper_mask)

# In `upper`, a column only has entries for the columns that precede it, so a
# column is dropped when it is too strongly correlated with any earlier one.
to_drop = [column for column in upper.columns if (upper[column] > CORR_THRESHOLD).any()]
temp = temp.drop(columns=to_drop)

print(temp.shape)

(1095, 407)


In [13]:
# From here on `real_columns` holds only the real-valued columns left in `temp`
# after the correlation filter.
real_columns = list(temp.columns)

#### Merge result

In [22]:
# Combine both groups of surviving column names and order them by name.
_columns = sorted([*real_columns, *cat_columns])

# Reorder the frame, then carve out the per-type subsets
# (each subset keeps the order of its own name list).
data_without_nan = data_without_nan[_columns]
data_without_nan_real, data_without_nan_cat = (
    data_without_nan[group] for group in (real_columns, cat_columns)
)

In [16]:
# Sanity check: (rows, columns) of the frame after filtering and reordering.
data_without_nan.shape

(1095, 628)

In [17]:
# Persist the full filtered feature matrix (columns in sorted-name order) as a NumPy array.
np.save('Data/data_without_nan.npy', np.array(data_without_nan))

In [23]:
# Persist the real-valued and categorical subsets as separate NumPy arrays.
per_type_outputs = {
    'Data/data_without_nan_real.npy': data_without_nan_real,
    'Data/data_without_nan_cat.npy': data_without_nan_cat,
}
for path, frame in per_type_outputs.items():
    np.save(path, np.array(frame))

In [25]:
# Column names to select from the test set: real-valued columns first, then categorical ones.
# NOTE(review): this order is generally not the sorted order used for `data_without_nan`.
columns_to_keep = list(data_without_nan_real) + list(data_without_nan_cat)

#### Transform test

In [26]:
# Load the raw test table.
# NOTE(review): unlike the training data, +/-inf values are not replaced with NaN here -- confirm that is intended.
test = pd.read_csv('test.csv')

In [27]:
# Restrict the test set to the columns retained from the training data, in `columns_to_keep` order.
test = test[columns_to_keep]

In [31]:
# Column-wise concatenation of the real and categorical subsets (real columns first).
# NOTE(review): `merged` is not referenced again in the visible cells.
merged = pd.concat([data_without_nan_real, data_without_nan_cat], axis=1)

In [36]:
# Stack the real and categorical subsets side by side as one NumPy array (real columns first).
# NOTE(review): `merged_data` is neither saved nor used in the visible cells.
merged_data = np.hstack([np.array(data_without_nan_real), np.array(data_without_nan_cat)])

In [40]:
# Save the column-filtered test set as a NumPy array.
# NOTE(review): this file is written to the working directory, whereas the training arrays go under 'Data/'.
# NOTE(review): the test columns follow `columns_to_keep` (real, then categorical), while
# 'Data/data_without_nan.npy' was saved in sorted-column order -- confirm downstream code pairs
# this file with a training matrix that has the same column order.
np.save('test_undamaged.npy', np.array(test))