# Preparing the data

In this notebook I prepare the data for the [Porto Seguro Safe Driver Prediction Kaggle competition](https://www.kaggle.com/c/porto-seguro-safe-driver-prediction).

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
import bcolz

from importlib import reload
import data_prep; reload(data_prep)
from data_prep import *

I have downloaded the files from [Kaggle](https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/data) and unzipped them into the `data/` directory.

In [2]:
# Relative paths of the two unzipped competition CSV files.
train_file, test_file = 'data/train.csv', 'data/test.csv'

In [3]:
# Read both CSVs, using the `id` column as the row index.
train_df = pd.read_csv(train_file, index_col='id')
test_df = pd.read_csv(test_file, index_col='id')

# Only the training file carries the `target` label; keep it as a plain array.
targets = train_df['target'].values
num_train = train_df.shape[0]

# Stack train (without its label) on top of test so that both are encoded
# together; the first `num_train` rows of `whole_df` are the training rows.
whole_df = pd.concat([train_df.drop('target', axis=1), test_df])

# Columns whose name contains 'cat' are treated as categorical; every other
# column goes into `rest_of_columns`.
categorical_columns = [name for name in whole_df.columns if 'cat' in name]
rest_of_columns = [name for name in whole_df.columns if name not in categorical_columns]

In [4]:
# One-hot encode every categorical column on its own and lay the resulting
# indicator blocks side by side (one block per column, in column order).
categorical_features = np.concatenate(
    [pd.get_dummies(whole_df[name]).values for name in categorical_columns],
    axis=1,
)

# The remaining columns are used as-is: each one becomes a column of a 2-D array.
numerical_features = np.column_stack(
    [whole_df[name].values for name in rest_of_columns]
)

# Final layout: indicator columns first, then the untouched columns.
all_features = np.concatenate((categorical_features, numerical_features), axis=1)

In [5]:
# Undo the earlier stacking: the first `num_train` rows came from the training
# file, everything after them from the test file.
training_features, test_features = all_features[:num_train], all_features[num_train:]

In [6]:
# Sanity check: the split must return exactly one row per row of the training frame.
assert training_features.shape[0] == len(train_df)
training_features.shape

(595212, 227)

In [7]:
# Sanity check: everything after the training rows must be one row per row of the test frame.
assert test_features.shape[0] == len(test_df)
test_features.shape

(892816, 227)

In [8]:
# Sanity check: there must be exactly one label per training feature row.
assert targets.shape[0] == training_features.shape[0]
targets.shape

(595212,)

In [9]:
# Persist the three arrays, in this order. `save_array` is not defined in this
# notebook -- presumably it comes from the `data_prep` star import.
for path, array in (
    ('data/train_features.bc', training_features),
    ('data/targets.bc', targets),
    ('data/test_features.bc', test_features),
):
    save_array(path, array)

In [10]:
# Read the saved training features back from disk as a round-trip check.
test_array = load_array('data/train_features.bc/')
# The reloaded array must have the same shape as the array that was written.
assert test_array.shape == training_features.shape
test_array.shape

(595212, 227)