# Preparación del dataset para train/test

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the joined dataset; every later cell works on this frame.
data = pd.read_csv(filepath_or_buffer="data/dataset_joined.csv")

## Creación de variable target

In [3]:
# Boolean target: True for every row whose pregunta_13 answer is not 'C'.
# NOTE(review): missing answers (NaN) also compare as "not equal" and end up
# True -- confirm that is the intended labelling.
data['target'] = data['pregunta_13'].ne('C')

## Definición de variables categóricas y continuas

In [4]:
# Every column whose name contains 'pregunta', except those containing
# 'pregunta_13' (the column the target is derived from).
question_columns = [
    column
    for column in data.columns
    if 'pregunta' in column and 'pregunta_13' not in column
]
categorical_columns = ['genero', 'barrio_completo'] + question_columns

In [5]:
# Continuous columns; these are standardised further down.
numerical_columns = [
    'edad',
    'poblacion_por_barrio',
]

## Creación de dummies

In [6]:
# One-hot encode the categorical columns.
# NOTE(review): by default get_dummies only expands object/string/category
# columns and passes numeric ones through unchanged -- confirm the dtypes of
# the 'pregunta_*' columns match that expectation.
categorical_features_df = pd.get_dummies(data[categorical_columns])

In [7]:
# Attach the dummy columns to the working frame. Any column that already
# carries one of the dummy names is dropped first, so re-running this cell
# replaces the dummies instead of appending a duplicate copy of each one
# (duplicates would otherwise end up in the exported train/test files).
data = pd.concat(
    [
        data.drop(columns=categorical_features_df.columns, errors='ignore'),
        categorical_features_df,
    ],
    axis=1,
)

In [8]:
# Names of the generated dummy columns, kept as a plain list so they can be
# concatenated with the numerical feature names later.
categorical_features = list(categorical_features_df.columns)

## División en train/test

Divido utilizando solamente los índices; las filas se seleccionan más adelante con esos índices.

In [9]:
from sklearn.model_selection import train_test_split

# Split the row labels only (not the data itself), stratified on the target so
# both splits keep the same class balance. The fixed seed makes the split
# reproducible. No test_size is given, so scikit-learn's default applies
# (0.25 according to its documentation).
train_index, test_index = train_test_split(
    data.index,
    stratify=data['target'],
    random_state=42,
)

Entreno el standard scaler en train y transformo train y test

## Escalado de variables continuas

In [10]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply the same fitted
# transformation to both splits so no test-set statistics are used.
train_numerical = data.loc[train_index, numerical_columns]
test_numerical = data.loc[test_index, numerical_columns]

scaler = StandardScaler().fit(train_numerical)
train_numerical_scaled = scaler.transform(train_numerical)
test_numerical_scaled = scaler.transform(test_numerical)

In [11]:
# Scaled copies get a '_scaled' suffix so the raw columns stay untouched.
numerical_features = [
    f"{column}_scaled"
    for column in numerical_columns
]

In [12]:
# Wrap the scaled arrays back into DataFrames keyed by the original row labels
# so they can be aligned with `data` afterwards.
train_numerical_df = pd.DataFrame(train_numerical_scaled, index=train_index, columns=numerical_features)
test_numerical_df = pd.DataFrame(test_numerical_scaled, index=test_index, columns=numerical_features)
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# row-wise and produces the same result on every pandas version.
numerical_features_df = pd.concat([train_numerical_df, test_numerical_df])

Agrego features escaladas al dataset

In [13]:
# Attach the scaled columns, aligned on the row index. Columns that already
# carry one of the scaled names are dropped first, so re-running this cell
# refreshes them instead of appending a duplicate copy.
data = pd.concat(
    [
        data.drop(columns=numerical_features_df.columns, errors='ignore'),
        numerical_features_df,
    ],
    axis=1,
)

## Generación de train/test datasets solo con features

In [14]:
# Full model input: dummy columns first, then the scaled numerical columns.
feature_cols = [*categorical_features, *numerical_features]

In [15]:
# Keep only the model features plus the target, then slice out each split
# using the row labels produced by train_test_split.
selected_columns = feature_cols + ['target']
dataset_to_use = data[selected_columns]
train = dataset_to_use.loc[train_index]
test = dataset_to_use.loc[test_index]

## Guardo train y test en formato csv para compartir

In [16]:
# Persist both splits; the row index is written as the first CSV column.
for split_frame, split_path in ((train, "data/train.csv"), (test, "data/test.csv")):
    split_frame.to_csv(split_path)