# Laboratorio 4 - Feature Engineering
## Product Development - Ing. Preng Biba
### Alumno: Hugo Brian Bay Rojas - Carnet 20002544

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import joblib

In [2]:
data = pd.read_csv('train.csv')
print(data.shape)

(891, 12)


In [3]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### 1. Split Train y Test y eliminación de variables que no serán utilizadas

In [24]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['PassengerId','Name','Ticket'], axis = 1),
    data['Survived'],
    test_size = 0.15,
    random_state = 2021
)

In [25]:
X_train.shape, X_test.shape

((757, 9), (134, 9))

### 2. Missing Values

#### 2.1. Missing values para variables categóricas

In [26]:
cat_vars = [var for var in X_train.columns if (data[var].dtype == 'O' and var != 'Survived')]
cat_vars = cat_vars + ['Pclass']

In [27]:
X_train[cat_vars] = X_train[cat_vars].astype('O')
X_test[cat_vars] = X_test[cat_vars].astype('O')

In [28]:
len(cat_vars)

4

##### 2.1.1. Detección de NaN en variables categóricas

In [29]:
cat_vars_with_na = [var for var in cat_vars if X_train[var].isnull().sum() >0]

In [30]:
# Porcentajes de valores faltantes en variables
X_train[cat_vars_with_na].isnull().mean().sort_values(ascending = False)

Cabin       0.764861
Embarked    0.001321
dtype: float64

In [31]:
# Separación de variables que serán tratadas con etiqueta de faltante y de las que se tratarán con procedimiento de frecuencia según el peso de sus faltantes
vars_with_missing_string = [var for var in cat_vars_with_na if X_train[var].isnull().mean() > 0.2]
vars_freq_category = [var for var in cat_vars_with_na if X_train[var].isnull().mean() <= 0.2]

##### Aplicamos criterio para data faltante

In [32]:
# Sustituir faltantes con etiqueta de missing
X_train[vars_with_missing_string] = X_train[vars_with_missing_string].fillna('Missing')
X_test[vars_with_missing_string] = X_test[vars_with_missing_string].fillna('Missing')

In [33]:
# Sustituir valores faltantes con la moda para los que tienen pocos faltantes
for var in vars_freq_category:
    mode = X_train[var].mode()[0]

    X_train[var].fillna(mode, inplace=True)
    X_test[var].fillna(mode, inplace=True)

    print(var, "----------", mode)

Embarked ---------- S


In [34]:
# Porcentajes de valores faltantes en variables posterior al tratamiento
X_train[cat_vars_with_na].isnull().mean().sort_values(ascending = False)

Embarked    0.0
Cabin       0.0
dtype: float64

#### 2.2. Missing Values para variables numéricas

In [35]:
num_vars = [var for var in X_train.columns if var not in cat_vars and var !='Survived']

In [36]:
len(num_vars)

4

In [37]:
# Numéricas con faltantes
num_vars_with_na = [var for var in num_vars if X_train[var].isnull().sum() > 0]
X_train[num_vars_with_na].isnull().mean()

Age    0.200793
dtype: float64

##### Aplicamos criterio para data faltante

In [38]:
for var in num_vars_with_na:
    mean_val = X_train[var].mean()

    X_train[var].fillna(mean_val, inplace = True)
    X_test[var].fillna(mean_val, inplace = True)

    print(var, mean_val)

Age 29.51267768595041


In [39]:
# Numéricas con faltantes después del tratamiento
X_train[num_vars_with_na].isnull().mean()

Age    0.0
dtype: float64

### 3. Transformación de variables numéricas

In [40]:
# Aplicamos transformación logaritmica para variable Fare
X_train['Fare'] = np.log(X_train['Fare']+1)
X_test['Fare'] = np.log(X_test['Fare']+1)

### 4. Codificación de variables categóricas

#### 4.1. Transformación de variable 'Cabin' en variable 'Cubierta'

In [41]:
# Creación de la variable 'Cubierta' y eliminación de la variable 'Cabin'
X_train['Cubierta'] = X_train['Cabin'].str[0]
X_train.drop(['Cabin'], axis=1, inplace=True)

X_test['Cubierta'] = X_test['Cabin'].str[0]
X_test.drop(['Cabin'], axis=1, inplace=True)

In [47]:
X_train['Cubierta'].value_counts()

M    579
C     50
B     44
D     29
E     27
A     14
F     10
G      3
T      1
Name: Cubierta, dtype: int64

#### 4.2. Codificación por orden de frecuencia de etiqueta

In [46]:
def replace_category_vals(train, test, var):

    order_labels = train[var].value_counts().index

    ordinal_values = {k: i for i, k in enumerate(order_labels, 0)}

    print(var, ordinal_values)

    train[var] = train[var].map(ordinal_values)
    test[var] = test[var].map(ordinal_values)

In [49]:
# Definimos las categorías que se codificarán
freq_cat_vars = [var for var in X_train.columns if X_train[var].dtype == 'O' and var != 'Pclass']
freq_cat_vars

['Sex', 'Embarked', 'Cubierta']

In [50]:
for var in freq_cat_vars:
    replace_category_vals(X_train, X_test, var)

Sex {'male': 0, 'female': 1}
Embarked {'S': 0, 'C': 1, 'Q': 2}
Cubierta {'M': 0, 'C': 1, 'B': 2, 'D': 3, 'E': 4, 'A': 5, 'F': 6, 'G': 7, 'T': 8}


### 5. Feature Scaling

In [53]:
scaler = MinMaxScaler()

scaler.fit(X_train)

X_train = pd.DataFrame(
    scaler.transform(X_train),
    columns = X_train.columns
    )

X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns = X_train.columns
    )

In [54]:
X_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Cubierta
0,1.0,0.0,1.0,0.560191,0.125,0.166667,0.81898,0.0,0.0
1,0.0,0.5,0.0,0.308872,0.125,0.333333,0.601092,0.5,0.0
2,0.0,1.0,0.0,0.365578,0.0,0.0,0.352955,0.0,0.0
3,0.0,1.0,1.0,0.120382,0.0,0.333333,0.516728,0.0,0.0
4,1.0,1.0,1.0,0.195778,0.0,0.0,0.347554,1.0,0.0


In [55]:
#Guardamos dataset con data preparada para entrenamiento.
X_train.to_csv('Preprocessed_data/prep_Xtrain.csv', index=False)
X_test.to_csv('Preprocessed_data/prep_Xtest.csv', index=False)

y_train.to_csv('Preprocessed_data/prep_ytrain.csv', index=False)
y_test.to_csv('Preprocessed_data/prep_ytest.csv', index=False)

In [56]:
joblib.dump(scaler, 'Preprocessed_data/minmax_scaler.joblib')

['Preprocessed_data/minmax_scaler.joblib']