# Laboratorio 4 - Machine Learning Pipeline
## Product Development - Ing. Preng Biba
### Alumno: Hugo Brian Bay Rojas - Carnet 20002544

In [37]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, Binarizer

from sklearn.pipeline import Pipeline

from feature_engine.imputation import(
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer
)

from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder
)

from feature_engine.transformation import LogTransformer
from feature_engine.transformation import YeoJohnsonTransformer

from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

import joblib

In [30]:
import my_preprocessors as mypp #nuestra libraria

In [31]:
data = pd.read_csv("train.csv")
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [32]:
#Cast de Variable Pclass
data['Pclass'] = data['Pclass'].astype('O')

In [33]:
X_train, X_test, y_train, y_test = train_test_split(
        data.drop(['PassengerId','Name','Ticket', 'Survived'], axis=1),
        data['Survived'],
        test_size=0.15,
        random_state=2021)

X_train.shape, X_test.shape

((757, 8), (134, 8))

In [34]:
X_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
210,3,male,24.0,0,0,7.0500,,S
876,3,male,20.0,0,0,9.8458,,S
666,2,male,25.0,0,0,13.0000,,S
819,3,male,10.0,3,2,27.9000,,S
736,3,female,48.0,1,3,34.3750,,S
...,...,...,...,...,...,...,...,...
41,2,female,27.0,1,0,21.0000,,S
187,1,male,45.0,0,0,26.5500,,S
46,3,male,,1,0,15.5000,,Q
179,3,male,36.0,0,0,0.0000,,S


## Configuración del Machine Learning Pipeline

In [47]:
#Variables categoricas con NA
CATEGORICAL_VARS_WITH_NA_FREQUENT = ['Embarked']

#Variable categoricas con NA pero indicador de Missing
CATEGORICAL_VARS_WITH_NA_MISSING = ['Cabin']

#Variables numéricas con NA
NUMERICAL_VARS_WITH_NA = ['Age', 'Fare']

#Varibles para transformación logaritmia
NUMERICALS_LOG_VARS = ["Fare"]

#Variable a extraer la primera letra
FIRST_CHAR_VARS = ['Cabin']

#Variables categoricas a codificar sin ordinalidad
CATEGORICAL_VARS = ['Sex', 'Embarked', 'Cabin']

#Variables seleccionadas para la predicción
FEATURES = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']

## Machine Learing PipeLine

In [90]:
Titanic_pipeline = Pipeline([
    
    #============= IMPUTACIONES ===================#
    
    #1. Imputación de varaibles categoricas
    ('missing_imputation', 
         CategoricalImputer(imputation_method='missing', variables=CATEGORICAL_VARS_WITH_NA_MISSING)
    ),
    
    #2. Imputación de variables categoricas con NA basado en frequiencia.
    ('frequent_imputation', 
         CategoricalImputer(imputation_method='frequent', variables=CATEGORICAL_VARS_WITH_NA_FREQUENT)
    ),
    
    #3. Indicamos Faltante en variables numéricas para imputar
    ('missing_indicator', AddMissingIndicator(variables=NUMERICAL_VARS_WITH_NA)),
    
    #4. Imputación de mediana para variables categoricas
    ('mean_imputation', MeanMedianImputer(
        imputation_method='mean', variables=NUMERICAL_VARS_WITH_NA)
    ),
    
    #============= TRANSFORMACIÓN DE VARIABLES NUMÉRICAS =============
    
    #5. Transformación logaritmica
    ('log', YeoJohnsonTransformer(variables=NUMERICALS_LOG_VARS)),
    
    #=============== EXTRACCION DE PRIMER CARACTER DE VARIABLES ==============
    ('extract_first_char', mypp.ExtractFirstCharacter(
        variables=FIRST_CHAR_VARS)),
    
    #============ CODIFICACION DE VARIABLES CATEGORICAS NOMINALES ============
    
    ('categorical_encoder', OrdinalEncoder(
        encoding_method='ordered', variables=CATEGORICAL_VARS)),
    
    #=========== SCALER ==============
    ('scaler', MinMaxScaler()),
    
    #=========== ENTRENAMIENTO DEL MODELO ============
    ('Random_forest', RandomForestClassifier(n_estimators=100)),
])

In [91]:
Titanic_pipeline.fit(X_train, y_train)

Pipeline(steps=[('missing_imputation', CategoricalImputer(variables=['Cabin'])),
                ('frequent_imputation',
                 CategoricalImputer(imputation_method='frequent',
                                    variables=['Embarked'])),
                ('missing_indicator',
                 AddMissingIndicator(variables=['Age', 'Fare'])),
                ('mean_imputation',
                 MeanMedianImputer(imputation_method='mean',
                                   variables=['Age', 'Fare'])),
                ('log', YeoJohnsonTransformer(variables=['Fare'])),
                ('extract_first_char',
                 ExtractFirstCharacter(variables=['Cabin'])),
                ('categorical_encoder',
                 OrdinalEncoder(variables=['Sex', 'Embarked', 'Cabin'])),
                ('scaler', MinMaxScaler()),
                ('Random_forest', RandomForestClassifier())])

In [92]:
preds = Titanic_pipeline.predict(X_test)

In [93]:
auc = round(roc_auc_score(y_test, preds), 4)
acc = round(accuracy_score(y_test, preds), 4)
print("ROC_AUC: " + str(auc) + " ACC: " + str(acc))

ROC_AUC: 0.7512 ACC: 0.7687


In [94]:
#Guardamos pipeline
joblib.dump(Titanic_pipeline, 'Titanic_pipeline.pkl')

['Titanic_pipeline.pkl']

In [95]:
Test = pd.read_csv('test.csv')
Test = Test[FEATURES]
Test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,34.5,0,0,7.8292,,Q
1,3,female,47.0,1,0,7.0000,,S
2,2,male,62.0,0,0,9.6875,,Q
3,3,male,27.0,0,0,8.6625,,S
4,3,female,22.0,1,1,12.2875,,S
...,...,...,...,...,...,...,...,...
413,3,male,,0,0,8.0500,,S
414,1,female,39.0,0,0,108.9000,C105,C
415,3,male,38.5,0,0,7.2500,,S
416,3,male,,0,0,8.0500,,S


In [96]:
predicts = Titanic_pipeline.predict(Test)
predicts

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,