# Laboratorio 4 - Machine Learning Pipeline
## Product Development - Ing. Preng Biba
### Alumno: Hugo Brian Bay Rojas - Carnet 20002544

In [20]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, Binarizer

from sklearn.pipeline import Pipeline

from feature_engine.imputation import(
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer
)

from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder
)

from feature_engine.transformation import LogTransformer
from feature_engine.transformation import YeoJohnsonTransformer

from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

import joblib

In [7]:
import my_preprocessors as mypp #nuestra libraria

In [2]:
data = pd.read_csv("train.csv")
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
#Cast de Variable Pclass
data['Pclass'] = data['Pclass'].astype('O')

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
        data.drop(['PassengerId','Name','Ticket'], axis=1),
        data['Survived'],
        test_size=0.15,
        random_state=2021)

X_train.shape, X_test.shape

((757, 9), (134, 9))

## Configuración del Machine Learning Pipeline

In [6]:
#Variables categoricas con NA
CATEGORICAL_VARS_WITH_NA_FREQUENT = ['Embarked']

#Variable categoricas con NA pero indicador de Missing
CATEGORICAL_VARS_WITH_NA_MISSING = ['Cabin']

#Variables numéricas con NA
NUMERICAL_VARS_WITH_NA = ['Age']

#Varibles para transformación logaritmia
NUMERICALS_LOG_VARS = ["Fare"]

#Variable a extraer la primera letra
FIRST_CHAR_VARS = ['Cabin']

#Variables categoricas a codificar sin ordinalidad
CATEGORICAL_VARS = ['Sex', 'Embarked', 'Cabin']

## Machine Learing PipeLine

In [16]:
Titanic_pipeline = Pipeline([
    
    #============= IMPUTACIONES ===================#
    
    #1. Imputación de varaibles categoricas
    ('missing_imputation', 
         CategoricalImputer(imputation_method='missing', variables=CATEGORICAL_VARS_WITH_NA_MISSING)
    ),
    
    #2. Imputación de variables categoricas con NA basado en frequiencia.
    ('frequent_imputation', 
         CategoricalImputer(imputation_method='frequent', variables=CATEGORICAL_VARS_WITH_NA_FREQUENT)
    ),
    
    #3. Indicamos Faltante en variables numéricas para imputar
    ('missing_indicator', AddMissingIndicator(variables=NUMERICAL_VARS_WITH_NA)),
    
    #4. Imputación de mediana para variables categoricas
    ('mean_imputation', MeanMedianImputer(
        imputation_method='mean', variables=NUMERICAL_VARS_WITH_NA)
    ),
    
    #============= TRANSFORMACIÓN DE VARIABLES NUMÉRICAS =============
    
    #5. Transformación logaritmica
    ('log', YeoJohnsonTransformer(variables=NUMERICALS_LOG_VARS)),
    
    #=============== EXTRACCION DE PRIMER CARACTER DE VARIABLES ==============
    ('extract_first_char', mypp.ExtractFirstCharacter(
        variables=FIRST_CHAR_VARS)),
    
    #============ CODIFICACION DE VARIABLES CATEGORICAS NOMINALES ============
    
    ('categorical_encoder', OrdinalEncoder(
        encoding_method='ordered', variables=CATEGORICAL_VARS)),
    
    #=========== SCALER ==============
    ('scaler', MinMaxScaler()),
    
    #=========== ENTRENAMIENTO DEL MODELO ============
    ('Logit', LogisticRegression()),
])

In [17]:
Titanic_pipeline.fit(X_train, y_train)

Pipeline(steps=[('missing_imputation', CategoricalImputer(variables=['Cabin'])),
                ('frequent_imputation',
                 CategoricalImputer(imputation_method='frequent',
                                    variables=['Embarked'])),
                ('missing_indicator', AddMissingIndicator(variables=['Age'])),
                ('mean_imputation',
                 MeanMedianImputer(imputation_method='mean',
                                   variables=['Age'])),
                ('log', YeoJohnsonTransformer(variables=['Fare'])),
                ('extract_first_char',
                 ExtractFirstCharacter(variables=['Cabin'])),
                ('categorical_encoder',
                 OrdinalEncoder(variables=['Sex', 'Embarked', 'Cabin'])),
                ('scaler', MinMaxScaler()), ('Logit', LogisticRegression())])

In [18]:
preds = Titanic_pipeline.predict(X_test)

In [21]:
auc = round(roc_auc_score(y_test, preds), 4)
acc = round(accuracy_score(y_test, preds), 4)
print("ROC_AUC: " + str(auc) + " ACC: " + str(acc))

ROC_AUC: 1.0 ACC: 1.0


In [22]:
#Guardamos pipeline
joblib.dump(Titanic_pipeline, 'Titanic_pipeline.pkl')

['Titanic_pipeline.pkl']