# Laboratorio 4 - Pipeline
Andrea Cecilia Rivas Castañeda
16001120

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from feature_engine.encoding import OneHotEncoder

from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, Binarizer
from sklearn.pipeline import Pipeline

from feature_engine.imputation import(
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer,
    RandomSampleImputer
)

from feature_engine import transformation as vt

from feature_engine.encoding import CountFrequencyEncoder, OneHotEncoder

from feature_engine.selection import DropFeatures

from feature_engine.wrappers import SklearnTransformerWrapper

import my_preprocessors as mypp

import joblib

In [2]:
# Read the training CSV from the working directory and preview the first rows.
data = pd.read_csv(filepath_or_buffer="train.csv")
data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# No columns are dropped at this stage: unused columns are removed later by
# the `drop_features` step of the pipeline, so `data` is only previewed here.
data.head()

In [3]:
# Hold out 30% of the rows for evaluation. The predictors are every column
# except 'Survived', which is the target; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['Survived']),
    data['Survived'],
    test_size=0.3,
    random_state=2022,
)

In [4]:
# ---------------------------------------------------------------------------
# MISSING-VALUE HANDLING
# ---------------------------------------------------------------------------
# Categorical variables whose missing values are imputed.
CATEGORICAL_VARS_WITH_NA_MISSING = [
    'Embarked',
]

# Numerical variables whose missing values are imputed.
NUMERICAL_VARS_WITH_NA = [
    'Age',
    'Fare',
]

# Variables passed to the random-sample imputer.
RANDOM_NA = [
    'SibSp',
    'Parch',
    'Ticket',
    'PassengerId',
    'Cabin',
    'Name',
    'Pclass',
]

# ---------------------------------------------------------------------------
# CATEGORICAL VARIABLE TRANSFORMATIONS
# ---------------------------------------------------------------------------
OH_ENCODING_VARS = [
    'Sex',
]

FREQUENCY_ENCODING_VARS = [
    'Ticket',
]

CAT_ORDINAL_ENCODING_VARS = [
    'Embarked',
]

# ---------------------------------------------------------------------------
# NUMERICAL VARIABLE TRANSFORMATIONS
# ---------------------------------------------------------------------------
# Variables to binarize because of their strong skew.
BINARIZE_VARS = [
    'SibSp',
    'Parch',
]

YJ_TRANSFORM = [
    'Age',
]
LOG_TRANSFORM = [
    'Fare',
]

# ---------------------------------------------------------------------------
# SELECTED VARIABLES
# ---------------------------------------------------------------------------
FEATURES = [
    'Pclass',
    'Sex',
]

# Columns removed before the model is fitted.
# 'Age_na' is the variable created for the NA imputation of 'Age'.
DROP_FEATURES = [
    'SibSp',
    'Parch',
    'Age',
    'Ticket',
    'Fare',
    'Embarked',
    'PassengerId',
    'Cabin',
    'Name',
    'Age_na',
]

In [5]:
# End-to-end pipeline: imputation -> variable transformations -> column drop
# -> logistic regression. Step names and order are part of the pipeline's
# interface (they are used to address steps), so they are kept stable.
survivedTitanic_pipeline = Pipeline([
    # IMPUTATION
    # Categorical - impute with the most frequent category
    ('frequency_imputation',
         CategoricalImputer(imputation_method='frequent', variables=CATEGORICAL_VARS_WITH_NA_MISSING)
    ),
    # Numerical - add missing-value indicator columns before filling the gaps
    ('missing_indicator', AddMissingIndicator(variables=NUMERICAL_VARS_WITH_NA)),

    # Mean imputation (the method is 'mean', not the median)
    ('mean_imputation', MeanMedianImputer(
        imputation_method='mean', variables=NUMERICAL_VARS_WITH_NA)
    ),

    # Random-sample imputation. A fixed seed (same value as the train/test
    # split) makes the imputed values reproducible across runs and across
    # repeated predictions on the same input.
    ('random_imputation', RandomSampleImputer(variables=RANDOM_NA, random_state=2022)),

    # TRANSFORMATIONS
    # Categorical - one-hot encoding, omitting the dummy of the last category
    ('ohe_transformation',
        OneHotEncoder(variables=OH_ENCODING_VARS, drop_last=True)),
    # Numerical
    ('yeoJ', vt.YeoJohnsonTransformer(variables=YJ_TRANSFORM)),

    # NOTE(review): presumably adds one to the variable so the logarithm below
    # is defined for zero values - confirm in my_preprocessors.
    ('mlog_plusOne', mypp.PlusOneVariableTransformer(variables=LOG_TRANSFORM)),

    ('log', vt.LogTransformer(variables=LOG_TRANSFORM)),

    # Binarize with threshold 0: values greater than 0 become 1, the rest 0
    ('binarizer', SklearnTransformerWrapper(
        transformer=Binarizer(threshold=0), variables=BINARIZE_VARS)
    ),

    # Drop the variables that are not used by the model
    ('drop_features', DropFeatures(features_to_drop=DROP_FEATURES)),

    # MODEL
    ('Logistic_Reg', LogisticRegression())

])

In [6]:
# Fit every pipeline step, ending with the classifier, on the training split.
survivedTitanic_pipeline.fit(X=X_train, y=y_train)

Pipeline(steps=[('frequency_imputation',
                 CategoricalImputer(imputation_method='frequent',
                                    variables=['Embarked'])),
                ('missing_indicator',
                 AddMissingIndicator(variables=['Age', 'Fare'])),
                ('mean_imputation',
                 MeanMedianImputer(imputation_method='mean',
                                   variables=['Age', 'Fare'])),
                ('random_imputation',
                 RandomSampleImputer(variables=['SibSp', 'Parch', 'Ticket',
                                                'Pa...
                 PlusOneVariableTransformer(variables=['Fare'])),
                ('log', LogTransformer(variables=['Fare'])),
                ('binarizer',
                 SklearnTransformerWrapper(transformer=Binarizer(threshold=0),
                                           variables=['SibSp', 'Parch'])),
                ('drop_features',
                 DropFeatures(features_to_drop=['Sib

In [7]:
# Predict the target for the held-out rows with the fitted pipeline.
preds = survivedTitanic_pipeline.predict(X=X_test)

In [8]:
# Accuracy of the predictions against the true labels of the test split.
accuracy_score(y_true=y_test, y_pred=preds)

0.7985074626865671

In [9]:
# Persist the fitted pipeline to disk so it can be reloaded for later use.
joblib.dump(value=survivedTitanic_pipeline, filename='survivedTitanic_pipeline.pkl')

['survivedTitanic_pipeline.pkl']