# Proyecto - Machine Learning Pipeline
Andrea Cecilia Rivas Castañeda - 16001120

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random

from sklearn.impute import KNNImputer, SimpleImputer

import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, Binarizer
from feature_engine.encoding import OneHotEncoder

from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.pipeline import Pipeline

from feature_engine.imputation import(
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer,
    RandomSampleImputer
)

from feature_engine import transformation as vt
from feature_engine.encoding import CountFrequencyEncoder, OneHotEncoder
from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper
import my_preprocessors as mypp
import joblib

In [2]:
# Show every column when a DataFrame is displayed (None disables the column limit).
# Uses the documented top-level `pd.set_option` instead of going through the
# incidental `pd.pandas` attribute.
pd.set_option('display.max_columns', None)

In [3]:
# Load the raw dataset from the CSV file located next to the notebook,
# then preview its first rows as the cell output.
data = pd.read_csv("kidney_disease.csv")
data.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,36.0,1.2,,,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,18.0,0.8,,,11.3,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,53.0,1.8,,,9.6,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,26.0,1.4,,,11.6,35,7300,4.6,no,no,no,good,no,no,ckd


In [4]:
# Hold out 30% of the rows for testing; the seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['classification']),  # predictors: every column except the target
    data['classification'],                 # target column
    random_state=2022,
    test_size=0.3,
)

In [5]:
# Encode the target as 0 (notckd) / 1 (ckd).
# Labels are whitespace-stripped before mapping: other categorical columns of this
# dataset carry stray tabs/spaces (see YES_DIC_MAP), and any label that does not
# match a key exactly would otherwise be silently turned into NaN by `.map`.
target_mapping = {'notckd':0, 'ckd': 1}
y_train = y_train.str.strip().map(target_mapping)
y_test = y_test.str.strip().map(target_mapping)

In [6]:
# ------------------------------------- IMPUTATION -------------------------------------

## Numerical variables
# Imputed with the median.
NUMERICAL_VARS_WITH_NA_MEDIAN = ['sg']
# Imputed with the mean.
NUMERICAL_VARS_WITH_NA_MEAN = ['al', 'sc']
# Every numerical variable with missing values (median group first, then mean group).
NUMERICAL_VARS_WITH_NA = NUMERICAL_VARS_WITH_NA_MEDIAN + NUMERICAL_VARS_WITH_NA_MEAN

## Categorical variables
# Missing values are replaced by the label 'undefined'.
CATEGORICAL_VARS_WITH_NA_UNDEFINED = ['rbc', 'htn']

# ----------------------------------- TRANSFORMATIONS -----------------------------------

## Numerical variables
YJ_TRANSFORM = ['sc']
BINARIZE_VARS = ['al']

## Categorical variables
NORMAL_MAPPING_VARS = ['rbc']
YES_MAPPING_VARS = ['htn']

## Mapping dictionaries (category label -> numeric code)
NORMAL_DIC_MAP = {
    'normal': 1,
    'undefined': 0,
    'abnormal': -1,
}
# Includes the tab/space-prefixed spellings of yes/no.
YES_DIC_MAP = {
    'yes': 1,
    '\tyes': 1,
    ' yes': 1,
    'undefined': 0,
    'no': -1,
    '\tno': -1,
}

# ---------------------------------- FEATURE SELECTION ----------------------------------
# Columns removed before scaling: the '<var>_na' missing-indicator columns of the
# numerical variables, followed by the raw columns that are not used by the model.
DROP_FEATURES = [f'{var}_na' for var in NUMERICAL_VARS_WITH_NA] + [
    'id', 'age', 'bp', 'su', 'pc', 'pcc', 'ba', 'bgr',
    'bu', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc',
    'dm', 'cad', 'appet', 'pe', 'ane',
]

## Pipeline

In [7]:
kidney_disease_pipeline = Pipeline(steps=[
    # ===================================== IMPUTATION =====================================
    # Numerical variables: add a missing-value indicator for each variable.
    (
        'missing_indicator',
        AddMissingIndicator(variables=NUMERICAL_VARS_WITH_NA),
    ),

    # Numerical variables: mean imputation.
    (
        'mean_imputation',
        MeanMedianImputer(
            variables=NUMERICAL_VARS_WITH_NA_MEAN,
            imputation_method='mean',
        ),
    ),

    # Numerical variables: median imputation.
    (
        'median_imputation',
        MeanMedianImputer(
            variables=NUMERICAL_VARS_WITH_NA_MEDIAN,
            imputation_method='median',
        ),
    ),

    # Categorical variables: fill missing values with the label 'undefined'.
    (
        'missing_imputation',
        CategoricalImputer(
            imputation_method='missing',
            fill_value='undefined',
            variables=CATEGORICAL_VARS_WITH_NA_UNDEFINED,
        ),
    ),

    # =================================== TRANSFORMATIONS ===================================
    # Numerical variables: Yeo-Johnson transformation.
    (
        'yeoJ',
        vt.YeoJohnsonTransformer(variables=YJ_TRANSFORM),
    ),

    # Numerical variables: binarize skewed variables with a threshold of 1.
    (
        'binarizer',
        SklearnTransformerWrapper(
            transformer=Binarizer(threshold=1),
            variables=BINARIZE_VARS,
        ),
    ),

    # Categorical variables: custom Mapper steps configured with the mapping
    # dictionaries (normal/abnormal and yes/no label sets respectively).
    (
        'mapper_normal',
        mypp.Mapper(variables=NORMAL_MAPPING_VARS, mappings=NORMAL_DIC_MAP),
    ),
    (
        'mapper_yesNo',
        mypp.Mapper(variables=YES_MAPPING_VARS, mappings=YES_DIC_MAP),
    ),

    # ================================== FEATURE SELECTION ==================================
    (
        'drop_features',
        DropFeatures(features_to_drop=DROP_FEATURES),
    ),

    # ======================================= SCALER =======================================
    ('scaler', MinMaxScaler()),

    # =================================== MODEL TRAINING ===================================
    ('Logistic_Reg', LogisticRegression()),
])

In [8]:
# Fit all preprocessing steps and the final logistic regression on the training split.
kidney_disease_pipeline.fit(X_train, y_train)

Pipeline(steps=[('missing_indicator',
                 AddMissingIndicator(variables=['sg', 'al', 'sc'])),
                ('mean_imputation',
                 MeanMedianImputer(imputation_method='mean',
                                   variables=['al', 'sc'])),
                ('median_imputation', MeanMedianImputer(variables=['sg'])),
                ('missing_imputation',
                 CategoricalImputer(fill_value='undefined',
                                    variables=['rbc', 'htn'])),
                ('yeoJ', YeoJohnsonTransformer(...
                ('mapper_yesNo',
                 Mapper(mappings={'\tno': -1, '\tyes': 1, ' yes': 1, 'no': -1,
                                  'undefined': 0, 'yes': 1},
                        variables=['htn'])),
                ('drop_features',
                 DropFeatures(features_to_drop=['sg_na', 'al_na', 'sc_na', 'id',
                                                'age', 'bp', 'su', 'pc', 'pcc',
                                

In [9]:
# Predict labels for the held-out test split with the fitted pipeline.
preds = kidney_disease_pipeline.predict(X_test)

In [10]:
# Accuracy of the test-split predictions against the encoded target.
accuracy_score(y_test, preds)

0.9583333333333334

In [11]:
# Persist the fitted pipeline to disk; joblib.dump returns the list of file names written.
joblib.dump(kidney_disease_pipeline, 'kidney_disease_pipeline.pkl')

['kidney_disease_pipeline.pkl']