## Categoria Missing 

Este notebook tem como objetivo apresentar o uso do classe sklearn.impute para imputação da categoria missing .  Vou utilizar como exemplo o Dataset do Titanic. 

### Importação das Bibliotecas

In [196]:
# trabalhar com dataframes
import pandas as pd
# trabalhar com arrays
import numpy as np
# gráficos
import matplotlib.pyplot as plt
# técnica de imputação
from sklearn.impute import SimpleImputer
# Divisão em Conjunto de Teste e Treinamento
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import MissingIndicator

### Importação do Dataset

In [197]:
data = pd.read_csv('/home/vivas/Pesquisa/Datasets/Titanic/train.csv')
print(data.shape)
data.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [198]:
# verficar valores faltantes 
data.isnull().mean()

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.198653
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          0.771044
Embarked       0.002245
dtype: float64

###  Verificar Tipos

In [199]:
data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

## Dividir o dataset
Aplicamos sempre a imputação no conjunto de treinamento  

In [200]:
features=data.columns
print(features)
# remover a feature Survived pois ela é o target
features.drop('Survived')
print(features)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [201]:
X_train, X_test, y_train, y_test = train_test_split(data[features], # just the features
                                                    data['Survived'], # the target
                                                    test_size=0.3, # the percentage of obs in the test set
                                                    random_state=0) # for reproducibility
X_train.shape, X_test.shape

((623, 12), (268, 12))

In [202]:
type(X_train)

pandas.core.frame.DataFrame

# Usando o Missing Indicator

In [203]:
indicator = MissingIndicator(error_on_new=True, features='missing-only')
indicator.fit(X_train)  

MissingIndicator(error_on_new=True, features='missing-only', missing_values=nan,
                 sparse='auto')

### Índices das features onde serão aplicadas os indicadores

In [204]:
indicator.features_

array([ 5, 10, 11])

### Nome das features

In [205]:
X_train.columns[indicator.features_]

Index(['Age', 'Cabin', 'Embarked'], dtype='object')

### Matriz dos indicadores

In [206]:
indices=indicator.transform(X_train)

### Inserir indicadores no conjunto de teste e treinamento

In [207]:
indicator_cols = [i+'_VA' for i in X_train.columns[indicator.features_]]

# and now we concatenate
X_train = pd.concat([
    X_train.reset_index(),
    pd.DataFrame(indices, columns = indicator_cols)],
    axis=1)

X_train.head()

Unnamed: 0,index,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_VA,Cabin_VA,Embarked_VA
0,857,858,1,1,"Daly, Mr. Peter Denis",male,51.0,0,0,113055,26.55,E17,S,False,False,False
1,52,53,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49.0,1,0,PC 17572,76.7292,D33,C,False,False,False
2,386,387,0,3,"Goodwin, Master. Sidney Leonard",male,1.0,5,2,CA 2144,46.9,,S,False,True,False
3,124,125,0,1,"White, Mr. Percival Wayland",male,54.0,0,1,35281,77.2875,D26,S,False,False,False
4,578,579,0,3,"Caram, Mrs. Joseph (Maria Elias)",female,,1,0,2689,14.4583,,C,True,True,False


In [208]:
tmp = indicator.transform(X_test)
indicator_cols = [i+'_VA' for i in X_test.columns[indicator.features_]]

X_test = pd.concat([
    X_test.reset_index(),
    pd.DataFrame(tmp, columns = indicator_cols)],
    axis=1)

X_test.head()

Unnamed: 0,index,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_VA,Cabin_VA,Embarked_VA
0,495,496,0,3,"Yousseff, Mr. Gerious",male,,0,0,2627,14.4583,,C,True,True,False
1,648,649,0,3,"Willey, Mr. Edward",male,,0,0,S.O./P.P. 751,7.55,,S,True,True,False
2,278,279,0,3,"Rice, Master. Eric",male,7.0,4,1,382652,29.125,,Q,False,True,False
3,31,32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,PC 17569,146.5208,B78,C,True,False,False
4,255,256,1,3,"Touma, Mrs. Darwis (Hanne Youssef Razi)",female,29.0,0,2,2650,15.2458,,C,False,True,False


In [209]:
# Vamos definir as estratégias 
numericas = ['PassengerId','Pclass','Age','SibSp','Parch','Fare']
categoricas = ['Name','Sex','Cabin','Embarked','Ticket','Age_VA','Cabin_VA','Embarked_VA']

Imputer_numericas = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
])

Imputer_categoricas = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant',fill_value = 'Missing')),
])

# aplicação do ColumnTransformer

Processamento = ColumnTransformer(transformers=[
    ('Imputer_numericas', Imputer_numericas, numericas),
    ('Imputer_categoricas', Imputer_categoricas, categoricas)
])

# now we fit the preprocessor
Processamento.fit(X_train)

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('Imputer_numericas',
                                 Pipeline(memory=None,
                                          steps=[('imputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='mean',
                                                                verbose=0))],
                                          verbose=False),
                                 ['PassengerId', 'Pclass', 'Age', 'SibSp',
                                  'Parch', 'Fare']),
                                ('Imputer_cat

In [210]:
# Podemos olhar o valor imputado imputada
Processamento.transformers

[('Imputer_numericas', Pipeline(memory=None,
           steps=[('imputer',
                   SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                                 missing_values=nan, strategy='mean',
                                 verbose=0))],
           verbose=False), ['PassengerId',
   'Pclass',
   'Age',
   'SibSp',
   'Parch',
   'Fare']),
 ('Imputer_categoricas', Pipeline(memory=None,
           steps=[('imputer',
                   SimpleImputer(add_indicator=False, copy=True,
                                 fill_value='Missing', missing_values=nan,
                                 strategy='constant', verbose=0))],
           verbose=False), ['Name',
   'Sex',
   'Cabin',
   'Embarked',
   'Ticket',
   'Age_VA',
   'Cabin_VA',
   'Embarked_VA'])]

In [211]:
X_train

Unnamed: 0,index,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_VA,Cabin_VA,Embarked_VA
0,857,858,1,1,"Daly, Mr. Peter Denis",male,51.0,0,0,113055,26.5500,E17,S,False,False,False
1,52,53,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49.0,1,0,PC 17572,76.7292,D33,C,False,False,False
2,386,387,0,3,"Goodwin, Master. Sidney Leonard",male,1.0,5,2,CA 2144,46.9000,,S,False,True,False
3,124,125,0,1,"White, Mr. Percival Wayland",male,54.0,0,1,35281,77.2875,D26,S,False,False,False
4,578,579,0,3,"Caram, Mrs. Joseph (Maria Elias)",female,,1,0,2689,14.4583,,C,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
618,835,836,1,1,"Compton, Miss. Sara Rebecca",female,39.0,1,1,PC 17756,83.1583,E49,C,False,False,False
619,192,193,1,3,"Andersen-Jensen, Miss. Carla Christine Nielsine",female,19.0,1,0,350046,7.8542,,S,False,True,False
620,629,630,0,3,"O'Connell, Mr. Patrick D",male,,0,0,334912,7.7333,,Q,True,True,False
621,559,560,1,3,"de Messemaeker, Mrs. Guillaume Joseph (Emma)",female,36.0,1,0,345572,17.4000,,S,False,True,False


In [212]:
X_test

Unnamed: 0,index,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_VA,Cabin_VA,Embarked_VA
0,495,496,0,3,"Yousseff, Mr. Gerious",male,,0,0,2627,14.4583,,C,True,True,False
1,648,649,0,3,"Willey, Mr. Edward",male,,0,0,S.O./P.P. 751,7.5500,,S,True,True,False
2,278,279,0,3,"Rice, Master. Eric",male,7.0,4,1,382652,29.1250,,Q,False,True,False
3,31,32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,PC 17569,146.5208,B78,C,True,False,False
4,255,256,1,3,"Touma, Mrs. Darwis (Hanne Youssef Razi)",female,29.0,0,2,2650,15.2458,,C,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263,263,264,0,1,"Harrison, Mr. William",male,40.0,0,0,112059,0.0000,B94,S,False,False,False
264,718,719,0,3,"McEvoy, Mr. Michael",male,,0,0,36568,15.5000,,Q,True,True,False
265,620,621,0,3,"Yasbeck, Mr. Antoni",male,27.0,1,0,2659,14.4542,,C,False,True,False
266,786,787,1,3,"Sjoblom, Miss. Anna Sofia",female,18.0,0,0,3101265,7.4958,,S,False,True,False


### verificação dos valores

In [213]:
Processamento.named_transformers_['Imputer_numericas'].named_steps['imputer'].statistics_

array([4.47678973e+02, 2.28731942e+00, 2.99153386e+01, 5.31300161e-01,
       3.93258427e-01, 3.24582726e+01])

In [214]:
X_train["Age"].mean()

29.915338645418327

In [215]:
Processamento.named_transformers_['Imputer_categoricas'].named_steps['imputer'].statistics_

array(['Missing', 'Missing', 'Missing', 'Missing', 'Missing', 'Missing',
       'Missing', 'Missing'], dtype=object)

In [216]:
# Agora vamos imputar nos conjuntos de treinamento e teste

X_train = Processamento.transform(X_train)
X_test = Processamento.transform(X_test)

In [218]:
X_train

array([[858.0, 1.0, 51.0, ..., False, False, False],
       [53.0, 1.0, 49.0, ..., False, False, False],
       [387.0, 3.0, 1.0, ..., False, True, False],
       ...,
       [630.0, 3.0, 29.915338645418327, ..., True, True, False],
       [560.0, 3.0, 36.0, ..., False, True, False],
       [685.0, 2.0, 60.0, ..., False, True, False]], dtype=object)

In [219]:
X_test

array([[496.0, 3.0, 29.915338645418327, ..., True, True, False],
       [649.0, 3.0, 29.915338645418327, ..., True, True, False],
       [279.0, 3.0, 7.0, ..., False, True, False],
       ...,
       [621.0, 3.0, 27.0, ..., False, True, False],
       [787.0, 3.0, 18.0, ..., False, True, False],
       [65.0, 1.0, 29.915338645418327, ..., True, True, False]],
      dtype=object)