## Categoria Missing 

Este notebook tem como objetivo apresentar o uso do classe sklearn.impute para imputação da categoria missing .  Vou utilizar como exemplo o Dataset do Titanic. 

### Importação das Bibliotecas

In [17]:
# trabalhar com dataframes
import pandas as pd
# trabalhar com arrays
import numpy as np
# gráficos
import matplotlib.pyplot as plt
# técnica de imputação
from sklearn.impute import SimpleImputer
# Divisão em Conjunto de Teste e Treinamento
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

### Importação do Dataset

In [18]:
data = pd.read_csv('/home/vivas/Pesquisa/Datasets/Titanic/train.csv')
print(data.shape)
data.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [19]:
# verficar valores faltantes 
data.isnull().mean()

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.198653
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          0.771044
Embarked       0.002245
dtype: float64

###  Verificar Tipos

In [20]:
data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

## Dividir o dataset
Aplicamos sempre a imputação no conjunto de treinamento  

In [21]:
features=data.columns
print(features)
# remover a feature Survived pois ela é o target
features.drop('Survived')
print(features)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [22]:
X_train, X_test, y_train, y_test = train_test_split(data[features], # just the features
                                                    data['Survived'], # the target
                                                    test_size=0.3, # the percentage of obs in the test set
                                                    random_state=0) # for reproducibility
X_train.shape, X_test.shape

((623, 12), (268, 12))

In [23]:
type(X_train)

pandas.core.frame.DataFrame

In [24]:
# Vamos definir as estratégias 

numericas = ['Age']
categoricas = ['Cabin', 'Embarked']

Imputer_numericas = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
])

Imputer_categoricas = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant',fill_value = 'Missing')),
])

# aplicação do ColumnTransformer

Processamento = ColumnTransformer(transformers=[
    ('Imputer_numericas', Imputer_numericas, numericas),
    ('Imputer_categoricas', Imputer_categoricas, categoricas)
])

# now we fit the preprocessor
Processamento.fit(X_train)

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('Imputer_numericas',
                                 Pipeline(memory=None,
                                          steps=[('imputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='mean',
                                                                verbose=0))],
                                          verbose=False),
                                 ['Age']),
                                ('Imputer_categoricas',
                                 Pipeline(memory=None,
                   

In [25]:
# Podemos olhar o valor imputado imputada
Processamento.transformers

[('Imputer_numericas', Pipeline(memory=None,
           steps=[('imputer',
                   SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                                 missing_values=nan, strategy='mean',
                                 verbose=0))],
           verbose=False), ['Age']),
 ('Imputer_categoricas', Pipeline(memory=None,
           steps=[('imputer',
                   SimpleImputer(add_indicator=False, copy=True,
                                 fill_value='Missing', missing_values=nan,
                                 strategy='constant', verbose=0))],
           verbose=False), ['Cabin', 'Embarked'])]

### verificação dos valores

In [26]:
Processamento.named_transformers_['Imputer_numericas'].named_steps['imputer'].statistics_

array([29.91533865])

In [27]:
X_train["Age"].mean()

29.915338645418327

In [28]:
Processamento.named_transformers_['Imputer_categoricas'].named_steps['imputer'].statistics_

array(['Missing', 'Missing'], dtype=object)

In [29]:
# Agora vamos imputar nos conjuntos de treinamento e teste

X_train = Processamento.transform(X_train)
X_test = Processamento.transform(X_test)

### Verificação dos valores do Dataset

In [30]:
data.isnull().mean()

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.198653
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          0.771044
Embarked       0.002245
dtype: float64

### Imputação do valor obtido

In [31]:
data["Age"].fillna(29.91533865, inplace=True)
data["Cabin"].fillna('Missing', inplace=True)
data["Embarked"].fillna('Missing', inplace=True)


In [32]:
data.isnull().mean()

PassengerId    0.0
Survived       0.0
Pclass         0.0
Name           0.0
Sex            0.0
Age            0.0
SibSp          0.0
Parch          0.0
Ticket         0.0
Fare           0.0
Cabin          0.0
Embarked       0.0
dtype: float64