## Imputação do Valor Mais Frequente

Este notebook tem como objetivo apresentar o uso do classe sklearn.impute para imputação do valor mais frequente tanto para valores categóricos quanto numéricos. Vou utilizar como exemplo o Dataset do Titanic. 

### Importação das Bibliotecas

In [16]:
# trabalhar com dataframes
import pandas as pd
# trabalhar com arrays
import numpy as np
# gráficos
import matplotlib.pyplot as plt
# técnica de imputação
from sklearn.impute import SimpleImputer
# Divisão em Conjunto de Teste e Treinamento
from sklearn.model_selection import train_test_split

### Importação do Dataset

In [17]:
data = pd.read_csv('/home/vivas/Pesquisa/Datasets/Titanic/train.csv')
print(data.shape)
data.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [18]:
# verficar valores faltantes 
data.isnull().mean()

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.198653
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          0.771044
Embarked       0.002245
dtype: float64

###  Verificar Tipos

In [19]:
data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

## Dividir o dataset
Aplicamos sempre a imputação no conjunto de treinamento  

In [20]:
features=data.columns
print(features)
# remover a feature Survived pois ela é o target
features.drop('Survived')
print(features)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [21]:
X_train, X_test, y_train, y_test = train_test_split(data[features], # just the features
                                                    data['Survived'], # the target
                                                    test_size=0.3, # the percentage of obs in the test set
                                                    random_state=0) # for reproducibility
X_train.shape, X_test.shape

((623, 12), (268, 12))

In [22]:
type(X_train)

pandas.core.frame.DataFrame

In [23]:
# Vamos definir as estratégias 

# criando uma instância
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(X_train)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='most_frequent', verbose=0)

In [24]:
# Podemos olhar o valor imputado imputada
imputer.statistics_

array([1, 0, 3, 'Abbing, Mr. Anthony', 'male', 24.0, 0, 0, 'CA. 2343',
       13.0, 'B96 B98', 'S'], dtype=object)

In [25]:
# vamos verificar o dataset
X_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
857,858,1,1,"Daly, Mr. Peter Denis",male,51.0,0,0,113055,26.55,E17,S
52,53,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49.0,1,0,PC 17572,76.7292,D33,C
386,387,0,3,"Goodwin, Master. Sidney Leonard",male,1.0,5,2,CA 2144,46.9,,S
124,125,0,1,"White, Mr. Percival Wayland",male,54.0,0,1,35281,77.2875,D26,S
578,579,0,3,"Caram, Mrs. Joseph (Maria Elias)",female,,1,0,2689,14.4583,,C


In [26]:
# Agora vamos imputar nos conjuntos de treinamento e teste

X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

X_train

array([[858, 1, 1, ..., 26.55, 'E17', 'S'],
       [53, 1, 1, ..., 76.7292, 'D33', 'C'],
       [387, 0, 3, ..., 46.9, 'B96 B98', 'S'],
       ...,
       [630, 0, 3, ..., 7.7333, 'B96 B98', 'Q'],
       [560, 1, 3, ..., 17.4, 'B96 B98', 'S'],
       [685, 0, 2, ..., 39.0, 'B96 B98', 'S']], dtype=object)

In [27]:
# SimpleImputer retorna um array e precisamos então transformar para Dataframe

X_train=pd.DataFrame(X_train)
X_train.columns = features
X_train.head()
X_train.shape

(623, 12)

### Verificação dos valores do Dataset

In [28]:
data.isnull().mean()

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.198653
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          0.771044
Embarked       0.002245
dtype: float64

### Imputação do valor obtido

In [29]:
print("features : ", features) 
print(imputer.statistics_)


features :  Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
[1 0 3 'Abbing, Mr. Anthony' 'male' 24.0 0 0 'CA. 2343' 13.0 'B96 B98' 'S']


In [30]:
data["Age"].fillna(24, inplace=True)
data["Cabin"].fillna('B96 B98', inplace=True)
data["Embarked"].fillna('S', inplace=True)


In [31]:
data.isnull().mean()

PassengerId    0.0
Survived       0.0
Pclass         0.0
Name           0.0
Sex            0.0
Age            0.0
SibSp          0.0
Parch          0.0
Ticket         0.0
Fare           0.0
Cabin          0.0
Embarked       0.0
dtype: float64