In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

Exploração dos dados

In [3]:
#Importando a base de dados
base_census = pd.read_csv('adult.data')

#Ela veio sem o nome das colunas, criei uma variavel com o nome das colunas
column_names = ['age', 'workclass', 'final-weight', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loos', 'hour-per-week', 'native-country', 'income']

#Atribui os nomes para as colunas utilizando o .columns
base_census.columns = column_names

display(base_census )


Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32555,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32556,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32557,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32558,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [None]:
#Para visualizar algumas estatisticas, analise inicial
base_census.describe()

In [None]:
#Verificar se existe valores faltantes
base_census.isnull().sum()

Visualização dos dados

In [None]:
np.unique(base_census['income'], return_counts=True)

In [None]:
#Gráfico do seaborn
sns.countplot(x = base_census['income']);

In [None]:
#Histogramas
plt.hist(x = base_census['age']);

In [None]:
#Histograma
plt.hist(x= base_census['education-num']);

In [None]:
#Histograma
plt.hist(x = base_census['hour-per-week']);

In [None]:
#Gráfico dinamico
grafico = px.treemap(base_census, path=['workclass', 'age'])
grafico.show()

In [None]:
#Gráfico dinâmico 2 - Treemap é bom para ver agrupamento de dados e relações 
grafico = px.treemap(base_census, path=['occupation', 'relationship', 'age'])
grafico.show()

In [None]:
#Grafico de categoria paralelas
grafico = px.parallel_categories(base_census, dimensions=['occupation', 'relationship'])
grafico.show()

In [None]:
#Grafico de categoria paralelas
grafico = px.parallel_categories(base_census, dimensions=['workclass', 'occupation', 'income'])
grafico.show()

In [None]:
#Grafico de categoria paralelas
grafico = px.parallel_categories(base_census, dimensions=['education', 'income'])
grafico.show()

Divisão entre previsores e classe

In [4]:
x_census = base_census.iloc[:, 0:14].values

x_census

array([[50, ' Self-emp-not-inc', 83311, ..., 0, 13, ' United-States'],
       [38, ' Private', 215646, ..., 0, 40, ' United-States'],
       [53, ' Private', 234721, ..., 0, 40, ' United-States'],
       ...,
       [58, ' Private', 151910, ..., 0, 40, ' United-States'],
       [22, ' Private', 201490, ..., 0, 20, ' United-States'],
       [52, ' Self-emp-inc', 287927, ..., 0, 40, ' United-States']],
      dtype=object)

In [5]:
y_census = base_census.iloc[:, 14].values
y_census

array([' <=50K', ' <=50K', ' <=50K', ..., ' <=50K', ' <=50K', ' >50K'],
      dtype=object)

Tratamento de atributos categóricos 

LabelEncoder

In [6]:
from sklearn.preprocessing import LabelEncoder

In [None]:
#Transformando strings em numeros - teste
label_encoder_teste = LabelEncoder()
teste = label_encoder_teste.fit_transform(x_census[:,1])

teste

In [10]:
# Variaveis com Label Encoder para todas as colunas string
label_encoder_workclass = LabelEncoder()
label_encoder_education = LabelEncoder()
label_encoder_marital = LabelEncoder()
label_encoder_occupation = LabelEncoder()
label_encoder_relationship = LabelEncoder()
label_encoder_race = LabelEncoder()
label_encoder_sex = LabelEncoder()
label_encoder_country = LabelEncoder()

In [12]:
#Label Encoder sendo aplicado em todas colunas strings
x_census[:, 1] = label_encoder_workclass.fit_transform(x_census[:, 1])
x_census[:, 3] = label_encoder_education.fit_transform(x_census[:, 3])
x_census[:, 5] = label_encoder_marital.fit_transform(x_census[:, 5])
x_census[:, 6] = label_encoder_occupation.fit_transform(x_census[:, 6])
x_census[:, 7] = label_encoder_relationship.fit_transform(x_census[:, 7])
x_census[:, 8] = label_encoder_race.fit_transform(x_census[:, 8])
x_census[:, 9] = label_encoder_sex.fit_transform(x_census[:, 9])
x_census[:, 13] = label_encoder_country.fit_transform(x_census[:, 13])
x_census 

array([[50, 6, 83311, ..., 0, 13, 39],
       [38, 4, 215646, ..., 0, 40, 39],
       [53, 4, 234721, ..., 0, 40, 39],
       ...,
       [58, 4, 151910, ..., 0, 40, 39],
       [22, 4, 201490, ..., 0, 20, 39],
       [52, 5, 287927, ..., 0, 40, 39]], dtype=object)

OneHotEncoder

In [14]:
""" Carro 

-------------------------------------------------------------
GOL  PALIO   UNO - Algoritmo vai dar prioridade para o Uno
1      2      3 
------------------------------------------------------------- 

-------------------------------------------------------------
Com o OneHotEncoder - Será codificado

GOL   1 0 0
Palio 0 1 0
Uno   0 0 1
-------------------------------------------------------------
"""

len(np.unique(base_census['workclass']))

9

In [16]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [18]:
#Criando a variavel com o OnehotEncoder
onehotencoder_census = ColumnTransformer(transformers=[('OneHot', OneHotEncoder(), [1, 3, 5, 6, 7, 8, 9, 13])], remainder='passthrough')

In [21]:
#Transformando a base census
x_census = onehotencoder_census.fit_transform(x_census).toarray()

In [22]:
x_census

array([[0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.3000e+01],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        4.0000e+01],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        4.0000e+01],
       ...,
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        4.0000e+01],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        2.0000e+01],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.5024e+04, 0.0000e+00,
        4.0000e+01]])

In [23]:
x_census.shape

(32560, 108)

Escanolamento de valores - deixar todos na mesma escala

In [25]:
from sklearn.preprocessing import StandardScaler #Para aplicar a padronização
scaler_census = StandardScaler()
x_census = scaler_census.fit_transform(x_census)
x_census[0]

array([-0.24445418, -0.17429787, -0.26210166, -0.01466404, -1.51686919,
       -0.18839232,  3.43712857, -0.20368294, -0.02074031, -0.17175596,
       -0.19348971, -0.11609376, -0.07201712, -0.10165112, -0.14227403,
       -0.12664692, -0.18406669, -0.21053771,  2.25420373, -0.11334564,
       -0.68995763, -0.23637774, -0.03960803, -0.13419763, -0.53715487,
       -0.39751513, -0.02658736,  1.08357985, -0.11403855, -0.69875106,
       -0.18028746, -0.17736093, -0.24494765, -0.36181344, -0.01662796,
       -0.37950184,  2.64723821, -0.17745303, -0.20958133, -0.25595851,
       -0.33554707, -0.06780269, -0.3816701 , -0.14261071, -0.35532224,
       -0.17128158, -0.22710722,  1.21160036, -0.58510536, -0.17625251,
       -0.42935363, -0.34403822, -0.22493044, -0.09820239, -0.18155482,
       -0.32577377, -0.09161305,  0.41302713, -0.70308748,  0.70308748,
       -0.13502538, -0.02416358, -0.06107436, -0.04804954, -0.04260668,
       -0.05409462, -0.04641669, -0.02933754, -0.05715034, -0.05