<i>Técnica Utilizada para identificar os atributos mais importantes para passarmos ao algoritmo.</i>

# Importação de bibliotecas básicas

In [68]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split

# Seleção de atributos - melhores atributos para o algoritmo

In [3]:
base_census = pd.read_csv(r'C:\Users\brcalazans\Desktop\Pessoal\Machine Learning e Data Science\Bases de dados\census.csv')

In [4]:
base_census.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
colunas = base_census.columns[:-1]
colunas

Index(['age', 'workclass', 'final-weight', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loos', 'hour-per-week', 'native-country'],
      dtype='object')

In [7]:
X_census = base_census.iloc[:, 0:14].values
y_census = base_census.iloc[:, 14].values

In [8]:
X_census

array([[39, ' State-gov', 77516, ..., 0, 40, ' United-States'],
       [50, ' Self-emp-not-inc', 83311, ..., 0, 13, ' United-States'],
       [38, ' Private', 215646, ..., 0, 40, ' United-States'],
       ...,
       [58, ' Private', 151910, ..., 0, 40, ' United-States'],
       [22, ' Private', 201490, ..., 0, 20, ' United-States'],
       [52, ' Self-emp-inc', 287927, ..., 0, 40, ' United-States']],
      dtype=object)

In [9]:
y_census

array([' <=50K', ' <=50K', ' <=50K', ..., ' <=50K', ' <=50K', ' >50K'],
      dtype=object)

## Label Encoder

In [11]:
label_encoder_workclass    = LabelEncoder()
label_encoder_education    = LabelEncoder()
label_econder_marital      = LabelEncoder()
label_econder_occupation   = LabelEncoder()
label_econder_relationship = LabelEncoder()
label_econder_race         = LabelEncoder()
label_econder_sex          = LabelEncoder()
label_econder_country      = LabelEncoder()

X_census[:,1]  = label_encoder_workclass.fit_transform(X_census[:,1])
X_census[:,3]  = label_encoder_education.fit_transform(X_census[:,3])
X_census[:,5]  = label_econder_marital.fit_transform(X_census[:,5])
X_census[:,6]  = label_econder_occupation.fit_transform(X_census[:,6])
X_census[:,7]  = label_econder_relationship.fit_transform(X_census[:,7])
X_census[:,8]  = label_econder_race.fit_transform(X_census[:,8])
X_census[:,9]  = label_econder_sex.fit_transform(X_census[:,9])
X_census[:,13] = label_econder_country.fit_transform(X_census[:,13])

In [12]:
X_census

array([[39, 7, 77516, ..., 0, 40, 39],
       [50, 6, 83311, ..., 0, 13, 39],
       [38, 4, 215646, ..., 0, 40, 39],
       ...,
       [58, 4, 151910, ..., 0, 40, 39],
       [22, 4, 201490, ..., 0, 20, 39],
       [52, 5, 287927, ..., 0, 40, 39]], dtype=object)

## Normalização dos Dados

In [14]:
scaler = MinMaxScaler()
X_census_scaler = scaler.fit_transform(X_census)
X_census_scaler

array([[0.30136986, 0.875     , 0.0443019 , ..., 0.        , 0.39795918,
        0.95121951],
       [0.45205479, 0.75      , 0.0482376 , ..., 0.        , 0.12244898,
        0.95121951],
       [0.28767123, 0.5       , 0.13811345, ..., 0.        , 0.39795918,
        0.95121951],
       ...,
       [0.56164384, 0.5       , 0.09482688, ..., 0.        , 0.39795918,
        0.95121951],
       [0.06849315, 0.5       , 0.12849934, ..., 0.        , 0.19387755,
        0.95121951],
       [0.47945205, 0.625     , 0.18720338, ..., 0.        , 0.39795918,
        0.95121951]])

# Low Variance

In [49]:
for i in range(X_census.shape[1]):
    print(X_census_scaler[:, i].var())

0.034913808595952486
0.03312115190663569
0.005138537590667898
0.06657103564450892
0.029416385024073417
0.06301761677301636
0.09123816653931152
0.10326534394406342
0.04502805169292987
0.22136950173699113
0.00545419549240862
0.008557270623428908
0.015874043397822807
0.03641266114220053


In [50]:
selecao = VarianceThreshold(threshold=0.05)
X_census_variancia = selecao.fit_transform(X_census_scaler)
X_census_variancia.shape

(32561, 5)

In [51]:
X_census_variancia

array([[0.6       , 0.66666667, 0.07142857, 0.2       , 1.        ],
       [0.6       , 0.33333333, 0.28571429, 0.        , 1.        ],
       [0.73333333, 0.        , 0.42857143, 0.2       , 1.        ],
       ...,
       [0.73333333, 1.        , 0.07142857, 0.8       , 0.        ],
       [0.73333333, 0.66666667, 0.07142857, 0.6       , 1.        ],
       [0.73333333, 0.33333333, 0.28571429, 1.        , 0.        ]])

In [52]:
selecao.variances_

array([0.03491381, 0.03312115, 0.00513854, 0.06657104, 0.02941639,
       0.06301762, 0.09123817, 0.10326534, 0.04502805, 0.2213695 ,
       0.0054542 , 0.00855727, 0.01587404, 0.03641266])

In [53]:
indices = np.where(selecao.variances_ > 0.05)

In [54]:
indices

(array([3, 5, 6, 7, 9], dtype=int64),)

In [55]:
colunas[indices]

Index(['education', 'marital-status', 'occupation', 'relationship', 'sex'], dtype='object')

In [56]:
base_census_variancia = base_census.drop(columns=['age', 'workclass', 'final-weight',
                                                  'education-num', 'race', 'capital-gain',
                                                  'capital-loos', 'hour-per-week',
                                                  'native-country'], axis=1)

base_census_variancia.head()

Unnamed: 0,education,marital-status,occupation,relationship,sex,income
0,Bachelors,Never-married,Adm-clerical,Not-in-family,Male,<=50K
1,Bachelors,Married-civ-spouse,Exec-managerial,Husband,Male,<=50K
2,HS-grad,Divorced,Handlers-cleaners,Not-in-family,Male,<=50K
3,11th,Married-civ-spouse,Handlers-cleaners,Husband,Male,<=50K
4,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Female,<=50K


In [57]:
X_census_variancia = base_census_variancia.iloc[:, 0:5].values
y_census_variancia = base_census_variancia.iloc[:, 5].values

In [58]:
X_census_variancia

array([[' Bachelors', ' Never-married', ' Adm-clerical',
        ' Not-in-family', ' Male'],
       [' Bachelors', ' Married-civ-spouse', ' Exec-managerial',
        ' Husband', ' Male'],
       [' HS-grad', ' Divorced', ' Handlers-cleaners', ' Not-in-family',
        ' Male'],
       ...,
       [' HS-grad', ' Widowed', ' Adm-clerical', ' Unmarried', ' Female'],
       [' HS-grad', ' Never-married', ' Adm-clerical', ' Own-child',
        ' Male'],
       [' HS-grad', ' Married-civ-spouse', ' Exec-managerial', ' Wife',
        ' Female']], dtype=object)

In [59]:
y_census_variancia

array([' <=50K', ' <=50K', ' <=50K', ..., ' <=50K', ' <=50K', ' >50K'],
      dtype=object)

## Label Encoder

In [60]:
label_encoder_education    = LabelEncoder()
label_econder_marital      = LabelEncoder()
label_econder_occupation   = LabelEncoder()
label_econder_relationship = LabelEncoder()
label_econder_sex          = LabelEncoder()


X_census_variancia[:,0]  = label_encoder_education.fit_transform(X_census_variancia[:,0])
X_census_variancia[:,1]  = label_econder_marital.fit_transform(X_census_variancia[:,1])
X_census_variancia[:,2]  = label_econder_occupation.fit_transform(X_census_variancia[:,2])
X_census_variancia[:,3]  = label_econder_relationship.fit_transform(X_census_variancia[:,3])
X_census_variancia[:,4]  = label_econder_sex.fit_transform(X_census_variancia[:,4])

In [61]:
X_census_variancia

array([[9, 4, 1, 1, 1],
       [9, 2, 4, 0, 1],
       [11, 0, 6, 1, 1],
       ...,
       [11, 6, 1, 4, 0],
       [11, 4, 1, 3, 1],
       [11, 2, 4, 5, 0]], dtype=object)

## One Hot Encoder

In [62]:
onehotenconder = ColumnTransformer(transformers=[("OneHot", OneHotEncoder(), [0,1,2,3,4])], remainder='passthrough')
X_census_variancia = onehotenconder.fit_transform(X_census_variancia).toarray()
X_census_variancia

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 1., 0.]])

## Normalização dos dados

In [63]:
scaler = MinMaxScaler()
X_census_variancia = scaler.fit_transform(X_census_variancia)
X_census_variancia

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 1., 0.]])

## Divisão de base de dados de teste e treinamento

In [64]:
X_census_treinamento_var, X_census_teste_var, y_census_treinamento_var, y_census_teste_var = train_test_split(X_census_variancia, y_census_variancia, test_size=0.15, random_state=0)
X_census_treinamento_var.shape, X_census_teste_var.shape

((27676, 46), (4885, 46))

## Random Forest

In [65]:
random_forest_var = RandomForestClassifier(criterion='entropy', min_samples_leaf=1, min_samples_split=5, n_estimators=100)
random_forest_var.fit(X_census_treinamento_var, y_census_treinamento_var)

In [67]:
previsoes = random_forest_var.predict(X_census_teste_var)
accuracy_score(y_census_teste_var, previsoes)

0.8171954964176049

# Extra Tree

In [70]:
X_census_scaler.shape

(32561, 14)

In [71]:
selecao = ExtraTreesClassifier()
selecao.fit(X_census_scaler, y_census)

In [72]:
importancias = selecao.feature_importances_
importancias

array([0.15000434, 0.0451636 , 0.16457796, 0.03619204, 0.08831314,
       0.07640483, 0.07549833, 0.09132407, 0.01462286, 0.02913337,
       0.08962896, 0.02796685, 0.09357287, 0.01759678])

In [73]:
importancias.sum()

1.0000000000000002

In [78]:
indices = []
for i in range(len(importancias)):
    #print(i)
    if importancias[i] >= 0.029:
        indices.append(i)

In [79]:
indices

[0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 12]

In [81]:
colunas[indices]

Index(['age', 'workclass', 'final-weight', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'sex', 'capital-gain',
       'hour-per-week'],
      dtype='object')

In [82]:
X_census_extra = X_census[:, indices]
X_census_extra

array([[9, 4, 1, ..., 1, 2174, 40],
       [9, 2, 4, ..., 1, 0, 13],
       [11, 0, 6, ..., 1, 0, 40],
       ...,
       [11, 6, 1, ..., 0, 0, 40],
       [11, 4, 1, ..., 1, 0, 20],
       [11, 2, 4, ..., 0, 15024, 40]], dtype=object)

## One Hot Encoder

In [83]:
onehotenconder = ColumnTransformer(transformers=[("OneHot", OneHotEncoder(), [1,3,5,6,7])], remainder='passthrough')
X_census_extra = onehotenconder.fit_transform(X_census_extra).toarray()
X_census_extra

array([[0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.0000e+00, 2.1740e+03,
        4.0000e+01],
       [0.0000e+00, 0.0000e+00, 1.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        1.3000e+01],
       [1.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        4.0000e+01],
       ...,
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        4.0000e+01],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        2.0000e+01],
       [0.0000e+00, 0.0000e+00, 1.0000e+00, ..., 0.0000e+00, 1.5024e+04,
        4.0000e+01]])

In [84]:
X_census_extra.shape

(32561, 47)

## Divisão de base de dados de treinamento e teste

In [85]:
X_census_treinamento_extra, X_census_teste_extra, y_census_treinamento_extra, y_census_teste_extra = train_test_split(X_census_extra, y_census, test_size=0.15, random_state=0)
X_census_treinamento_extra.shape, X_census_teste_extra.shape

((27676, 47), (4885, 47))

## Random Forest

In [87]:
random_forest_extra = RandomForestClassifier(criterion='entropy', min_samples_leaf=1, min_samples_split=5, n_estimators=100)
random_forest_extra.fit(X_census_treinamento_extra, y_census_treinamento_extra)

In [88]:
previsoes = random_forest_extra.predict(X_census_teste_extra)
accuracy_score(y_census_teste_extra, previsoes)

0.8384851586489253