In [2]:
import pandas as pd

# Read the census CSV from the working directory and preview its first five rows.
# The text fields keep the leading space present in the file (e.g. ' State-gov').
base = pd.read_csv(filepath_or_buffer='censo.csv')
base.head(5)

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


#### Verificando os atributos que o Censo possui

In [7]:
# List the column labels of the loaded DataFrame (14 attributes plus 'income').
base.columns

Index(['age', 'workclass', 'final-weight', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loos', 'hour-per-week', 'native-country',
       'income'],
      dtype='object')

In [8]:
# Target: column 14 ('income'). Features: the 14 columns before it.
# Both are extracted as NumPy arrays so they can be edited column by column.
classe = base.iloc[:, 14].values
previsores = base.iloc[:, :14].values

In [9]:
# Inspect the raw target values; each label still carries a leading space (e.g. ' <=50K').
classe

array([' <=50K', ' <=50K', ' <=50K', ..., ' <=50K', ' <=50K', ' >50K'],
      dtype=object)

In [11]:
# Inspect the raw feature matrix; text columns are still strings at this point.
previsores

array([[39, ' State-gov', 77516, ..., 0, 40, ' United-States'],
       [50, ' Self-emp-not-inc', 83311, ..., 0, 13, ' United-States'],
       [38, ' Private', 215646, ..., 0, 40, ' United-States'],
       ...,
       [58, ' Private', 151910, ..., 0, 40, ' United-States'],
       [22, ' Private', 201490, ..., 0, 20, ' United-States'],
       [52, ' Self-emp-inc', 287927, ..., 0, 40, ' United-States']],
      dtype=object)

### Pré-processamento dos previsores - conversão dos dados em formato de texto para números

In [12]:
from sklearn.preprocessing import LabelEncoder
# Encoder used to turn the text columns of `previsores` into integer codes.
labelencoder_previsores = LabelEncoder()

In [13]:
# Integer-encode every text column of the feature matrix, in place.
# A separate LabelEncoder is fitted per column and stored in
# `labelencoders_por_coluna`, so each category <-> code mapping remains available
# (e.g. for `inverse_transform`) instead of being overwritten by the next fit.
colunas_categoricas = {
    1: 'workclass',
    3: 'education',
    5: 'marital-status',
    6: 'occupation',
    7: 'relationship',
    8: 'race',
    9: 'sex',
    13: 'native-country',
}
labelencoders_por_coluna = {}
for indice, nome in colunas_categoricas.items():
    # Rebinding `labelencoder_previsores` keeps the legacy name pointing at the
    # last fitted encoder (native-country), exactly as before.
    labelencoder_previsores = LabelEncoder()
    previsores[:, indice] = labelencoder_previsores.fit_transform(previsores[:, indice])
    labelencoders_por_coluna[nome] = labelencoder_previsores

### Resultado do pré-processamento com *LabelEncoder*

In [16]:
# Inspect the feature matrix after label encoding: text columns now hold integer codes.
previsores

array([[39, 7, 77516, ..., 0, 40, 39],
       [50, 6, 83311, ..., 0, 13, 39],
       [38, 4, 215646, ..., 0, 40, 39],
       ...,
       [58, 4, 151910, ..., 0, 40, 39],
       [22, 4, 201490, ..., 0, 20, 39],
       [52, 5, 287927, ..., 0, 40, 39]], dtype=object)

In [18]:
from sklearn.preprocessing import OneHotEncoder

# The `categorical_features` keyword is rejected by the scikit-learn release used
# in this notebook. The call is guarded so the error message is still shown as
# this cell's output, but "Restart & Run All" no longer halts here.
try:
    one_hot_encoder = OneHotEncoder(categorical_features=[1, 3, 5, 6, 7, 8, 9, 13])
except TypeError as erro:
    print('TypeError: {}'.format(erro))

TypeError: __init__() got an unexpected keyword argument 'categorical_features'

### Por conta dos problemas com o parâmetro *categorical_features*, que não é mais aceito pelo `OneHotEncoder`, será utilizada outra forma:
<a href="https://stackoverflow.com/questions/59476165/typeerror-init-got-an-unexpected-keyword-argument-categorical-features">Solução</a>

In [19]:
from sklearn.compose import ColumnTransformer

# One-hot encode the eight categorical columns (selected by position);
# all remaining columns are passed through without transformation.
columnTransformer = ColumnTransformer(
    transformers=[('OneHot', OneHotEncoder(), [1, 3, 5, 6, 7, 8, 9, 13])],
    remainder='passthrough',
)

In [22]:
# Fit the column transformer on the features and encode them, then convert the
# result to a dense array with `.toarray()` before storing it back in `previsores`.
previsores_codificados = columnTransformer.fit_transform(previsores)
previsores = previsores_codificados.toarray()

In [23]:
# Inspect the feature matrix after one-hot encoding (now a dense float array).
previsores

array([[0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.1740e+03, 0.0000e+00,
        4.0000e+01],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.3000e+01],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        4.0000e+01],
       ...,
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        4.0000e+01],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        2.0000e+01],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.5024e+04, 0.0000e+00,
        4.0000e+01]])

### Pré-processamento da classe

In [24]:
# Target values before encoding: still the original text labels.
classe

array([' <=50K', ' <=50K', ' <=50K', ..., ' <=50K', ' <=50K', ' >50K'],
      dtype=object)

In [25]:
# Encode the income labels as integers with an encoder dedicated to the target,
# kept in `labelencoder_classe` so the codes can be mapped back to labels later.
labelencoder_classe = LabelEncoder().fit(classe)
classe = labelencoder_classe.transform(classe)

In [26]:
# Target after encoding: integer codes (0 for ' <=50K', 1 for ' >50K' in the output below).
classe

array([0, 0, 0, ..., 0, 0, 1])

### Realizando o StandardScaler nos previsores

In [27]:
from sklearn.preprocessing import StandardScaler
# Scaler used to standardize the columns of `previsores`.
scaler = StandardScaler()

In [28]:
# Standardize every feature column, then display the scaled matrix.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split performed later in the notebook, so test rows influence the scaling
# statistics - confirm this is acceptable for the evaluation.
previsores = scaler.fit(previsores).transform(previsores)
previsores

array([[-0.2444502 , -0.17429511, -0.26209736, ...,  0.1484529 ,
        -0.21665953, -0.03542945],
       [-0.2444502 , -0.17429511, -0.26209736, ..., -0.14592048,
        -0.21665953, -2.22215312],
       [-0.2444502 , -0.17429511, -0.26209736, ..., -0.14592048,
        -0.21665953, -0.03542945],
       ...,
       [-0.2444502 , -0.17429511, -0.26209736, ..., -0.14592048,
        -0.21665953, -0.03542945],
       [-0.2444502 , -0.17429511, -0.26209736, ..., -0.14592048,
        -0.21665953, -1.65522476],
       [-0.2444502 , -0.17429511, -0.26209736, ...,  1.88842434,
        -0.21665953, -0.03542945]])

### Realizando a divisão de dados de treino e teste

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
(previsores_train, previsores_test,
 classe_train, classe_test) = train_test_split(
    previsores, classe, test_size=0.25, random_state=0
)

### Criando o Classificador da RF e declarando seus hiperparâmetros

In [33]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 10 trees using the entropy split criterion;
# the seed is fixed so repeated runs build the same forest.
classificador = RandomForestClassifier(
    criterion='entropy',
    n_estimators=10,
    random_state=42,
)

In [35]:
# Train the random forest on the training split; the fitted estimator is echoed as output.
classificador.fit(previsores_train, classe_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=42)

In [37]:
# Predict the class of every row in the held-out test split.
previsoes = classificador.predict(previsores_test)

In [39]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Compare the test-set predictions with the true labels.
# Note: despite its name, `precisao` stores the accuracy score, not the precision metric.
matriz = confusion_matrix(y_true=classe_test, y_pred=previsoes)
precisao = accuracy_score(y_true=classe_test, y_pred=previsoes)

In [40]:
# Accuracy on the test split (fraction of correctly classified rows).
precisao

0.8400687876182287

In [41]:
# Confusion matrix on the test split: rows are true classes, columns are predicted classes.
matriz

array([[5722,  437],
       [ 865, 1117]], dtype=int64)