# Importação das Bibliotecas Básicas

In [32]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier 
from sklearn.decomposition import PCA, KernelPCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Exploração dos dados

In [2]:
base_census = pd.read_csv(r'C:\Users\brcalazans\Desktop\Pessoal\Machine Learning e Data Science\Bases de dados\census.csv')
base_census.shape

(32561, 15)

In [3]:
base_census.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


# Tratamento dos dados

## Pré-processamento

### Divisão entre atributos e classe

In [4]:
X_census = base_census.iloc[:, 0:14].values
X_census

array([[39, ' State-gov', 77516, ..., 0, 40, ' United-States'],
       [50, ' Self-emp-not-inc', 83311, ..., 0, 13, ' United-States'],
       [38, ' Private', 215646, ..., 0, 40, ' United-States'],
       ...,
       [58, ' Private', 151910, ..., 0, 40, ' United-States'],
       [22, ' Private', 201490, ..., 0, 20, ' United-States'],
       [52, ' Self-emp-inc', 287927, ..., 0, 40, ' United-States']],
      dtype=object)

In [5]:
y_census = base_census.iloc[:, 14].values
y_census

array([' <=50K', ' <=50K', ' <=50K', ..., ' <=50K', ' <=50K', ' >50K'],
      dtype=object)

### Label Encoder

In [6]:
label_encoder_workclass    = LabelEncoder()
label_encoder_education    = LabelEncoder()
label_econder_marital      = LabelEncoder()
label_econder_occupation   = LabelEncoder()
label_econder_relationship = LabelEncoder()
label_econder_race         = LabelEncoder()
label_econder_sex          = LabelEncoder()
label_econder_country      = LabelEncoder()

In [7]:
X_census[:,1]  = label_encoder_workclass.fit_transform(X_census[:,1])
X_census[:,3]  = label_encoder_education.fit_transform(X_census[:,3])
X_census[:,5]  = label_econder_marital.fit_transform(X_census[:,5])
X_census[:,6]  = label_econder_occupation.fit_transform(X_census[:,6])
X_census[:,7]  = label_econder_relationship.fit_transform(X_census[:,7])
X_census[:,8]  = label_econder_race.fit_transform(X_census[:,8])
X_census[:,9]  = label_econder_sex.fit_transform(X_census[:,9])
X_census[:,13] = label_econder_country.fit_transform(X_census[:,13])

In [8]:
X_census[0]

array([39, 7, 77516, 9, 13, 4, 1, 1, 4, 1, 2174, 0, 40, 39], dtype=object)

### Escalonamento dos dados

In [9]:
scaler_census = StandardScaler()
X_census = scaler_census.fit_transform(X_census)

In [10]:
X_census

array([[ 0.03067056,  2.15057856, -1.06361075, ..., -0.21665953,
        -0.03542945,  0.29156857],
       [ 0.83710898,  1.46373585, -1.008707  , ..., -0.21665953,
        -2.22215312,  0.29156857],
       [-0.04264203,  0.09005041,  0.2450785 , ..., -0.21665953,
        -0.03542945,  0.29156857],
       ...,
       [ 1.42360965,  0.09005041, -0.35877741, ..., -0.21665953,
        -0.03542945,  0.29156857],
       [-1.21564337,  0.09005041,  0.11095988, ..., -0.21665953,
        -1.65522476,  0.29156857],
       [ 0.98373415,  0.77689313,  0.92989258, ..., -0.21665953,
        -0.03542945,  0.29156857]])

### Divisão da base de Treinamento e Teste

In [11]:
X_census_treinamento, X_census_teste, y_census_treinamento, y_census_teste = train_test_split(X_census, y_census, test_size=0.15, random_state=0)
X_census_treinamento.shape, X_census_teste.shape

((27676, 14), (4885, 14))

# PCA(Principal component analysis)

In [12]:
pca = PCA(n_components=8)

In [13]:
X_census_treinamento_pca = pca.fit_transform(X_census_treinamento)
X_census_teste_pca = pca.transform(X_census_teste)

In [14]:
X_census_treinamento_pca.shape, X_census_teste_pca.shape

((27676, 8), (4885, 8))

In [15]:
X_census_treinamento_pca

array([[ 0.72756979, -0.63707837, -0.92959677, ...,  0.4750941 ,
         1.26767572, -0.70907366],
       [ 1.94861194,  0.52336888, -0.11546966, ...,  0.20608772,
        -0.28547192, -1.10996543],
       [ 0.90984576, -0.49631087,  0.21043617, ...,  0.22000605,
         1.54790866, -0.6082255 ],
       ...,
       [-1.55068874,  1.8706316 , -1.18829813, ..., -0.15217589,
        -1.23888631,  0.23631102],
       [ 1.50148337,  0.90027   ,  1.39911867, ...,  0.83827868,
        -0.03400375,  0.51140853],
       [ 0.39972037,  0.73809833, -0.78746921, ...,  0.39215363,
        -0.75329109, -0.00358243]])

In [16]:
pca.explained_variance_ratio_

array([0.151561  , 0.10109701, 0.08980379, 0.08076277, 0.07627678,
       0.07357646, 0.06772289, 0.06690789])

In [17]:
pca.explained_variance_ratio_.sum()

0.7077085943199353

## Random Forest

In [28]:
random_forest_census_pca = RandomForestClassifier(n_estimators=40, random_state=0, criterion='entropy')
random_forest_census_pca.fit(X_census_treinamento_pca, y_census_treinamento)

In [29]:
previsoes = random_forest_census_pca.predict(X_census_teste_pca)
previsoes

array([' <=50K', ' <=50K', ' <=50K', ..., ' <=50K', ' <=50K', ' >50K'],
      dtype=object)

In [30]:
y_census

array([' <=50K', ' <=50K', ' <=50K', ..., ' <=50K', ' <=50K', ' >50K'],
      dtype=object)

In [31]:
accuracy_score(y_census_teste, previsoes)

0.8372569089048106

# Kernel PCA

In [27]:
kpca = KernelPCA(n_components=1, kernel='rbf')
X_census_treinamento_kpca = kpca.fit_transform(X_census_treinamento)
X_census_teste_kpca = kpca.transform(X_census_teste)

MemoryError: Unable to allocate 5.71 GiB for an array with shape (27676, 27676) and data type float64

In [25]:
X_census_treinamento_kpca.shape, X_census_teste_kpca.shape

NameError: name 'X_census_treinamento_kpca' is not defined

In [26]:
X_census_treinamento_kpca

NameError: name 'X_census_treinamento_kpca' is not defined

## Random Forest

In [None]:
random_forest_census_kpca = RandomForestClassifier(n_estimators=40, criterion = 'entropy', random_state=0)
random_forest_census_kpca.fit(X_census_treinamento_kpca, y_census_treinamento)

In [None]:
previsoes = random_forest_census_kpca.predict(X_census_treinamento_kpca)
previsoes

In [None]:
accuracy_score(y_census_teste, previsoes)

# LDA (Linear Discriminant Analysis)

In [42]:
lda = LinearDiscriminantAnalysis(n_components=1)

In [43]:
X_census_treinamento_lda = lda.fit_transform(X_census_treinamento, y_census_treinamento)
X_census_teste_lda = lda.transform(X_census_teste)

In [44]:
X_census_treinamento_lda.shape, X_census_teste_lda.shape

((27676, 1), (4885, 1))

In [45]:
X_census_treinamento_lda

array([[-0.30226006],
       [ 0.64945145],
       [ 1.57912192],
       ...,
       [-1.37887999],
       [ 2.44698101],
       [-0.19212323]])

## Random Forest

In [47]:
random_forest_census_lda = RandomForestClassifier(n_estimators=40, criterion='entropy', random_state=0)
random_forest_census_lda.fit(X_census_treinamento_lda, y_census_treinamento)

In [48]:
previsoes = random_forest_census_lda.predict(X_census_teste_lda)
previsoes

array([' <=50K', ' >50K', ' <=50K', ..., ' >50K', ' >50K', ' >50K'],
      dtype=object)

In [49]:
accuracy_score(y_census_teste, previsoes)

0.7334698055271238