# **PONTIFÍCIA UNIVERSIDADE CATÓLICA DE MINAS GERAIS - NÚCLEO DE EDUCAÇÃO A DISTÂNCIA**

### Pós-graduação Lato Sensu em Ciência de Dados e Big Data


### **6. Pré-Processamento - Aplicação do PCA**

### Carrega as bibliotecas

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from sklearn.decomposition import PCA
# Scaling variables
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer

# Display settings only (no effect on the data):
# NumPy arrays print with 4 decimals and without scientific notation.
np.set_printoptions(suppress=True, precision=4)
# pandas renders every float with exactly six decimal places.
pd.set_option('display.float_format', lambda valor: '%.6f' % valor)

### Carrega os dados

In [None]:
# Load the input CSV into df_rais. The path is absolute under /content
# (presumably a Colab session -- adjust when running elsewhere).
df_rais = pd.read_csv('/content/BD_Rais_2019_CE_03.csv')
# Print column names, non-null counts and dtypes as a quick sanity check.
df_rais.info()

In [None]:
## Drop the fields that will not be used in the PCA
df_rais = df_rais.drop(
    [
        'id_municipio',
        'natureza_juridica',
        'tamanho_estabelecimento',
        'cnae_2',
        'cnae_2_subclasse',
        'subsetor_ibge',
        'cep_estabelecimento',
        'cnae_2_secao',
    ],
    axis='columns',
)

# Re-inspect the remaining columns and their dtypes
df_rais.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94480 entries, 0 to 94479
Data columns (total 55 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   qtde_vinculos_ativos        94480 non-null  int64
 1   qtde_vinculos_clt           94480 non-null  int64
 2   qtde_vinculos_estatutarios  94480 non-null  int64
 3   indicador_simples           94480 non-null  int64
 4   grupo_nj_1                  94480 non-null  int64
 5   grupo_nj_2                  94480 non-null  int64
 6   grupo_nj_3                  94480 non-null  int64
 7   grupo_nj_4                  94480 non-null  int64
 8   grupo_nj_5                  94480 non-null  int64
 9   cnae_2_secao_A              94480 non-null  int64
 10  cnae_2_secao_B              94480 non-null  int64
 11  cnae_2_secao_C              94480 non-null  int64
 12  cnae_2_secao_D              94480 non-null  int64
 13  cnae_2_secao_E              94480 non-null  int64
 14  cnae_2

In [None]:
## Apply a power transform to the three qtde_vinculos_* columns before the PCA.
## Each column is handled on its own, with a freshly fitted PowerTransformer.
for coluna in ('qtde_vinculos_ativos', 'qtde_vinculos_clt', 'qtde_vinculos_estatutarios'):
    df_rais[[coluna]] = PowerTransformer().fit_transform(df_rais[[coluna]])


In [None]:
# Fit a PCA that keeps the first 5 principal components of df_rais.
# random_state is pinned because, for an input of this size, svd_solver='auto'
# may select the randomized SVD solver, whose result otherwise varies between
# runs; a fixed seed keeps the components and projections reproducible.
pca = PCA(n_components=5, random_state=42)
pca.fit(df_rais)

PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [None]:
# Project every row of df_rais onto the fitted principal components.
pca2 = pca.transform(df_rais)
# Show the shape of the projected array, then the array itself as the cell output.
display(pca2.shape)
pca2

(94480, 5)

array([[-0.2966,  0.023 ,  0.7415,  0.1834, -0.5089],
       [-1.0171,  0.1591,  0.7786,  0.7019, -0.265 ],
       [-1.0121,  0.1605,  1.0984,  1.0624,  0.7618],
       ...,
       [-1.0813,  0.0161,  0.6309, -0.6065, -0.1141],
       [-1.0813,  0.0161,  0.6309, -0.6065, -0.1141],
       [-1.0813,  0.0161,  0.6309, -0.6065, -0.1141]])

In [None]:
# Loadings table: pca.components_ is transposed so that each row is labelled
# with an original df_rais column and each column is one principal component.
colunas = df_rais.columns
df_pca_components = pd.DataFrame(data=np.transpose(pca.components_), index=colunas)

In [None]:
# One single-column frame per principal component; the column label is the
# component index (0..4).
df_pca_0 = df_pca_components[[0]]
df_pca_1 = df_pca_components[[1]]
df_pca_2 = df_pca_components[[2]]
df_pca_3 = df_pca_components[[3]]
df_pca_4 = df_pca_components[[4]]

# For each component, print the five features with the largest absolute
# loading (sorted by |value|, signs preserved in the output).
for componente, cargas in enumerate((df_pca_0, df_pca_1, df_pca_2, df_pca_3, df_pca_4)):
    ordenado = cargas.sort_values(by=componente, key=lambda serie: serie.abs(), ascending=False)
    print(ordenado.head())

                                   0
qtde_vinculos_ativos        0.705983
qtde_vinculos_clt           0.698246
indicador_simples          -0.066233
qtde_vinculos_estatutarios  0.062841
sub_ibge_16                -0.050553
                                   1
qtde_vinculos_estatutarios  0.969794
qtde_vinculos_clt          -0.157632
grupo_nj_2                 -0.081644
grupo_nj_1                  0.075113
cnae_2_secao_O              0.070893
                          2
cnae_2_secao_G    -0.640698
sub_ibge_16       -0.628093
sub_ibge_19        0.199359
indicador_simples -0.161572
sub_ibge_21        0.151648
                          3
indicador_simples -0.812743
grupo_nj_2        -0.250152
grupo_nj_3         0.207792
sub_ibge_19        0.198757
sub_ibge_21       -0.192371
                          4
sub_ibge_19        0.590038
cnae_2_secao_N     0.423343
indicador_simples  0.417904
grupo_nj_3         0.241880
grupo_nj_2        -0.215397


In [None]:
# Collect the fitted PCA's summary statistics, one single-column frame per metric.
n_pca_comp = pca.n_components_
df_pca_var_ratio = pd.DataFrame({'variance_ratio': pca.explained_variance_ratio_})
df_pca_exp_var = pd.DataFrame({'explained_variance': pca.explained_variance_})
df_pca_sgl_values = pd.DataFrame({'singular_values': pca.singular_values_})

In [None]:
# Display up to the first 10 rows of the explained-variance-ratio table.
df_pca_var_ratio.head(10)

Unnamed: 0,variance_ratio
0,0.398421
1,0.210034
2,0.106728
3,0.051083
4,0.034821


In [None]:
# Display up to the first 10 rows of the explained-variance table.
df_pca_exp_var.head(10)

Unnamed: 0,explained_variance
0,1.98916
1,1.048618
2,0.53285
3,0.255039
4,0.173848


In [None]:
# Display up to the first 10 rows of the singular-values table.
df_pca_sgl_values.head(10)

Unnamed: 0,singular_values
0,433.513382
1,314.757616
2,224.372779
3,155.228322
4,128.159861


In [None]:
# Per-row score from the fitted PCA; scikit-learn documents this as the
# log-likelihood of each sample under the fitted model.
pca.score_samples(df_rais)

array([ -5.055 , -42.9732, -25.6926, ...,  -2.7784,  -2.7784,  -2.7784])

In [None]:
# Reuse the PCA that is already fitted instead of calling fit_transform, which
# would fit the model a second time on the same data. The extra fit is
# redundant and, when a randomized solver runs without a fixed random_state,
# could leave this projection slightly out of step with the components and
# variance figures already reported.
lower_dimension_data = pca.transform(df_rais)
# Cell output: (n_rows, n_components) of the projected data.
lower_dimension_data.shape

(94480, 5)

In [None]:
# Display the projected (reduced-dimension) array as the cell output.
lower_dimension_data

array([[-0.2966,  0.023 ,  0.7415,  0.1834, -0.5089],
       [-1.0171,  0.1591,  0.7786,  0.7019, -0.265 ],
       [-1.0121,  0.1605,  1.0984,  1.0624,  0.7618],
       ...,
       [-1.0813,  0.0161,  0.6309, -0.6065, -0.1141],
       [-1.0813,  0.0161,  0.6309, -0.6065, -0.1141],
       [-1.0813,  0.0161,  0.6309, -0.6065, -0.1141]])