In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from boruta import BorutaPy

### 1) Loading data

In [3]:
# Load the final table from the project's data folder. Forward slashes keep
# this relative path valid on Windows, Linux and macOS alike; the previous
# backslash-separated form only resolved on Windows.
df = pd.read_csv("../Dados/Tabela_final/tabela_final_com_cluster.csv")

In [4]:
# Print a per-column summary (name, non-null count, dtype) of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1170 entries, 0 to 1169
Data columns (total 64 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   IBGE7                1170 non-null   int64  
 1   UF                   1170 non-null   object 
 2   NOME                 1170 non-null   object 
 3   LATITUDE             1170 non-null   float64
 4   LONGITUDE            1170 non-null   float64
 5   DIST_EF_PUB          1170 non-null   float64
 6   DIST_EM_PUB          1170 non-null   float64
 7   IDEB_AI              1170 non-null   float64
 8   IDEB_AF              1170 non-null   float64
 9   DOCSUP_EF_PUB        1170 non-null   float64
 10  DOCSUP_EM_PUB        1170 non-null   float64
 11  TXNASC7C             1170 non-null   float64
 12  TXNBAIXOP            1170 non-null   float64
 13  PINTERSAP            1170 non-null   float64
 14  PINTERDRSAI          1170 non-null   float64
 15  REN_PIBPC_D          1170 non-null   f

In [5]:
# Split the frame into the target ("Labels") and everything else.
# X holds every remaining column; y is the "Labels" column on its own.
X, y = df.drop(columns=["Labels"]), df["Labels"]

In [6]:
# Names of the float64 columns of X, in their original order, leaving out the
# geographic coordinates. The two conditions are plain booleans, so they are
# combined with the logical `and` (short-circuiting) instead of bitwise `&`,
# and the dtypes are read once from X.dtypes rather than by indexing X per column.
lista_colunas = [
    coluna
    for coluna, tipo in X.dtypes.items()
    if tipo == "float64" and coluna not in ("LATITUDE", "LONGITUDE")
]

In [7]:
# Restrict X to the columns listed in lista_colunas (all rows kept).
X = X.loc[:, lista_colunas]

In [8]:
# Show how many columns remain in X, as a one-element tuple.
X.columns.shape

(58,)

### 2) Preprocessing

In [9]:
# Hold out 20% of the rows as a test partition. The split is seeded for
# repeatability and stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [10]:
# Learn the scaling statistics from the training partition only, then apply
# that same fitted scaler to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### 3) Feature Selection

#### 3.1) Boruta

In [11]:
# Base estimator handed to Boruta: a seeded random forest using all CPU cores.
rf_model = RandomForestClassifier(
    n_jobs=-1,
    random_state=42,
)

# Boruta wrapper around the forest; verbose=2 prints the per-iteration
# confirmed / tentative / rejected counts.
boruta = BorutaPy(
    rf_model,
    n_estimators='auto',
    verbose=2,
    random_state=42,
)

# Run the selection on the scaled training partition only.
boruta.fit(X_train_scaled, y_train)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	58
Rejected: 	0


Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	58
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	58
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	58
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	58
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	58
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	58
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	39
Tentative: 	10
Rejected: 	9
Iteration: 	9 / 100
Confirmed: 	39
Tentative: 	10
Rejected: 	9
Iteration: 	10 / 100
Confirmed: 	39
Tentative: 	10
Rejected: 	9
Iteration: 	11 / 100
Confirmed: 	39
Tentative: 	10
Rejected: 	9
Iteration: 	12 / 100
Confirmed: 	39
Tentative: 	10
Rejected: 	9
Iteration: 	13 / 100
Confirmed: 	39
Tentative: 	10
Rejected: 	9
Iteration: 	14 / 100
Confirmed: 	39
Tentative: 	9
Rejected: 	10
Iteration: 	15 / 100
Confirmed: 	39
Tentative: 	9
Rejected: 	10
Iteration: 	16 / 100
Confirmed: 	40
Tentative: 	8
Rejected: 	10
Iteration: 	17 / 100
Confirmed: 	40
Tentative: 	8
Reje

In [12]:
# Boolean mask from the fitted Boruta selector, as a plain Python list.
mask_colunas_selected = boruta.support_.tolist()

# Keep the columns of X flagged True in the mask (all rows retained).
# NOTE(review): assumes the mask order matches X's column order, i.e. that
# X_train_scaled was built from this same X — confirm when re-running cells.
df_selecionado = X.loc[:, mask_colunas_selected]

In [15]:
# List the names of the columns kept in df_selecionado.
df_selecionado.columns

Index(['DIST_EF_PUB', 'DIST_EM_PUB', 'IDEB_AI', 'IDEB_AF', 'DOCSUP_EF_PUB',
       'DOCSUP_EM_PUB', 'TXNASC7C', 'REN_PIBPC_D', 'PDEFAGUA', 'PDEFESGOTO',
       'PANALF15', 'PIND_POS', 'POP_TOT_30KM', 'IDHM', 'T_BANAGUA', 'T_DENS',
       'T_LUZ', 'AGUA_ESGOTO', 'PIND', 'PINDCRI', 'T_FBBAS', 'T_FBFUND',
       'T_FBPRE', 'T_FBSUPER', 'T_ATRASO_1_BASICO', 'T_ATRASO_1_FUND',
       'T_ATRASO_1_MED', 'T_ATRASO_2_BASICO', 'T_ATRASO_2_FUND',
       'T_ATRASO_2_MED', 'T_ANALF11A14', 'T_ANALF15A17', 'T_ANALF15M',
       'T_ANALF18A24', 'T_ANALF18M', 'T_ANALF25A29', 'T_ANALF25M',
       'DIST_CORPO_AGUA', 'TOT_ESC_POR_POP', 'PMATPUB_EF'],
      dtype='object')

## Random Forest importance:

In [14]:
# Write the selected columns to CSV without the index column. Forward slashes
# keep this relative path valid on Windows, Linux and macOS alike; the previous
# backslash-separated form only resolved on Windows.
df_selecionado.to_csv("../Dados/Tabela_final/df_selecionado_cluster.csv", index=False)