## Pré-processamento dos dados

### Bibliotecas

In [1]:
#Bibliotecas
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

# Notebook-wide presentation settings.
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Remove pandas' display limits so tables are never truncated by row or column count.
for opcao_exibicao in ('display.max_rows', 'display.max_columns'):
    pd.set_option(opcao_exibicao, None)

def estilo_tabelas(df, max_altura='300px', casas_decimais=3):
    return (
        df.style.set_table_styles(
            [
                {'selector': 'thead th', 'props': [('font-size', '12px'), ('text-align', 'center'), ('border-bottom', '2px solid #007BFF')]},  # Azul abaixo do nome das colunas
                {'selector': 'td', 'props': [('font-size', '10px'), ('text-align', 'center'), ('max-height', '40px'), ('white-space', 'nowrap'), ('text-overflow', 'ellipsis'), ('overflow', 'hidden'), ('max-width', '100px')]},
                {'selector': 'tr:nth-child(even)', 'props': [('background-color', '#f9f9f9')]},  # Fundo alternado
                {'selector': 'tr:nth-child(odd)', 'props': [('background-color', '#ffffff')]},
                {'selector': 'table', 'props': [('width', '90%'), ('margin-left', 'auto'), ('margin-right', 'auto'), ('border-collapse', 'collapse')]},
                {'selector': 'td, th', 'props': [('border', '1px solid #666')]},  # Bordas cinza escuro
            ]
        ).set_properties(
            **{'background-color': '#f4f4f4', 'border-color': 'darkgray', 'border-style': 'solid', 'border-width': '1px'}
        ).set_table_attributes(
            f'style="height:auto; overflow:auto; max-height:{max_altura}; display:block;"'  
        ).format(
            precision=casas_decimais  
        )
    )

### Base de dados

In [2]:
# Columns that read_csv must load with the 'object' dtype instead of inferring one.
object_columns = [
    'Protocolo_S2iD', 'Nome_Municipio', 'Sigla_UF', 'regiao',
    'Setores Censitários', 'Status',
    'DH_Descricao', 'DM_Descricao', 'DA_Descricao',
    'DA_Polui/cont da água', 'DA_Polui/cont do ar', 'DA_Polui/cont do solo',
    'DA_Dimi/exauri hídrico', "DA_Incêndi parques/APA's/APP's",
    'PEPL_Descricao', 'PEPR_Descricao',
    'Categoria', 'Grupo', 'Subgrupo', 'Tipo', 'Subtipo',
]

# Column name -> dtype mapping handed to read_csv.
dtype = dict.fromkeys(object_columns, 'object')

# Parse with ';' as the field separator and ',' as the decimal mark.
df_eventos = pd.read_csv(
    "https://raw.githubusercontent.com/brunagmoura/PrevisorReconhecimento/refs/heads/main/df_eventos_desastres_rec_nrec.csv",
    dtype=dtype,
    sep=';',
    decimal=',',
)

# Show the first five rows with the notebook's table styling.
estilo_tabelas(df_eventos.head(5))

Unnamed: 0,Protocolo_S2iD,Nome_Municipio,Sigla_UF,regiao,Data_Registro,Data_Evento,codigo_ibge,Setores Censitários,Status,DH_Descricao,DH_MORTOS,DH_FERIDOS,DH_ENFERMOS,DH_DESABRIGADOS,DH_DESALOJADOS,DH_DESAPARECIDOS,DH_OUTROS AFETADOS,DH_total_danos_humanos,DM_Descricao,DM_Uni Habita Danificadas,DM_Uni Habita Destruidas,DM_Uni Habita Valor,DM_Inst Saúde Danificadas,DM_Inst Saúde Destruidas,DM_Inst Saúde Valor,DM_Inst Ensino Danificadas,DM_Inst Ensino Destruidas,DM_Inst Ensino Valor,DM_Inst Serviços Danificadas,DM_Inst Serviços Destruidas,DM_Inst Serviços Valor,DM_Inst Comuni Danificadas,DM_Inst Comuni Destruidas,DM_Inst Comuni Valor,DM_Obras de Infra Danificadas,DM_Obras de Infra Destruidas,DM_Obras de Infra Valor,DM_total_danos_materiais,DA_Descricao,DA_Polui/cont da água,DA_Polui/cont do ar,DA_Polui/cont do solo,DA_Dimi/exauri hídrico,DA_Incêndi parques/APA's/APP's,PEPL_Descricao,PEPL_Assis_méd e emergên(R$),PEPL_Abast de água pot(R$),PEPL_sist de esgotos sanit(R$),PEPL_Sis limp e rec lixo (R$),PEPL_Sis cont pragas (R$),PEPL_distrib energia (R$),PEPL_Telecomunicações (R$),PEPL_Tran loc/reg/l_curso (R$),PEPL_Distrib combustíveis(R$),PEPL_Segurança pública (R$),PEPL_Ensino (R$),PEPL_total_publico,PEPR_Descricao,PEPR_Agricultura (R$),PEPR_Pecuária (R$),PEPR_Indústria (R$),PEPR_Comércio (R$),PEPR_Serviços (R$),PEPR_total_privado,PE_PLePR,Ano_Evento,Empenhado,DensidadePop,Area,Município - UF,PIB,DOMICILIO_AREARURAL,PDEFAGUA,PDEFESGOTO,PDEFLIXO,PDEFSAN,QTDE_FAMILIAS_ATUALIZADAS,Categoria,Grupo,Subgrupo,Tipo,Subtipo,COBRADE,Pop
0,SP-A-3550001-12200-20100101,São Luiz do Paraitinga,SP,Sudeste,2010-01-01,2010-01-01,3550001,,Reconhecido,,0,0,0,93,4030,0,16,4139,,49,20,20643486.932,3.0,0,1121928.638,9.0,1,5235666.976,0,0,0.0,3,3,32411.272,25.0,0.0,65321178.458,92354672.275,,,,,,,,7978.159,673157.183,623293.688,2493.175,0.0,330993.88,3739.762,58609.552,0.0,0.0,1087024.191,2787289.589,,1642902.433,1149353.56,379785.31,0.0,24433.113,3196474.416,5983764.005,2010,,,,São Luiz do Paraitinga - SP,86448.0,,,,,,,Natural,Hidrológico,Enxurradas,,,12200,10397
1,SP-A-3518305-11321-20100101,Guararema,SP,Sudeste,2010-01-01,2010-01-01,3518305,,Reconhecido,,4,2,0,66,417,0,14430,14919,,161,91,18604069.987,0.0,0,0.0,2.0,0,24931.748,0,0,0.0,0,0,37397.621,9000.0,0.0,57771845.314,76438244.669,,,,,,,,0.0,179508.582,264276.524,12465.874,0.0,349044.465,74795.243,134631.437,0.0,0.0,77288.417,1092010.541,,441291.931,2049389.645,0.0,0.0,112192.889,2602874.464,3694885.005,2010,,,,Guararema - SP,1111405.0,,,,,,,Natural,Geológico,Movimento de massa,Deslizamentos,Deslizamentos de solo e ou rocha,11321,25844
2,BA-A-2905909-14110-20100101,Campo Alegre de Lourdes,BA,Nordeste,2010-01-01,2010-01-01,2905909,,Reconhecido,,0,0,0,0,0,0,0,0,,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,,,,,,,,747.952,448771.455,0.0,0.0,0.0,0.0,0.0,249317.475,0.0,0.0,177015.407,875852.29,,254303.825,147097.31,0.0,0.0,0.0,401401.135,1277253.425,2010,,,2914.587,Campo Alegre de Lourdes - BA,107459.0,,,,,,,Natural,Climatológico,Seca,Estiagem,,14110,28090
3,SP-A-3513603-12100-20100101,Cunha,SP,Sudeste,2010-01-01,2010-01-01,3513603,,Reconhecido,,6,0,0,38,492,0,12000,12536,,89,9,3340854.165,0.0,0,0.0,0.0,0,0.0,0,0,0.0,0,0,0.0,3.0,400.0,8052097.614,11392951.779,,,,,,,,0.0,1246.587,0.0,0.0,0.0,1994539.8,0.0,125656.007,0.0,0.0,0.0,2121442.395,,49863.495,1383711.986,0.0,0.0,925466.467,2359041.949,4480484.344,2010,,,1407.25,Cunha - SP,112225.0,,,,,,,Natural,Hidrológico,Inundações,,,12100,21866
4,BA-P-2917334-12200-20100101,Iuiu,BA,Nordeste,2010-01-01,2010-01-01,2917334,,Reconhecido,,0,0,0,0,0,0,0,0,,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2010,,,1525.142,Iuiu - BA,50315.0,,,,,,,Natural,Hidrológico,Enxurradas,,,12200,10900


### Exclusão de variáveis

In [3]:
# Remove variables that will not be fed to the model.
colunas_fora_do_modelo = [
    'Protocolo_S2iD',
    'Nome_Municipio',
    'Data_Registro',
    'Data_Evento',
    'codigo_ibge',
]
df_eventos = df_eventos.drop(columns=colunas_fora_do_modelo)

In [4]:
# Keep only the columns with less than 5% missing values
# (a column at exactly 5% or above is dropped).

# One row per variable with its count of missing entries.
df_na_counts = df_eventos.isna().sum().reset_index()
df_na_counts.columns = ['Variável', 'Qtde. dados ausentes']

# Express the count as a percentage of the number of rows.
df_na_counts['% de dados ausentes'] = df_na_counts['Qtde. dados ausentes'] / len(df_eventos) * 100

abaixo_do_limite = df_na_counts['% de dados ausentes'] < 5
colunas_mantidas = df_na_counts.loc[abaixo_do_limite, 'Variável']

df_eventos = df_eventos[colunas_mantidas]
print(df_eventos.columns)

Index(['Sigla_UF', 'regiao', 'Status', 'DH_MORTOS', 'DH_FERIDOS',
       'DH_ENFERMOS', 'DH_DESABRIGADOS', 'DH_DESALOJADOS', 'DH_DESAPARECIDOS',
       'DH_OUTROS AFETADOS', 'DH_total_danos_humanos',
       'DM_Uni Habita Danificadas', 'DM_Uni Habita Destruidas',
       'DM_Uni Habita Valor', 'DM_Inst Saúde Danificadas',
       'DM_Inst Saúde Destruidas', 'DM_Inst Saúde Valor',
       'DM_Inst Ensino Danificadas', 'DM_Inst Ensino Destruidas',
       'DM_Inst Ensino Valor', 'DM_Inst Serviços Danificadas',
       'DM_Inst Serviços Destruidas', 'DM_Inst Serviços Valor',
       'DM_Inst Comuni Danificadas', 'DM_Inst Comuni Destruidas',
       'DM_Inst Comuni Valor', 'DM_Obras de Infra Danificadas',
       'DM_Obras de Infra Destruidas', 'DM_Obras de Infra Valor',
       'DM_total_danos_materiais', 'PEPL_Assis_méd e emergên(R$)',
       'PEPL_Abast de água pot(R$)', 'PEPL_sist de esgotos sanit(R$)',
       'PEPL_Sis limp e rec lixo (R$)', 'PEPL_Sis cont pragas (R$)',
       'PEPL_distrib en

### Encoding das variáveis categóricas

In [5]:
# One-hot encode the categorical event descriptors.
# drop=None keeps an indicator column for every category level; missing
# Subgrupo values end up in their own `Subgrupo_nan` column.
encoder = OneHotEncoder(sparse_output=False, drop=None)
categorias = ['Subgrupo', 'Categoria', 'Grupo']
one_hot_encoded = encoder.fit_transform(df_eventos[categorias])

# Reuse df_eventos' index: pd.concat(axis=1) aligns rows by index label, so a
# freshly numbered one_hot_df would be paired with the wrong events (and padded
# with NaN rows) whenever df_eventos does not carry a default 0..n-1 index.
one_hot_df = pd.DataFrame(
    one_hot_encoded,
    columns=encoder.get_feature_names_out(categorias),
    index=df_eventos.index,
)

# Swap the original categorical columns for their indicator columns.
df_eventos = pd.concat([df_eventos.drop(columns=categorias), one_hot_df], axis=1)

df_eventos.head(3)

Unnamed: 0,Sigla_UF,regiao,Status,DH_MORTOS,DH_FERIDOS,DH_ENFERMOS,DH_DESABRIGADOS,DH_DESALOJADOS,DH_DESAPARECIDOS,DH_OUTROS AFETADOS,DH_total_danos_humanos,DM_Uni Habita Danificadas,DM_Uni Habita Destruidas,DM_Uni Habita Valor,DM_Inst Saúde Danificadas,DM_Inst Saúde Destruidas,DM_Inst Saúde Valor,DM_Inst Ensino Danificadas,DM_Inst Ensino Destruidas,DM_Inst Ensino Valor,DM_Inst Serviços Danificadas,DM_Inst Serviços Destruidas,DM_Inst Serviços Valor,DM_Inst Comuni Danificadas,DM_Inst Comuni Destruidas,DM_Inst Comuni Valor,DM_Obras de Infra Danificadas,DM_Obras de Infra Destruidas,DM_Obras de Infra Valor,DM_total_danos_materiais,PEPL_Assis_méd e emergên(R$),PEPL_Abast de água pot(R$),PEPL_sist de esgotos sanit(R$),PEPL_Sis limp e rec lixo (R$),PEPL_Sis cont pragas (R$),PEPL_distrib energia (R$),PEPL_Telecomunicações (R$),PEPL_Tran loc/reg/l_curso (R$),PEPL_Distrib combustíveis(R$),PEPL_Segurança pública (R$),PEPL_Ensino (R$),PEPL_total_publico,PEPR_Agricultura (R$),PEPR_Pecuária (R$),PEPR_Indústria (R$),PEPR_Comércio (R$),PEPR_Serviços (R$),PEPR_total_privado,PE_PLePR,Ano_Evento,COBRADE,Subgrupo_Alagamentos,Subgrupo_Colapso de edificações,Subgrupo_Desastres relacionados à contaminação da água,Subgrupo_Enxurradas,Subgrupo_Epidemias,Subgrupo_Erosão,Subgrupo_Incêndios urbanos,Subgrupo_Infestações/Pragas,Subgrupo_Inundações,Subgrupo_Movimento de massa,Subgrupo_Rompimento/colapso de barragens,Subgrupo_Seca,Subgrupo_Sistemas de Grande Escala/Escala Regional,Subgrupo_Temperaturas Extremas,Subgrupo_Tempestades,Subgrupo_Terremoto,Subgrupo_Transporte aquaviário,Subgrupo_Transporte rodoviário,Subgrupo_nan,Categoria_Natural,Categoria_Tecnológico,Grupo_Biológico,Grupo_Climatológico,Grupo_Desastres Relacionados a Incêndios Urbanos,Grupo_Desastres Relacionados a Produtos Perigosos,Grupo_Desastres relacionados a obras civis,Grupo_Desastres relacionados a transporte de passageiros e cargas não perigosas,Grupo_Geológico,Grupo_Hidrológico,Grupo_Meteorológico
0,SP,Sudeste,Reconhecido,0,0,0,93,4030,0,16,4139,49,20,20643490.0,3.0,0,1121929.0,9.0,1,5235667.0,0,0,0.0,3,3,32411.271754,25.0,0.0,65321180.0,92354670.0,7978.159201,673157.18258,623293.687574,2493.17475,0.0,330993.879849,3739.762125,58609.55203,0.0,0.0,1087024.0,2787290.0,1642902.0,1149354.0,379785.309713,0.0,24433.112553,3196474.0,5983764.0,2010,12200,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,SP,Sudeste,Reconhecido,4,2,0,66,417,0,14430,14919,161,91,18604070.0,0.0,0,0.0,2.0,0,24931.75,0,0,0.0,0,0,37397.621254,9000.0,0.0,57771850.0,76438240.0,0.0,179508.582021,264276.523531,12465.873751,0.0,349044.465042,74795.242509,134631.436516,0.0,0.0,77288.42,1092011.0,441291.9,2049390.0,0.0,0.0,112192.888695,2602874.0,3694885.0,2010,11321,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,BA,Nordeste,Reconhecido,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,747.952425,448771.455053,0.0,0.0,0.0,0.0,0.0,249317.47503,0.0,0.0,177015.4,875852.3,254303.8,147097.3,0.0,0.0,0.0,401401.1,1277253.0,2010,14110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Encode Status as integers: 'Reconhecido' -> 0, 'Não reconhecido' -> 1.
# Series.map turns any label that is not a key of the dictionary into NaN.
codigos_status = {'Reconhecido': 0, 'Não reconhecido': 1}
df_eventos['Status'] = df_eventos['Status'].map(codigos_status)

In [7]:
# Display the first three rows of df_eventos for a visual check.
df_eventos.head(3)

Unnamed: 0,Sigla_UF,regiao,Status,DH_MORTOS,DH_FERIDOS,DH_ENFERMOS,DH_DESABRIGADOS,DH_DESALOJADOS,DH_DESAPARECIDOS,DH_OUTROS AFETADOS,DH_total_danos_humanos,DM_Uni Habita Danificadas,DM_Uni Habita Destruidas,DM_Uni Habita Valor,DM_Inst Saúde Danificadas,DM_Inst Saúde Destruidas,DM_Inst Saúde Valor,DM_Inst Ensino Danificadas,DM_Inst Ensino Destruidas,DM_Inst Ensino Valor,DM_Inst Serviços Danificadas,DM_Inst Serviços Destruidas,DM_Inst Serviços Valor,DM_Inst Comuni Danificadas,DM_Inst Comuni Destruidas,DM_Inst Comuni Valor,DM_Obras de Infra Danificadas,DM_Obras de Infra Destruidas,DM_Obras de Infra Valor,DM_total_danos_materiais,PEPL_Assis_méd e emergên(R$),PEPL_Abast de água pot(R$),PEPL_sist de esgotos sanit(R$),PEPL_Sis limp e rec lixo (R$),PEPL_Sis cont pragas (R$),PEPL_distrib energia (R$),PEPL_Telecomunicações (R$),PEPL_Tran loc/reg/l_curso (R$),PEPL_Distrib combustíveis(R$),PEPL_Segurança pública (R$),PEPL_Ensino (R$),PEPL_total_publico,PEPR_Agricultura (R$),PEPR_Pecuária (R$),PEPR_Indústria (R$),PEPR_Comércio (R$),PEPR_Serviços (R$),PEPR_total_privado,PE_PLePR,Ano_Evento,COBRADE,Subgrupo_Alagamentos,Subgrupo_Colapso de edificações,Subgrupo_Desastres relacionados à contaminação da água,Subgrupo_Enxurradas,Subgrupo_Epidemias,Subgrupo_Erosão,Subgrupo_Incêndios urbanos,Subgrupo_Infestações/Pragas,Subgrupo_Inundações,Subgrupo_Movimento de massa,Subgrupo_Rompimento/colapso de barragens,Subgrupo_Seca,Subgrupo_Sistemas de Grande Escala/Escala Regional,Subgrupo_Temperaturas Extremas,Subgrupo_Tempestades,Subgrupo_Terremoto,Subgrupo_Transporte aquaviário,Subgrupo_Transporte rodoviário,Subgrupo_nan,Categoria_Natural,Categoria_Tecnológico,Grupo_Biológico,Grupo_Climatológico,Grupo_Desastres Relacionados a Incêndios Urbanos,Grupo_Desastres Relacionados a Produtos Perigosos,Grupo_Desastres relacionados a obras civis,Grupo_Desastres relacionados a transporte de passageiros e cargas não perigosas,Grupo_Geológico,Grupo_Hidrológico,Grupo_Meteorológico
0,SP,Sudeste,0,0,0,0,93,4030,0,16,4139,49,20,20643490.0,3.0,0,1121929.0,9.0,1,5235667.0,0,0,0.0,3,3,32411.271754,25.0,0.0,65321180.0,92354670.0,7978.159201,673157.18258,623293.687574,2493.17475,0.0,330993.879849,3739.762125,58609.55203,0.0,0.0,1087024.0,2787290.0,1642902.0,1149354.0,379785.309713,0.0,24433.112553,3196474.0,5983764.0,2010,12200,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,SP,Sudeste,0,4,2,0,66,417,0,14430,14919,161,91,18604070.0,0.0,0,0.0,2.0,0,24931.75,0,0,0.0,0,0,37397.621254,9000.0,0.0,57771850.0,76438240.0,0.0,179508.582021,264276.523531,12465.873751,0.0,349044.465042,74795.242509,134631.436516,0.0,0.0,77288.42,1092011.0,441291.9,2049390.0,0.0,0.0,112192.888695,2602874.0,3694885.0,2010,11321,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,BA,Nordeste,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,747.952425,448771.455053,0.0,0.0,0.0,0.0,0.0,249317.47503,0.0,0.0,177015.4,875852.3,254303.8,147097.3,0.0,0.0,0.0,401401.1,1277253.0,2010,14110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Shuffle das observações (linhas) para evitar a ordenação no treino

In [8]:
# Shuffle the rows so the original file order does not carry into training.
# A fixed seed makes Restart & Run All reproduce the same permutation.
SEED = 42

# drop=True discards the old index. Without it, reset_index() would insert an
# 'index' column holding each row's pre-shuffle position, reintroducing the
# very ordering the shuffle is meant to remove (and adding a spurious feature).
df_eventos = df_eventos.sample(frac=1, random_state=SEED).reset_index(drop=True)

In [9]:
#df_eventos.to_csv('df_eventos_preprocessado.csv',
#                        index=False,
#                        sep=';',
#                        decimal=',')