## Pré processamento dos dados

### Bibliotecas e base de dados

In [13]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [14]:
pd.set_option('display.max_rows', None)              
pd.set_option('display.max_columns', None)          
pd.set_option('display.max_colwidth', 50)           
pd.set_option('display.width', 1000)                 
pd.set_option('display.float_format', '{:.2f}'.format)  
pd.set_option('display.colheader_justify', 'left')  
pd.set_option('display.float_format', '{:,.2f}'.format)
def estilo_minimalista(df):
    return df.style.set_properties(**{
        'font-size': '8px',    
        'border-color': 'black', 
        'border-width': '0.4px',   
        'border-style': 'solid',
        'padding': '1px',        
    })
sns.set_theme(style="white", palette="deep", context="talk",font_scale=0.8)
plt.rcParams['figure.figsize'] = (12, 6)

In [15]:
object_columns = ['Protocolo_S2iD', 'Nome_Municipio', 'Sigla_UF', 'regiao',
                  'Setores Censitários', 'Status', 'DH_Descricao', 'DM_Descricao',
                  'DA_Descricao', 'DA_Polui/cont da água', 'DA_Polui/cont do ar',
                  'DA_Polui/cont do solo', 'DA_Dimi/exauri hídrico',
                  "DA_Incêndi parques/APA's/APP's", 'PEPL_Descricao', 'PEPR_Descricao',
                  'Categoria', 'Grupo', 'Subgrupo', 'Tipo', 'Subtipo']

dtype = {col: 'object' for col in object_columns}

df_eventos = pd.read_csv(
    "https://raw.githubusercontent.com/brunagmoura/PrevisorReconhecimento/refs/heads/main/df_eventos_desastres_rec_nrec.csv",
    sep=';',
    decimal=',',
    dtype=dtype)

df_eventos.head(5)

Unnamed: 0,Protocolo_S2iD,Nome_Municipio,Sigla_UF,regiao,Data_Registro,Data_Evento,codigo_ibge,Setores Censitários,Status,DH_Descricao,DH_MORTOS,DH_FERIDOS,DH_ENFERMOS,DH_DESABRIGADOS,DH_DESALOJADOS,DH_DESAPARECIDOS,DH_OUTROS AFETADOS,DH_total_danos_humanos,DM_Descricao,DM_Uni Habita Danificadas,DM_Uni Habita Destruidas,DM_Uni Habita Valor,DM_Inst Saúde Danificadas,DM_Inst Saúde Destruidas,DM_Inst Saúde Valor,DM_Inst Ensino Danificadas,DM_Inst Ensino Destruidas,DM_Inst Ensino Valor,DM_Inst Serviços Danificadas,DM_Inst Serviços Destruidas,DM_Inst Serviços Valor,DM_Inst Comuni Danificadas,DM_Inst Comuni Destruidas,DM_Inst Comuni Valor,DM_Obras de Infra Danificadas,DM_Obras de Infra Destruidas,DM_Obras de Infra Valor,DM_total_danos_materiais,DA_Descricao,DA_Polui/cont da água,DA_Polui/cont do ar,DA_Polui/cont do solo,DA_Dimi/exauri hídrico,DA_Incêndi parques/APA's/APP's,PEPL_Descricao,PEPL_Assis_méd e emergên(R$),PEPL_Abast de água pot(R$),PEPL_sist de esgotos sanit(R$),PEPL_Sis limp e rec lixo (R$),PEPL_Sis cont pragas (R$),PEPL_distrib energia (R$),PEPL_Telecomunicações (R$),PEPL_Tran loc/reg/l_curso (R$),PEPL_Distrib combustíveis(R$),PEPL_Segurança pública (R$),PEPL_Ensino (R$),PEPL_total_publico,PEPR_Descricao,PEPR_Agricultura (R$),PEPR_Pecuária (R$),PEPR_Indústria (R$),PEPR_Comércio (R$),PEPR_Serviços (R$),PEPR_total_privado,PE_PLePR,Empenhado,DensidadePop,Hab,Area,Ano,PIB,Categoria,Grupo,Subgrupo,Tipo,Subtipo,COBRADE
0,SC-F-4200903-12200-20141226,Angelina,SC,Sul,2015-01-02 00:00:00,2015-01-02,4200903,,Reconhecido,Moradores da comunidade que ficarem desprovido...,0,0,0,0,0,0,1500,1500,Os danos são representados pelas avarias/destr...,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,,,,,,,"No caso exposto, os prejuízos públicos são rep...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"Quebra/perda da produção agrícola/pecuária, oc...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,,10.72,5358.0,500.0,2015.0,103375.0,Natural,Hidrológico,Enxurradas,,,12200
1,MS-F-5000708-12200-20141222,Anastácio,MS,Centro-oeste,2015-01-05 00:00:00,2015-01-05,5000708,,Reconhecido,Ficaram prejudicados um total de 800 famílias ...,0,0,0,0,0,0,3200,3200,Foram danificadas tum total de 8 pontes que in...,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,6271293.52,6271293.52,,,,,,,Foram danificadas tum total de 8 pontes que in...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2332356.82,0.0,0.0,0.0,2332356.82,"Produtores da Agricultura Familiar, produtores...",326156.77,1630783.87,0.0,0.0,0.0,1956940.64,4289297.46,,8.28,24114.0,2913.18,2015.0,394893.0,Natural,Hidrológico,Enxurradas,,,12200
2,RS-F-4301925-12200-20150101,Barra do Rio Azul,RS,Sul,2015-01-06 00:00:00,2015-01-06,4301925,,Não reconhecido,,0,0,0,0,0,0,0,0,Danos e destruição de 35 Bueiros e danos em 15...,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0,0,0.0,0,0,54435.57,30.0,5.0,1041889.69,1096325.26,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,11.54,1696.0,147.0,2015.0,42201.0,Natural,Hidrológico,Enxurradas,,,12200
3,RS-F-4311601-12200-20141231,Liberato Salzano,RS,Sul,2015-01-07 00:00:00,2015-01-07,4311601,,Reconhecido,,0,0,0,0,0,0,0,0,Devido a alta precipitação pluviométrica e con...,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,,,,,,,"Destruição das vias de acesso do Município, co...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1740983.83,0.0,0.0,0.0,1740983.83,Devido as fortes precipitações ocorridas ocor...,9718266.89,0.0,0.0,0.0,0.0,9718266.89,11459250.72,,19.46,4781.0,245.63,2015.0,116822.0,Natural,Hidrológico,Enxurradas,,,12200
4,BA-F-2931608-12200-20141216,Teolândia,BA,Nordeste,2015-01-07 00:00:00,2015-01-07,2931608,,Reconhecido,05 pessoas deram entrada no hospital municipal...,0,5,2,150,25,0,0,182,Parte do cais que sustenta a toda a rua que da...,7,30,3315927.2,1.0,0,1087189.25,1.0,0,90599.1,0,0,0.0,0,0,0.0,8.0,0.0,7791522.93,12285238.47,comprometimento no abastecimento de água da po...,,,,,,"Diante deste evento adverso, houve danos no si...",90599.1,45299.55,3823282.18,217437.85,0.0,0.0,0.0,4167558.77,0.0,0.0,90599.1,8434776.56,Os prejuízos da agricultura estão ligados aos ...,905991.04,54359.46,0.0,0.0,108718.92,1069069.42,9503845.99,,52.91,15332.0,289.78,2015.0,112279.0,Natural,Hidrológico,Enxurradas,,,12200


### Exclusão de variáveis

In [23]:
# Excluir variáveis que não serão adicionadas ao modelo

df_eventos = df_eventos.drop(
    ['Protocolo_S2iD', 'Nome_Municipio', 'Sigla_UF', 'regiao', 'Data_Registro', 'Data_Evento', 'codigo_ibge',
     'Setores Censitários', 'COBRADE', "Ano"], axis=1)

KeyError: "['Protocolo_S2iD', 'Nome_Municipio', 'Sigla_UF', 'regiao', 'Data_Registro', 'Data_Evento', 'codigo_ibge', 'Setores Censitários', 'COBRADE', 'Ano'] not found in axis"

In [17]:
#Excluir colunas que tenham mais do que 5% de dados ausentes
df_na_counts = df_eventos.isna().sum().reset_index().rename(columns={0: 'Qtde. dados ausentes', 'index': 'Variável'})
df_na_counts['% de dados ausentes'] = (df_na_counts['Qtde. dados ausentes'] / len(df_eventos)) * 100
colunas_mantidas = df_na_counts[df_na_counts['% de dados ausentes'] < 5]['Variável']
df_eventos = df_eventos[colunas_mantidas]
print(df_eventos.columns)

Index(['Status', 'DH_MORTOS', 'DH_FERIDOS', 'DH_ENFERMOS', 'DH_DESABRIGADOS', 'DH_DESALOJADOS', 'DH_DESAPARECIDOS', 'DH_OUTROS AFETADOS', 'DH_total_danos_humanos', 'DM_Uni Habita Danificadas', 'DM_Uni Habita Destruidas', 'DM_Uni Habita Valor', 'DM_Inst Saúde Danificadas', 'DM_Inst Saúde Destruidas', 'DM_Inst Saúde Valor', 'DM_Inst Ensino Danificadas', 'DM_Inst Ensino Destruidas', 'DM_Inst Ensino Valor', 'DM_Inst Serviços Danificadas', 'DM_Inst Serviços Destruidas', 'DM_Inst Serviços Valor', 'DM_Inst Comuni Danificadas', 'DM_Inst Comuni Destruidas', 'DM_Inst Comuni Valor', 'DM_Obras de Infra Danificadas', 'DM_Obras de Infra Destruidas', 'DM_Obras de Infra Valor', 'DM_total_danos_materiais', 'PEPL_Assis_méd e emergên(R$)', 'PEPL_Abast de água pot(R$)', 'PEPL_sist de esgotos sanit(R$)', 'PEPL_Sis limp e rec lixo (R$)', 'PEPL_Sis cont pragas (R$)', 'PEPL_distrib energia (R$)', 'PEPL_Telecomunicações (R$)', 'PEPL_Tran loc/reg/l_curso (R$)', 'PEPL_Distrib combustíveis(R$)',
       'PEPL_Segu

### Encoding das variáveis categóricas

In [24]:
encoder = OneHotEncoder(sparse_output=False, drop=None)  
categorias = ['Subgrupo', 'Categoria', 'Grupo']
one_hot_encoded = encoder.fit_transform(df_eventos[categorias])

one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorias))

df_eventos = pd.concat([df_eventos.drop(columns=categorias), one_hot_df], axis=1)

df_eventos.head(3)

KeyError: "None of [Index(['Subgrupo', 'Categoria', 'Grupo'], dtype='object')] are in the [columns]"

In [19]:
df_eventos['Status'] = df_eventos['Status'].map({'Reconhecido': 0, 'Não reconhecido': 1})

In [25]:
df_eventos.head(3)

Unnamed: 0,Status,DH_MORTOS,DH_FERIDOS,DH_ENFERMOS,DH_DESABRIGADOS,DH_DESALOJADOS,DH_DESAPARECIDOS,DH_OUTROS AFETADOS,DH_total_danos_humanos,DM_Uni Habita Danificadas,DM_Uni Habita Destruidas,DM_Uni Habita Valor,DM_Inst Saúde Danificadas,DM_Inst Saúde Destruidas,DM_Inst Saúde Valor,DM_Inst Ensino Danificadas,DM_Inst Ensino Destruidas,DM_Inst Ensino Valor,DM_Inst Serviços Danificadas,DM_Inst Serviços Destruidas,DM_Inst Serviços Valor,DM_Inst Comuni Danificadas,DM_Inst Comuni Destruidas,DM_Inst Comuni Valor,DM_Obras de Infra Danificadas,DM_Obras de Infra Destruidas,DM_Obras de Infra Valor,DM_total_danos_materiais,PEPL_Assis_méd e emergên(R$),PEPL_Abast de água pot(R$),PEPL_sist de esgotos sanit(R$),PEPL_Sis limp e rec lixo (R$),PEPL_Sis cont pragas (R$),PEPL_distrib energia (R$),PEPL_Telecomunicações (R$),PEPL_Tran loc/reg/l_curso (R$),PEPL_Distrib combustíveis(R$),PEPL_Segurança pública (R$),PEPL_Ensino (R$),PEPL_total_publico,PEPR_Agricultura (R$),PEPR_Pecuária (R$),PEPR_Indústria (R$),PEPR_Comércio (R$),PEPR_Serviços (R$),PEPR_total_privado,PE_PLePR,DensidadePop,Hab,Area,Subgrupo_Alagamentos,Subgrupo_Colapso de edificações,Subgrupo_Desastres relacionados à contaminação da água,Subgrupo_Enxurradas,Subgrupo_Epidemias,Subgrupo_Erosão,Subgrupo_Incêndios urbanos,Subgrupo_Infestações/Pragas,Subgrupo_Inundações,Subgrupo_Movimento de massa,Subgrupo_Rompimento/colapso de barragens,Subgrupo_Seca,Subgrupo_Sistemas de Grande Escala/Escala Regional,Subgrupo_Temperaturas Extremas,Subgrupo_Tempestades,Subgrupo_Terremoto,Subgrupo_Transporte aquaviário,Subgrupo_Transporte rodoviário,Subgrupo_nan,Categoria_Natural,Categoria_Tecnológico,Grupo_Biológico,Grupo_Climatológico,Grupo_Desastres Relacionados a Incêndios Urbanos,Grupo_Desastres Relacionados a Produtos Perigosos,Grupo_Desastres relacionados a obras civis,Grupo_Desastres relacionados a transporte de passageiros e cargas não perigosas,Grupo_Geológico,Grupo_Hidrológico,Grupo_Meteorológico
0,0,0,0,0,0,0,0,1500,1500,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.72,5358.0,500.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0,0,0,0,0,0,0,3200,3200,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,6271293.52,6271293.52,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2332356.82,0.0,0.0,0.0,2332356.82,326156.77,1630783.87,0.0,0.0,0.0,1956940.64,4289297.46,8.28,24114.0,2913.18,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0,0,0.0,0,0,54435.57,30.0,5.0,1041889.69,1096325.26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.54,1696.0,147.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


#### Shuffle das variáveis para evitar a ordenação no treino

In [27]:
df_eventos = df_eventos.sample(frac=1).reset_index()

In [28]:
df_eventos.to_csv('df_eventos_preprocessado.csv',
                        index=False,
                        sep=';',
                        decimal=',')