# Necessary imports

In [1]:
%pip install pandas
%pip install numpy
%pip install scikit-learn
%pip install optuna
%pip install holidays

import pandas as pd
import numpy as np
from datetime import datetime, time
import holidays
import optuna

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


# Carregamento de dados

In [2]:
dtypes = {
    "hora_minuto":  str,
    "municipio": "category",
    "bairro": "category",
    "endereco": "category",
    "origem_chamado": "category",
    "tipo": "category",
    "subtipo": "category",
    "sexo": "category",
    "idade": float,
    "motivo_finalizacao": "category",
    "motivo_desfecho": "category",
}

columns_to_datetime = ["data"]

raw_df = pd.read_csv("./datasets/ocorrencias2022.csv", sep=';', dtype=dtypes, parse_dates=columns_to_datetime)
raw_df['hora_minuto'] = pd.to_datetime(raw_df['hora_minuto']).dt.time

# Tratando dados vazios e inconsistentes

In [3]:
raw_df = raw_df.drop("motivo_finalizacao", axis=1)
raw_df = raw_df.dropna(subset=["municipio", "bairro", "subtipo", "sexo", "idade"])

def older_than_120(age: int):
  if age >= 120:
    return 120
  return age

raw_df["idade"] = raw_df["idade"].apply(lambda x : older_than_120(x))

# Limitando subset para Recife

In [4]:
raw_df = raw_df[raw_df['municipio'] == 'RECIFE']

In [5]:
raw_df.head()

Unnamed: 0,data,hora_minuto,municipio,bairro,endereco,origem_chamado,tipo,subtipo,sexo,idade,motivo_desfecho
0,2022-01-01,00:02:19,RECIFE,JARDIM SAO PAULO,R LEANDRO BARRETO,RESIDENCIAL,RESPIRATORIA,CASO SUSPEITO COVID-19,FEMININO,81.0,PACIENTE RECUSA SER REMOVIDO
1,2022-01-01,00:03:00,RECIFE,MADALENA,R ALTINHO,RESIDENCIAL,CAUSAS EXTERNAS,QUEDA DA PROPRIA ALTURA,FEMININO,81.0,DESISTÊNCIA DA SOLICITAÇÃO
13,2022-01-01,00:29:22,RECIFE,PASSARINHO,R NOVA JERUSALEM N,RESIDENCIAL,GERAIS/OUTROS,OUTROS,FEMININO,60.0,SEM DESFECHO
15,2022-01-01,00:31:13,RECIFE,IMBIRIBEIRA,R BEZERRA DE CARVALHO,RESIDENCIAL,GERAIS/OUTROS,SINDROME VIRAL,MASCULINO,53.0,SEM DESFECHO
18,2022-01-01,00:41:35,RECIFE,ENCRUZILHADA,R CASTRO ALVES,RESIDENCIAL,NEUROLOGICA,OUTROS,FEMININO,87.0,PACIENTE RECUSA SER REMOVIDO


In [6]:
motivos_desfecho = raw_df['motivo_desfecho'].value_counts()
motivos_desfecho

OCORRÊNCIA CONCLUÍDA COM ÊXITO                           13946
SEM DESFECHO                                             11409
PACIENTE RECUSA SER REMOVIDO                              2983
REMOVIDO ANTES DO ATENDIMENTO POR PARTICULARES            1787
DESISTÊNCIA DA SOLICITAÇÃO                                1259
NÃO HÁ PACIENTE NO ENDEREÇO                                722
ACOMPANHANTE RECUSA REMOÇÃO                                613
PACIENTE JÉ ENCONTRADO EM ÓBITO                            577
CASA FECHADA / NINGUÉM ATENDE AO CHAMADO                   325
REMOVIDO PELOS BOMBEIROS/CIODS                             306
PACIENTE NÃO NECESSITA DE REMOÇÃO                          261
SOLICITAÇÃO DUPLICADA                                      169
ÓBITO DURANTE O ATENDIMENTO                                 48
TROTE                                                       34
PACIENTE SEM CONDIÇÕES CLÍNICAS DE REMOÇÃO HOSPITALAR       16
Name: motivo_desfecho, dtype: int64

# Propósito do modelo

Quando um chamado é iniciado, a maior partes das informações é recolhida na hora. O motivo do desfecho, contudo, só pode ser preenchido após o encerramento do chamado. Destacamos os seguintes motivos de desfecho:

"PACIENTE JÉ ENCONTRADO EM ÓBITO"

"ÓBITO DURANTE O ATENDIMENTO"

Partimos do pressuposto de que alguns desses casos de óbito poderiam ser evitados com maior agilidade ou priorização por parte do SAMU.

Daí veio a idéia do nosso modelo:

Um modelo capaz de determinar com certo grau de certeza, baseando-se nos detalhes recolhidos na hora do registro da ocorrência, se aquela ocorrência corre risco de terminar com algum óbito. Caso ela afirme positivamente, essa informação poderia ser usada para maior priorização ou agilidade por parte da equipe.

# Criação do dataset de treino

Iremos trabalhar com um subset dos dados presentes nesse dataset. Isso é, apenas dados pertinentes ao município de Recife.

1. Inicialmente, iremos criar uma coluna de "obito" mais simples, que engloba os 2 tipos de motivo de desfecho que levaram a óbitos, é composta por 1 ou 0, em caso de óito ou o contrário.

2. Após isso, removeremos a coluna "motivo_desfecho", pois, como discutido, ela só é preenchida após a conclusão da ocorrência, então não faria sentido o modelo ter acesso a essa informação no momento em que analisa uma ocorrência nova.

3. Criaremos uma coluna que transforme o dado de hora numa relação mais categórica e genérica, "Período", como madrugada, manhã, tarde e noite.

4. Extrairemos dados como dia da semana e se o dia era feriado a partir da data.

5. Removeremos colunas que julgamos serem irrelevantes para a classificação, como data e colunas de endereço. Decidimos manter a coluna de hora_minuto pois é argumentável que a hora que algo ocorre pode impactar na conclusão.

6. Para utilização do scikit learn, utilizaremos a técnica One Hot Enconding para transformar cada tipo categórico em um formato mais desejável para o scikit learn.
É importante ressaltar que os dados categóricos "Período" e "Idade" são ordinais, isso é, uma certa ordem pode ser determinada entre as categorias (manhã vem antes da tarde que vem antes da noite, 55 anos vem antes dos 56 anos), porém o resto das colunas representam dados nominais.

7. Por fim, separamos os datasets para treino e validação e teste.

In [7]:
raw_df['obito'] = [1 if x == "PACIENTE JÉ ENCONTRADO EM ÓBITO" or x == "ÓBITO DURANTE O ATENDIMENTO" else 0 for x in raw_df['motivo_desfecho']]

madrugada_upper = time(4, 59, 59)
manha_upper = time(11, 59, 59)
tarde_upper = time(17, 59, 59)
noite_upper = time(23, 59, 59)

conditions = [
    (raw_df['hora_minuto'] <= madrugada_upper),
    ((raw_df['hora_minuto'] > madrugada_upper) & (raw_df['hora_minuto'] <= manha_upper)),
    ((raw_df['hora_minuto'] > manha_upper) & (raw_df['hora_minuto'] <= tarde_upper)),
    ((raw_df['hora_minuto'] > tarde_upper) & (raw_df['hora_minuto'] <= noite_upper)),
]
choices = ['madrugada', 'manha', 'tarde', 'noite']
raw_df['periodo'] = np.select(conditions, choices, default='indeterminado')

recife_holidays = holidays.Brazil(years=[2022])

raw_df['dia_semana'] = raw_df['data'].dt.day_of_week
raw_df['feriado'] = [1 if x in recife_holidays else 0 for x in raw_df['data']]

relevant_columns = [
    "dia_semana",
    "feriado",
    "periodo",
    "origem_chamado",
    "tipo",
    "subtipo",
    "sexo",
    "idade",
    "obito"
]
df = raw_df[relevant_columns]
df.head()

Unnamed: 0,dia_semana,feriado,periodo,origem_chamado,tipo,subtipo,sexo,idade,obito
0,5,1,madrugada,RESIDENCIAL,RESPIRATORIA,CASO SUSPEITO COVID-19,FEMININO,81.0,0
1,5,1,madrugada,RESIDENCIAL,CAUSAS EXTERNAS,QUEDA DA PROPRIA ALTURA,FEMININO,81.0,0
13,5,1,madrugada,RESIDENCIAL,GERAIS/OUTROS,OUTROS,FEMININO,60.0,0
15,5,1,madrugada,RESIDENCIAL,GERAIS/OUTROS,SINDROME VIRAL,MASCULINO,53.0,0
18,5,1,madrugada,RESIDENCIAL,NEUROLOGICA,OUTROS,FEMININO,87.0,0


In [8]:
ohe_features = [
    "dia_semana",
    "feriado",
    "periodo",
    "origem_chamado",
    "tipo",
    "subtipo",
    "sexo",
]

ohe_df = pd.get_dummies(df, prefix=ohe_features, columns=ohe_features)
ohe_df.head()

Unnamed: 0,idade,obito,dia_semana_0,dia_semana_1,dia_semana_2,dia_semana_3,dia_semana_4,dia_semana_5,dia_semana_6,feriado_0,...,subtipo_TONTURAS,subtipo_TRABALHO DE PARTO,subtipo_TRAUMA OCULAR,subtipo_TREMORES,subtipo_USO DE DROGAS ILICITAS,subtipo_VOMITOS,subtipo_CORPO ESTRANHO OCULAR,subtipo_DOENCAS SEXUALMENTE TRANSMISSIVEIS,sexo_FEMININO,sexo_MASCULINO
0,81.0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1,81.0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
13,60.0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
15,53.0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
18,87.0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0


In [9]:
from sklearn.model_selection import train_test_split

features = list(ohe_df.columns)
features.remove('obito')

X = ohe_df[features].to_numpy()

y = ohe_df['obito'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1337)
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.125, random_state=1337)

In [10]:
print("X_train count:", X_train.shape)
print("y_train count:", y_train.shape)
print("X_validation count:", X_validation.shape)
print("y_validation count:", y_validation.shape)
print("X_test count:", X_test.shape)
print("y_test count:", y_test.shape)

X_train count: (24118, 147)
y_train count: (24118,)
X_validation count: (3446, 147)
y_validation count: (3446,)
X_test count: (6891, 147)
y_test count: (6891,)


# Seleção e otimização de modelos

Os 4 modelos escolhidos foram:
- Naive Bayes (Gaussian)
- Random Forest
- Decision Tree
- Non-Linear SVC

Além disso, utilizamos o método GridSearch para otimização dos hiper-parâmetros.

In [11]:
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import cross_val_score

labels = [0, 1]

# Using optuna to elect best model and hyper-parameters

Modelo 1: Naive Bayes

In [12]:
model = Pipeline([
    ('clf', GaussianNB())
])

def objective(trial):    
    clf__priors = trial.suggest_categorical('clf__priors', [None])
    clf__var_smoothing = trial.suggest_float('clf__var_smoothing', -9, 0)

    params = {
    'clf__priors': clf__priors,
    'clf__var_smoothing': clf__var_smoothing
    }
    
    model.set_params(**params)
    model.fit(X_train, y_train)

    return np.mean(cross_val_score(model, X_validation, y_validation, cv=8, n_jobs=-1))

study = optuna.create_study()
study.optimize(objective, timeout=600)

[32m[I 2022-09-20 22:08:44,652][0m A new study created in memory with name: no-name-08b78268-dac9-46eb-b6ff-18a5020f0978[0m
  n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * self.var_[i, :]))
  n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * self.var_[i, :]))
  n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * self.var_[i, :]))
  n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * self.var_[i, :]))
  n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * self.var_[i, :]))
  n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * self.var_[i, :]))
  n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * self.var_[i, :]))
  n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * self.var_[i, :]))
[32m[I 2022-09-20 22:08:45,730][0m Trial 0 finished with value: 0.9828791884746129 and parameters: {'clf__priors': None, 'clf__var_smoothing': -3.117674590052249}. Best is trial 0 with value: 0.9828791884746129.[0m
  n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * self.var_[i, :]))
  n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * self.var_[i, :]))
  n_ij = -0.5 * np.sum(np.log(2.0 *

In [17]:
print("Stats for the optimized Model")
print("Best Score:", study.best_value)
best_parameters = study.best_params
model_parameters = {}
for param in best_parameters:
    print(param, ":", best_parameters[param])
    model_parameters[param[5:]] = best_parameters[param]

tuned_model = GaussianNB(**model_parameters)
tuned_model.fit(X_train, y_train)

pred = tuned_model.predict(X_test)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, pred, labels=labels)
print("labels:", labels)
print("Precision:", [round(x, 2) for x in precision])
print("recall:", [round(x, 2) for x in recall])
print("f1:", [round(x, 2) for x in f1])

Stats for the optimized Model
Best Score: 0.9828791884746129
clf__priors : None
clf__var_smoothing : -3.117674590052249
labels: [0, 1]
Precision: [0.98, 0.0]
recall: [1.0, 0.0]
f1: [0.99, 0.0]


  n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * self.var_[i, :]))
  _warn_prf(average, modifier, msg_start, len(result))


Modelo 2: Random Forest

In [18]:
model = Pipeline([
    ('clf', RandomForestClassifier())
])

def objective(trial):    
    clf__n_estimators = trial.suggest_int('clf__n_estimators', 50, 501, 50)
    clf__max_depth = trial.suggest_int('clf__max_depth', 10, 110, log=True) # Find a way to include [None]. Maybe a categorical?
    clf__max_features = trial.suggest_categorical('clf__max_features', ['sqrt', 'log2', None])

    params = {
        'clf__n_estimators': clf__n_estimators,
        'clf__max_depth': clf__max_depth,
        'clf__max_features': clf__max_features
    }
    
    model.set_params(**params)
    model.fit(X_train, y_train)

    return np.mean(cross_val_score(model, X_validation, y_validation, cv=8, n_jobs=-1))

study = optuna.create_study()
study.optimize(objective, timeout=600)

[32m[I 2022-09-20 22:46:23,522][0m A new study created in memory with name: no-name-639aeade-65e7-494f-8335-d0bed69ca253[0m
[32m[I 2022-09-20 22:47:02,604][0m Trial 0 finished with value: 0.9791048400151081 and parameters: {'clf__n_estimators': 250, 'clf__max_depth': 31, 'clf__max_features': None}. Best is trial 0 with value: 0.9791048400151081.[0m
[32m[I 2022-09-20 22:47:52,891][0m Trial 1 finished with value: 0.9791055144876707 and parameters: {'clf__n_estimators': 300, 'clf__max_depth': 48, 'clf__max_features': None}. Best is trial 0 with value: 0.9791048400151081.[0m
[32m[I 2022-09-20 22:48:03,986][0m Trial 2 finished with value: 0.982007095451357 and parameters: {'clf__n_estimators': 400, 'clf__max_depth': 26, 'clf__max_features': 'sqrt'}. Best is trial 0 with value: 0.9791048400151081.[0m
[32m[I 2022-09-20 22:48:06,931][0m Trial 3 finished with value: 0.9828791884746129 and parameters: {'clf__n_estimators': 150, 'clf__max_depth': 15, 'clf__max_features': 'log2'}. Be

In [19]:
print("Stats for the optimized Model")
print("Best Score:", study.best_value)
best_parameters = study.best_params
model_parameters = {}
for param in best_parameters:
    print(param, ":", best_parameters[param])
    model_parameters[param[5:]] = best_parameters[param]

tuned_model = RandomForestClassifier(**model_parameters)
tuned_model.fit(X_train, y_train)

pred = tuned_model.predict(X_test)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, pred, labels=labels)
print("labels:", labels)
print("Precision:", [round(x, 2) for x in precision])
print("recall:", [round(x, 2) for x in recall])
print("f1:", [round(x, 2) for x in f1])

Stats for the optimized Model
Best Score: 0.9791048400151081
clf__n_estimators : 250
clf__max_depth : 31
clf__max_features : None
labels: [0, 1]
Precision: [0.99, 0.38]
recall: [1.0, 0.18]
f1: [0.99, 0.25]


Modelo 3: Decision Tree

In [21]:
model = Pipeline([
    ('clf', DecisionTreeClassifier())
])

def objective(trial):    
    clf__splitter = trial.suggest_categorical('clf__splitter', ['best', 'random'])
    clf__max_depth = trial.suggest_int('clf__max_depth', 10, 110, log=True) # Find a way to include [None]. Maybe a categorical?
    clf__max_features = trial.suggest_categorical('clf__max_features', ['sqrt', 'log2', None])

    params = {
        'clf__splitter': clf__splitter,
        'clf__max_depth': clf__max_depth,
        'clf__max_features': clf__max_features
    }
    
    model.set_params(**params)
    model.fit(X_train, y_train)

    return np.mean(cross_val_score(model, X_validation, y_validation, cv=8, n_jobs=-1))

study = optuna.create_study()
study.optimize(objective, timeout=600)

[32m[I 2022-09-20 23:09:27,555][0m A new study created in memory with name: no-name-b479806a-52d0-4301-9fba-4808c137f78c[0m
[32m[I 2022-09-20 23:09:30,457][0m Trial 0 finished with value: 0.9712674688393677 and parameters: {'clf__splitter': 'random', 'clf__max_depth': 29, 'clf__max_features': None}. Best is trial 0 with value: 0.9712674688393677.[0m
[32m[I 2022-09-20 23:09:30,578][0m Trial 1 finished with value: 0.970692818216155 and parameters: {'clf__splitter': 'random', 'clf__max_depth': 94, 'clf__max_features': 'log2'}. Best is trial 1 with value: 0.970692818216155.[0m
[32m[I 2022-09-20 23:09:30,786][0m Trial 2 finished with value: 0.9698153294123995 and parameters: {'clf__splitter': 'random', 'clf__max_depth': 46, 'clf__max_features': 'log2'}. Best is trial 2 with value: 0.9698153294123995.[0m
[32m[I 2022-09-20 23:09:30,984][0m Trial 3 finished with value: 0.9695259806831058 and parameters: {'clf__splitter': 'random', 'clf__max_depth': 32, 'clf__max_features': 'log2'

In [22]:
print("Stats for the optimized Model")
print("Best Score:", study.best_value)
best_parameters = study.best_params
model_parameters = {}
for param in best_parameters:
    print(param, ":", best_parameters[param])
    model_parameters[param[5:]] = best_parameters[param]

tuned_model = DecisionTreeClassifier(**model_parameters)
tuned_model.fit(X_train, y_train)

pred = tuned_model.predict(X_test)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, pred, labels=labels)
print("labels:", labels)
print("Precision:", [round(x, 2) for x in precision])
print("recall:", [round(x, 2) for x in recall])
print("f1:", [round(x, 2) for x in f1])

Stats for the optimized Model
Best Score: 0.9640128419575891
clf__splitter : random
clf__max_depth : 68
clf__max_features : None
labels: [0, 1]
Precision: [0.99, 0.18]
recall: [0.99, 0.18]
f1: [0.99, 0.18]


Modelo 4: Non-Linear SVC

In [24]:
model = Pipeline([
    ('clf', SVC())
])

def objective(trial):    
    clf__kernel = trial.suggest_categorical('clf__kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
    clf__gamma = trial.suggest_categorical('clf__gamma', ['scale', 'auto'])
    clf__C = trial.suggest_int('clf__C', 1, 50)

    params = {
        'clf__kernel': clf__kernel,
        'clf__gamma': clf__gamma,
        'clf__C': clf__C
    }
    
    model.set_params(**params)
    model.fit(X_train, y_train)

    return np.mean(cross_val_score(model, X_validation, y_validation, cv=8, n_jobs=-1))

study = optuna.create_study()
study.optimize(objective, timeout=600)

[32m[I 2022-09-20 23:35:47,207][0m A new study created in memory with name: no-name-b5037ba3-1134-424f-b252-8458d563d74e[0m
[32m[I 2022-09-20 23:36:52,775][0m Trial 0 finished with value: 0.9820091188690443 and parameters: {'clf__kernel': 'linear', 'clf__gamma': 'scale', 'clf__C': 19}. Best is trial 0 with value: 0.9820091188690443.[0m
[32m[I 2022-09-20 23:39:40,579][0m Trial 1 finished with value: 0.9820091188690443 and parameters: {'clf__kernel': 'linear', 'clf__gamma': 'scale', 'clf__C': 50}. Best is trial 0 with value: 0.9820091188690443.[0m
[32m[I 2022-09-20 23:39:43,239][0m Trial 2 finished with value: 0.9666257486645443 and parameters: {'clf__kernel': 'sigmoid', 'clf__gamma': 'auto', 'clf__C': 21}. Best is trial 2 with value: 0.9666257486645443.[0m
[32m[I 2022-09-20 23:40:35,084][0m Trial 3 finished with value: 0.9820091188690443 and parameters: {'clf__kernel': 'linear', 'clf__gamma': 'scale', 'clf__C': 14}. Best is trial 2 with value: 0.9666257486645443.[0m
[32m

In [25]:
print("Stats for the optimized Model")
print("Best Score:", study.best_value)
best_parameters = study.best_params
model_parameters = {}
for param in best_parameters:
    print(param, ":", best_parameters[param])
    model_parameters[param[5:]] = best_parameters[param]

tuned_model = SVC(**model_parameters)
tuned_model.fit(X_train, y_train)

pred = tuned_model.predict(X_test)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, pred, labels=labels)
print("labels:", labels)
print("Precision:", [round(x, 2) for x in precision])
print("recall:", [round(x, 2) for x in recall])
print("f1:", [round(x, 2) for x in f1])

Stats for the optimized Model
Best Score: 0.9666257486645443
clf__kernel : sigmoid
clf__gamma : auto
clf__C : 21
labels: [0, 1]
Precision: [0.98, 0.0]
recall: [0.98, 0.0]
f1: [0.98, 0.0]
