In [1]:
import pandas as pd
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import RandomOverSampler

In [2]:
# Cargar datos preprocesados
covid_data = pd.read_csv("../covid19_clean_data.csv", index_col=0).sample(1000000, random_state=42)
covid_data.shape

  covid_data = pd.read_csv("../covid19_clean_data.csv", index_col=0).sample(1000000, random_state=42)


(1000000, 11)

In [3]:
# Separar las caracteristicas de las clases 
X = covid_data.drop("estado", axis=1)
y = covid_data["estado"]

In [4]:
# Forzar formateo de categoria_departamento como String
X['categoria_departamento'] = X['categoria_departamento'].astype('str')

In [5]:
# Crear datasets de entrenamiento(80%) y validación(20%). 
X_train, X_test, y_train, y_test = train_test_split(
   X, y, test_size=0.2, random_state=42
)

In [6]:
# Definición de variables categoricas a usar en el modelo
    # Se excluyen las variables departamento y municipio, se usaran las categorias
categorical_features = ['categoria_departamento','sexo', 'tipo_contagio', 'asintomatico','rango_edad']#,'departamento','municipio'

# Definición de variables númericas a usar en el modelo
    # Excluye asintomatico pues el modelo considera la variable no relevante
    # Excluye id_caso pues no infiere en la clasificacón del estado
numerical_features = ['poblacion_departamento','edad'] 

# Preprocesamiento de las caracteristicas categoricas y númericas definidas previamente
preprocessor = ColumnTransformer(
   transformers=[
       ("cat", OneHotEncoder(), categorical_features),
       ("num", StandardScaler(), numerical_features),
   ]
)

# Creación de flujo de trabajo que seguirá el modelo

In [8]:
pipeline = ImbPipeline(
   [
       ("preprocessor", preprocessor),
       ('oversampling', RandomOverSampler(random_state=42)),
       ("classifier", HistGradientBoostingClassifier(random_state=42))
   ],
   verbose=True
)

In [9]:
grid =  GridSearchCV(
    pipeline,
    param_grid = { 
        'classifier__max_iter': [50, 100, 150],
        'classifier__learning_rate': [0.5, 0.1, 0.01], 
        'classifier__max_depth': [3, 9, None]}, 
    n_jobs=-1,
    cv=2,
    error_score='raise'
)

In [10]:
# Fit the grid on the training data
grid.fit(X_train, y_train)

#80min 2.4 seg

[Pipeline] ...... (step 1 of 3) Processing preprocessor, total=   1.7s
[Pipeline] ...... (step 2 of 3) Processing oversampling, total=   4.1s
[Pipeline] ........ (step 3 of 3) Processing classifier, total= 1.5min


In [20]:
# Save pipeline for future use without fitting
joblib.dump(grid, 'GradientBoosting_GridSearch_RandomOverSampling.joblib')

['GradientBoosting_GridSearch_RandomOverSampling.joblib']

In [21]:
grid.predict(X_test)

array(['Fallecido', 'Leve', 'Grave', ..., 'Fallecido', 'Leve', 'Leve'],
      dtype=object)

In [22]:
# Best parameters
print(grid.cv_results_)
print(grid.best_estimator_)
print(grid.best_score_)

{'mean_fit_time': array([ 64.67139554,  65.88523829,  63.33781171,  49.03492212,
        47.92916238,  51.08483386,  53.02306175,  56.82403016,
        53.66743767,  97.33558702, 175.64704657, 260.65998507,
       171.5008893 , 260.93074858, 333.55293727, 179.26434231,
       253.92922437, 325.45217216,  99.55213463, 185.64731634,
       269.45583129, 175.9325937 , 323.08777046, 479.56866229,
       168.42203546, 323.79544413, 452.08561158]), 'std_fit_time': array([1.41361268e+01, 1.39714185e+01, 1.41770792e+01, 1.08041286e+00,
       6.06108904e-02, 1.02166438e+00, 1.77375817e+00, 2.70460057e+00,
       3.13243854e+00, 2.20340014e-01, 1.55333948e+00, 5.62579155e-01,
       2.89571524e-01, 4.60344762e+01, 1.12695116e+02, 2.29695082e-01,
       4.53895704e+01, 1.15722113e+02, 1.63594687e+00, 3.49493372e+00,
       2.02873397e+00, 8.10930371e-01, 1.10540342e+00, 2.38499439e+00,
       1.05797267e+00, 1.64709699e+00, 2.48258114e-02]), 'mean_score_time': array([ 8.76889384,  8.83288455,  8

In [23]:
# Fit the tuned_pipeline on the training data
tuned_pipeline = grid.best_estimator_.fit(X_train, y_train)

[Pipeline] ...... (step 1 of 3) Processing preprocessor, total=   1.3s
[Pipeline] ...... (step 2 of 3) Processing oversampling, total=   3.1s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  57.6s


In [24]:
# Predict on the test set
y_pred = tuned_pipeline.predict(X_test)


In [25]:
# Generate classification report
report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report)



Classification Report:
              precision    recall  f1-score   support

   Fallecido       0.09      0.76      0.16      4516
       Grave       0.00      0.00      0.00         1
        Leve       1.00      0.79      0.88    195443
    Moderado       0.00      0.38      0.01        40

    accuracy                           0.79    200000
   macro avg       0.27      0.48      0.26    200000
weighted avg       0.97      0.79      0.86    200000



In [26]:
print(tuned_pipeline.named_steps.preprocessor.feature_names_in_)
print(tuned_pipeline.named_steps.preprocessor.get_feature_names_out())
print(tuned_pipeline.named_steps.classifier.n_features_in_)

['id_caso' 'departamento' 'municipio' 'poblacion_departamento'
 'categoria_departamento' 'edad' 'sexo' 'tipo_contagio' 'asintomatico'
 'rango_edad']
['cat__categoria_departamento_1' 'cat__categoria_departamento_2'
 'cat__categoria_departamento_3' 'cat__categoria_departamento_4'
 'cat__categoria_departamento_ESP' 'cat__sexo_F' 'cat__sexo_M'
 'cat__tipo_contagio_Comunitaria' 'cat__tipo_contagio_Importado'
 'cat__tipo_contagio_Relacionado' 'cat__asintomatico_No'
 'cat__asintomatico_Si' 'cat__rango_edad_0 a 4 años'
 'cat__rango_edad_12 a 25 años' 'cat__rango_edad_26 a 40 años'
 'cat__rango_edad_41 a 64 años' 'cat__rango_edad_5 a 11 años'
 'cat__rango_edad_65 años y más' 'num__poblacion_departamento' 'num__edad']
20


In [27]:
y_proba= tuned_pipeline.predict_proba(X_test)
probabilities = pd.DataFrame(y_proba, index=y_test.index)
probabilities = probabilities.rename(
    columns={
        0:tuned_pipeline.classes_[0]
        ,1:tuned_pipeline.classes_[1]
        ,2:tuned_pipeline.classes_[2]
        #,3:tuned_pipeline.classes_[3]
        })
xd= pd.concat([X_test,y_test,pd.Series(y_pred, index=y_test.index),probabilities],axis=1)
xd

Unnamed: 0,id_caso,departamento,municipio,poblacion_departamento,categoria_departamento,edad,sexo,tipo_contagio,asintomatico,rango_edad,estado,0,Fallecido,Grave,Leve,3
1052451,3607325,DEPARTAMENTO DE ANTIOQUIA,ITAGUI,6787846,ESP,61.0,M,Relacionado,No,41 a 64 años,Leve,Fallecido,0.714223,0.001572,0.282583,0.001622
5535721,2337451,DEPARTAMENTO DE SUCRE,SINCELEJO,980942,3,54.0,F,Comunitaria,No,41 a 64 años,Leve,Leve,0.260776,0.004887,0.525052,0.209286
304409,4720311,DEPARTAMENTO DE VALLE DEL CAUCA,JAMUNDI,4626064,ESP,18.0,F,Comunitaria,No,12 a 25 años,Leve,Grave,0.001687,0.983554,0.008621,0.006138
1543446,3540325,DEPARTAMENTO DE SANTANDER,PIEDECUESTA,2335238,1,59.0,F,Relacionado,No,41 a 64 años,Leve,Leve,0.471693,0.002063,0.523886,0.002358
5585141,3410043,"CARTAGENA DE INDIAS, DISTRITO TURISTICO Y CULT...",CARTAGENA,1043185,ESP,11.0,M,Comunitaria,No,5 a 11 años,Leve,Leve,0.002286,0.005426,0.980625,0.011663
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3327865,4016184,BOGOTA D.C.,BOGOTA,7873316,ESP,22.0,M,Comunitaria,Si,12 a 25 años,Leve,Leve,0.000291,0.000040,0.998508,0.001161
5346803,2160620,"BARRANQUILLA, DISTRITO ESPECIAL, INDUSTRIAL Y ...",BARRANQUILLA,1310163,ESP,27.0,F,Comunitaria,No,26 a 40 años,Leve,Leve,0.023759,0.000603,0.972926,0.002712
2594006,568068,BOGOTA D.C.,BOGOTA,7873316,ESP,59.0,F,Comunitaria,No,41 a 64 años,Leve,Fallecido,0.521441,0.001944,0.454555,0.022060
2748597,2223381,BOGOTA D.C.,BOGOTA,7873316,ESP,28.0,F,Comunitaria,No,26 a 40 años,Leve,Leve,0.033115,0.000692,0.963769,0.002423


In [28]:
#xd.to_csv("testing.csv")