In [70]:
import pandas as pd
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import RandomOverSampler

In [71]:
# Cargar datos preprocesados
covid_data = pd.read_csv("../covid19_clean_data.csv", index_col=0).sample(1000000, random_state=2)
covid_data.shape

(1000000, 9)

In [72]:
# Separar las caracteristicas de las clases 
X = covid_data.drop("estado", axis=1)
y = covid_data["estado"]

In [73]:
# Crear datasets de entrenamiento(80%) y validación(20%). 
X_train, X_test, y_train, y_test = train_test_split(
   X, y, test_size=0.2, random_state=42
)

In [74]:
# Definición de variables categoricas a usar en el modelo
    # Se excluye la variable de municipio pues el modelo considera la variable no relevante
categorical_features = ['sexo', 'tipo_contagio', 'rango_edad','asintomatico']#,'departamento','municipio'

# Definición de variables númericas a usar en el modelo
    # Excluye asintomatico pues el modelo considera la variable no relevante
    # Excluye id_caso pues no infiere en la clasificacón del estado
numerical_features = ['edad'] 

# Preprocesamiento de las caracteristicas categoricas y númericas definidas previamente
preprocessor = ColumnTransformer(
   transformers=[
       ("cat", OneHotEncoder(), categorical_features),
       ("num", StandardScaler(), numerical_features),
   ]
)

# Creación de flujo de trabajo que seguirá el modelo

In [75]:
pipeline = ImbPipeline(
   [
       ("preprocessor", preprocessor),
       ('oversampling', RandomOverSampler()),
       ("classifier", HistGradientBoostingClassifier(random_state=42))
   ],
   verbose=True
)

In [76]:
grid =  GridSearchCV(
    pipeline,
    param_grid = { 
        'classifier__max_iter': [50, 100, 150],
        'classifier__learning_rate': [0.5, 0.1, 0.01], 
        'classifier__max_depth': [3, 9, None]}, 
    n_jobs=-1,
    cv=3,
    error_score='raise'
    #iverbose=4
)

In [77]:
# Fit the grid on the training data
grid.fit(X_train, y_train)

#80min 2.4 seg

[Pipeline] ...... (step 1 of 3) Processing preprocessor, total=   0.8s
[Pipeline] ...... (step 2 of 3) Processing oversampling, total=   2.9s
[Pipeline] ........ (step 3 of 3) Processing classifier, total= 1.7min


In [78]:
# Save pipeline for future use without fitting
joblib.dump(grid, 'GradientBoosting_GridSearch_RandomOverSampling_Trained1000000.joblib')

['GradientBoosting_GridSearch_RandomOverSampling_Trained1000000.joblib']

In [79]:
grid.predict(X_test)

array(['Fallecido', 'Leve', 'Leve', ..., 'Fallecido', 'Moderado', 'Leve'],
      dtype=object)

In [80]:
# Best parameters
print(grid.cv_results_)
print(grid.best_estimator_)
print(grid.best_score_)

{'mean_fit_time': array([ 64.68468642,  56.10310388,  93.7550935 ,  72.74608199,
        77.7403837 ,  85.46627879,  73.60229826,  70.10330566,
        67.62514305, 128.83466148, 231.453885  , 320.43674397,
       193.80260777, 307.05772765, 412.26871443, 195.54504704,
       297.74695293, 370.71061802, 118.59002042, 209.45583431,
       294.31318617, 158.21865694, 317.55716793, 463.03736774,
       162.33646003, 317.08344245, 447.95198488]), 'std_fit_time': array([ 26.95020247,  11.19847998,  21.12864561,   3.35995538,
         3.49676798,   4.16190781,   2.98323131,   3.78107793,
         3.22823007,   1.07017976,   1.28627302,  46.33673958,
         4.29287082,  86.13470263, 151.26848874,   3.8465814 ,
        76.16581129, 130.57696556,   0.96239983,   1.99303789,
         2.95968767,   1.33924579,   3.59468763,   6.2689339 ,
         2.05207504,   3.98382822,   7.123584  ]), 'mean_score_time': array([ 5.56854169,  5.42228293,  6.79115446,  5.37830814,  7.13060077,
        6.3188939

In [81]:
# Fit the tuned_pipeline on the training data
tuned_pipeline = grid.best_estimator_.fit(X_train, y_train)

[Pipeline] ...... (step 1 of 3) Processing preprocessor, total=   0.9s
[Pipeline] ...... (step 2 of 3) Processing oversampling, total=   3.2s
[Pipeline] ........ (step 3 of 3) Processing classifier, total= 1.8min


In [82]:
# Predict on the test set
y_pred = tuned_pipeline.predict(X_test)


In [83]:
# Generate classification report
report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report)



Classification Report:
              precision    recall  f1-score   support

   Fallecido       0.12      0.70      0.20      4475
       Grave       0.00      0.00      0.00         2
        Leve       0.99      0.81      0.89    195488
    Moderado       0.00      0.37      0.00        35

    accuracy                           0.81    200000
   macro avg       0.28      0.47      0.27    200000
weighted avg       0.97      0.81      0.88    200000



In [84]:
y_proba= tuned_pipeline.predict_proba(X_test)
probabilities = pd.DataFrame(y_proba, index=y_test.index)
probabilities = probabilities.rename(
    columns={
        0:tuned_pipeline.classes_[0]
        ,1:tuned_pipeline.classes_[1]
        ,2:tuned_pipeline.classes_[2]
        #,3:tuned_pipeline.classes_[3]
        })
xd= pd.concat([X_test,y_test,pd.Series(y_pred, index=y_test.index),probabilities],axis=1)
xd

Unnamed: 0,id_caso,departamento,municipio,edad,sexo,tipo_contagio,asintomatico,rango_edad,estado,0,Fallecido,Grave,Leve,3
4684180,646506,BOGOTA,BOGOTA,69.0,F,Relacionado,0,65 años y más,Leve,Fallecido,0.670737,0.076712,0.174975,0.077576
2600067,1967322,BOGOTA,BOGOTA,46.0,M,Relacionado,0,41 a 64 años,Leve,Leve,0.287757,0.091167,0.560685,0.060391
3454285,4724082,BOGOTA,BOGOTA,23.0,F,Relacionado,0,12 a 25 años,Leve,Leve,0.104493,0.102790,0.718271,0.074446
3167232,358660,HUILA,LA PLATA,69.0,M,Comunitaria,0,65 años y más,Fallecido,Fallecido,0.476132,0.163362,0.129204,0.231302
949160,3446257,ANTIOQUIA,MEDELLIN,77.0,M,Comunitaria,0,65 años y más,Fallecido,Fallecido,0.473168,0.162345,0.128400,0.236088
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3169148,1578095,BOGOTA,BOGOTA,40.0,M,Comunitaria,0,26 a 40 años,Leve,Leve,0.246774,0.183835,0.410921,0.158470
4491623,3977253,NORTE SANTANDER,CUCUTA,42.0,M,Comunitaria,0,41 a 64 años,Leve,Leve,0.270181,0.201273,0.355044,0.173502
5563379,6291110,BOGOTA,BOGOTA,83.0,F,Comunitaria,0,65 años y más,Leve,Fallecido,0.446938,0.080476,0.124797,0.347789
3266033,3635939,CHOCO,QUIBDO,12.0,M,Comunitaria,0,12 a 25 años,Leve,Moderado,0.076690,0.263141,0.316201,0.343968


In [85]:
#xd.to_csv("testing.csv")