In [1]:
import pandas as pd
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# Cargar datos preprocesados
covid_data = pd.read_csv("../covid19_clean_data.csv", index_col=0)
covid_data.shape

  covid_data = pd.read_csv("../covid19_clean_data.csv", index_col=0)


(6349701, 11)

In [3]:
# Separar las caracteristicas de las clases 
X = covid_data.drop("estado", axis=1)
y = covid_data["estado"]

In [4]:
# Forzar formateo de categoria_departamento como String
X['categoria_departamento'] = X['categoria_departamento'].astype('str')

In [5]:
# Crear datasets de entrenamiento(80%) y validación(20%). 
X_train, X_test, y_train, y_test = train_test_split(
   X, y, test_size=0.2, random_state=42
)

In [6]:
# Definición de variables categoricas a usar en el modelo
    # Se excluyen las variables departamento y municipio, se usaran las categorias
categorical_features = ['categoria_departamento','sexo', 'tipo_contagio', 'asintomatico','rango_edad']#,'departamento','municipio'

# Definición de variables númericas a usar en el modelo
    # Excluye asintomatico pues el modelo considera la variable no relevante
    # Excluye id_caso pues no infiere en la clasificacón del estado
numerical_features = ['poblacion_departamento','edad'] 

# Preprocesamiento de las caracteristicas categoricas y númericas definidas previamente
preprocessor = ColumnTransformer(
   transformers=[
       ("cat", OneHotEncoder(), categorical_features),
       ("num", StandardScaler(), numerical_features),
   ]
)

# Creación de flujo de trabajo que seguirá el modelo

In [8]:
pipeline = ImbPipeline(
   [
       ('undersampling', RandomUnderSampler(sampling_strategy = 'majority', random_state=42)),
       ("preprocessor", preprocessor),
       ("classifier", HistGradientBoostingClassifier(random_state=42))
   ],
   verbose=True
)

In [9]:
grid =  GridSearchCV(
    pipeline,
    param_grid = { 
        'classifier__max_iter': [50, 100, 150],
        'classifier__learning_rate': [0.5, 0.1, 0.01], 
        'classifier__max_depth': [3, 9, None]}, 
    n_jobs=-1,
    cv=2,
    error_score='raise'
)

In [10]:
# Fit the grid on the training data
grid.fit(X_train, y_train)

[Pipeline] ..... (step 1 of 3) Processing undersampling, total=  26.3s
[Pipeline] ...... (step 2 of 3) Processing preprocessor, total=   0.3s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   1.2s


In [11]:
# Save pipeline for future use without fitting
joblib.dump(grid, 'GradientBoosting_GridSearch_RandomUnderSampling.joblib')

['GradientBoosting_GridSearch_RandomUnderSampling.joblib']

In [12]:
grid.predict(X_test)

array(['Fallecido', 'Fallecido', 'Fallecido', ..., 'Fallecido',
       'Fallecido', 'Grave'], dtype=object)

In [13]:
# Best parameters
print(grid.cv_results_)
print(grid.best_estimator_)
print(grid.best_score_)

{'mean_fit_time': array([18.34695613, 17.70805728, 16.68869352, 19.86729026, 21.94318295,
       19.46460104, 21.73116434, 20.18914604, 27.59866953, 25.69423139,
       22.04509962, 19.52845478, 21.91300797, 19.39580798, 21.48638332,
       20.8942945 , 21.86262155, 19.55673099, 22.56236863, 26.36906612,
       27.0415715 , 23.3613081 , 27.64988685, 30.16331327, 23.39307272,
       25.88127983, 29.48494852]), 'std_fit_time': array([0.36592257, 0.42961848, 0.05443382, 1.38486433, 0.58404231,
       0.5476172 , 0.08518112, 0.89780045, 2.15936947, 2.23493564,
       0.61143982, 0.43269324, 0.68261099, 0.71209288, 0.26943362,
       0.35445905, 0.0231936 , 0.5766046 , 0.01916647, 0.46265614,
       0.47960365, 0.49620771, 0.917377  , 4.34829199, 0.62839305,
       1.4315244 , 3.76557171]), 'mean_score_time': array([ 24.01712692,  21.85100484,  30.33138156,  35.76592755,
        31.37803483,  33.92804265,  30.47198653,  33.98129845,
        34.38692045,  27.34317207,  23.86992455,  25.95015

In [14]:
# Fit the tuned_pipeline on the training data
tuned_pipeline = grid.best_estimator_.fit(X_train, y_train)

[Pipeline] ..... (step 1 of 3) Processing undersampling, total=  27.3s
[Pipeline] ...... (step 2 of 3) Processing preprocessor, total=   0.2s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   0.9s


In [15]:
# Predict on the test set
y_pred = tuned_pipeline.predict(X_test)

In [16]:
# Generate classification report
report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report)


Classification Report:
              precision    recall  f1-score   support

   Fallecido       0.03      0.92      0.05     28406
       Grave       0.00      0.00      0.00        20
        Leve       0.99      0.03      0.06   1241299
    Moderado       0.00      0.12      0.00       216

    accuracy                           0.05   1269941
   macro avg       0.25      0.27      0.03   1269941
weighted avg       0.97      0.05      0.06   1269941



In [17]:
print(tuned_pipeline.named_steps.preprocessor.feature_names_in_)
print(tuned_pipeline.named_steps.preprocessor.get_feature_names_out())
print(tuned_pipeline.named_steps.classifier.n_features_in_)

['id_caso' 'departamento' 'municipio' 'poblacion_departamento'
 'categoria_departamento' 'edad' 'sexo' 'tipo_contagio' 'asintomatico'
 'rango_edad']
['cat__categoria_departamento_1' 'cat__categoria_departamento_2'
 'cat__categoria_departamento_3' 'cat__categoria_departamento_4'
 'cat__categoria_departamento_ESP' 'cat__sexo_F' 'cat__sexo_M'
 'cat__tipo_contagio_Comunitaria' 'cat__tipo_contagio_Importado'
 'cat__tipo_contagio_Relacionado' 'cat__asintomatico_No'
 'cat__asintomatico_Si' 'cat__rango_edad_0 a 4 años'
 'cat__rango_edad_12 a 25 años' 'cat__rango_edad_26 a 40 años'
 'cat__rango_edad_41 a 64 años' 'cat__rango_edad_5 a 11 años'
 'cat__rango_edad_65 años y más' 'num__poblacion_departamento' 'num__edad']
20


In [18]:
features=[]
for columns in X.columns:
    features.append(columns)
    
imp_features = tuned_pipeline.named_steps.classifier.feature_importances_

df_imp_features = pd.DataFrame({"features":features}).join(pd.DataFrame({"weights":imp_features}))
df_imp_features.sort_values(by=['weights'], ascending=False)

AttributeError: 'HistGradientBoostingClassifier' object has no attribute 'feature_importances_'