In [1]:
import pandas as pd
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import RandomOverSampler

In [2]:
# Cargar datos preprocesados
covid_data = pd.read_csv("../covid19_clean_data.csv", index_col=0).sample(1000000, random_state=42)
covid_data.shape

  covid_data = pd.read_csv("../covid19_clean_data.csv", index_col=0).sample(1000000, random_state=42)


(1000000, 11)

In [3]:
# Separar las caracteristicas de las clases 
X = covid_data.drop("estado", axis=1)
y = covid_data["estado"]

In [4]:
# Forzar formateo de categoria_departamento como String
X['categoria_departamento'] = X['categoria_departamento'].astype('str')

In [5]:
# Crear datasets de entrenamiento(80%) y validación(20%). 
X_train, X_test, y_train, y_test = train_test_split(
   X, y, test_size=0.2, random_state=42
)

In [6]:
# Definición de variables categoricas a usar en el modelo
    # Se excluyen las variables departamento y municipio, se usaran las categorias
categorical_features = ['categoria_departamento','sexo', 'tipo_contagio', 'asintomatico','rango_edad']#,'departamento','municipio'

# Definición de variables númericas a usar en el modelo
    # Excluye asintomatico pues el modelo considera la variable no relevante
    # Excluye id_caso pues no infiere en la clasificacón del estado
numerical_features = ['poblacion_departamento','edad'] 

# Preprocesamiento de las caracteristicas categoricas y númericas definidas previamente
preprocessor = ColumnTransformer(
   transformers=[
       ("cat", OneHotEncoder(), categorical_features),
       ("num", StandardScaler(), numerical_features),
   ]
)

# Creación de flujo de trabajo que seguirá el modelo

In [7]:
pipeline = ImbPipeline(
   [
       ("preprocessor", preprocessor),
       ('oversampling', RandomOverSampler(random_state=42)),
       ("classifier", RandomForestClassifier(n_jobs=-1, random_state=42))
   ],
   verbose=True,
)

In [8]:
grid =  GridSearchCV(
    pipeline,
    param_grid = { 
        'classifier__n_estimators': [50,100,150], 
        'classifier__max_depth': [3, 6, 9], 
        'classifier__max_leaf_nodes': [3, 6, 9]},
    n_jobs=-1,
    cv=2,
    error_score='raise'
)

In [9]:
# Fit the grid on the training data
grid.fit(X_train, y_train)

[Pipeline] ...... (step 1 of 3) Processing preprocessor, total=   1.5s
[Pipeline] ...... (step 2 of 3) Processing oversampling, total=   3.7s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  59.9s


In [10]:
# Save pipeline for future use without fitting
joblib.dump(grid, 'RandomForest_GridSearch_RandomOverSampling.joblib')

['RandomForest_GridSearch_RandomOverSampling.joblib']

In [11]:
grid.predict(X_test)

array(['Leve', 'Leve', 'Leve', ..., 'Leve', 'Leve', 'Leve'], dtype=object)

In [12]:
# Best parameters
print(grid.cv_results_)
print(grid.best_estimator_)
print(grid.best_score_)

{'mean_fit_time': array([ 62.62612724, 109.99016058, 145.02571321,  71.85981524,
       128.28808379, 180.41925204,  75.22957826, 120.1641072 ,
       171.53722644,  71.85742307, 132.79659188, 183.03341532,
        90.02853072, 169.79465842, 250.69678044, 111.4501555 ,
       186.09297013, 267.74423397,  80.37286758, 121.10338068,
       159.24372327,  73.57726812, 141.22850108, 208.19957876,
        96.9151758 , 153.4686116 , 194.52457607]), 'std_fit_time': array([1.38389111, 0.55209386, 0.00706005, 0.72473323, 0.16569829,
       0.36487544, 0.51997757, 3.67223155, 1.70773816, 0.1284523 ,
       0.02358592, 4.48285985, 1.10142195, 0.9531858 , 2.36917996,
       3.03295255, 1.03748536, 2.7113229 , 0.18406272, 0.80094361,
       0.51660645, 0.65229869, 1.04796028, 1.51580071, 0.82148349,
       0.12311137, 0.31336725]), 'mean_score_time': array([ 6.62002671,  7.46943748,  8.4520061 ,  9.10727346, 10.48616314,
       11.5009079 ,  3.70943236, 11.03016913, 14.44621706,  5.60467148,
      

In [13]:
# Fit the tuned_pipeline on the training data
tuned_pipeline = grid.best_estimator_.fit(X_train, y_train)

[Pipeline] ...... (step 1 of 3) Processing preprocessor, total=   2.1s
[Pipeline] ...... (step 2 of 3) Processing oversampling, total=   6.3s
[Pipeline] ........ (step 3 of 3) Processing classifier, total= 1.0min


In [14]:
# Predict on the test set
y_pred = tuned_pipeline.predict(X_test)


In [15]:
# Generate classification report
report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Classification Report:
              precision    recall  f1-score   support

   Fallecido       0.14      0.65      0.23      4516
       Grave       0.00      0.00      0.00         1
        Leve       0.99      0.87      0.92    195443
    Moderado       0.00      0.42      0.00        40

    accuracy                           0.86    200000
   macro avg       0.28      0.49      0.29    200000
weighted avg       0.97      0.86      0.91    200000



  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
print(tuned_pipeline.named_steps.preprocessor.feature_names_in_)
print(tuned_pipeline.named_steps.preprocessor.get_feature_names_out())
print(tuned_pipeline.named_steps.classifier.n_features_in_)

['id_caso' 'departamento' 'municipio' 'poblacion_departamento'
 'categoria_departamento' 'edad' 'sexo' 'tipo_contagio' 'asintomatico'
 'rango_edad']
['cat__categoria_departamento_1' 'cat__categoria_departamento_2'
 'cat__categoria_departamento_3' 'cat__categoria_departamento_4'
 'cat__categoria_departamento_ESP' 'cat__sexo_F' 'cat__sexo_M'
 'cat__tipo_contagio_Comunitaria' 'cat__tipo_contagio_Importado'
 'cat__tipo_contagio_Relacionado' 'cat__asintomatico_No'
 'cat__asintomatico_Si' 'cat__rango_edad_0 a 4 años'
 'cat__rango_edad_12 a 25 años' 'cat__rango_edad_26 a 40 años'
 'cat__rango_edad_41 a 64 años' 'cat__rango_edad_5 a 11 años'
 'cat__rango_edad_65 años y más' 'num__poblacion_departamento' 'num__edad']
20


In [17]:
features=[]
for columns in X.columns:
    features.append(columns)
    
imp_features = tuned_pipeline.named_steps.classifier.feature_importances_

df_imp_features = pd.DataFrame({"features":features}).join(pd.DataFrame({"weights":imp_features}))
df_imp_features.sort_values(by=['weights'], ascending=False)

Unnamed: 0,features,weights
7,tipo_contagio,0.110949
9,rango_edad,0.038605
3,poblacion_departamento,0.002762
2,municipio,0.002431
6,sexo,0.001536
1,departamento,0.000929
0,id_caso,0.0
4,categoria_departamento,0.0
5,edad,0.0
8,asintomatico,0.0


In [18]:
y_proba= tuned_pipeline.predict_proba(X_test)
probabilities = pd.DataFrame(y_proba, index=y_test.index)
probabilities = probabilities.rename(
    columns={
        0:tuned_pipeline.classes_[0]
        ,1:tuned_pipeline.classes_[1]
        ,2:tuned_pipeline.classes_[2]
        ,3:tuned_pipeline.classes_[3]})
xd= pd.concat([X_test,y_test,pd.Series(y_pred, index=y_test.index),probabilities],axis=1)
xd

Unnamed: 0,id_caso,departamento,municipio,poblacion_departamento,categoria_departamento,edad,sexo,tipo_contagio,asintomatico,rango_edad,estado,0,Fallecido,Grave,Leve,Moderado
1052451,3607325,DEPARTAMENTO DE ANTIOQUIA,ITAGUI,6787846,ESP,61.0,M,Relacionado,No,41 a 64 años,Leve,Leve,0.328130,0.140529,0.408746,0.122595
5535721,2337451,DEPARTAMENTO DE SUCRE,SINCELEJO,980942,3,54.0,F,Comunitaria,No,41 a 64 años,Leve,Leve,0.269364,0.211786,0.328396,0.190454
304409,4720311,DEPARTAMENTO DE VALLE DEL CAUCA,JAMUNDI,4626064,ESP,18.0,F,Comunitaria,No,12 a 25 años,Leve,Leve,0.216124,0.258784,0.308547,0.216545
1543446,3540325,DEPARTAMENTO DE SANTANDER,PIEDECUESTA,2335238,1,59.0,F,Relacionado,No,41 a 64 años,Leve,Leve,0.328130,0.140529,0.408746,0.122595
5585141,3410043,"CARTAGENA DE INDIAS, DISTRITO TURISTICO Y CULT...",CARTAGENA,1043185,ESP,11.0,M,Comunitaria,No,5 a 11 años,Leve,Moderado,0.137644,0.247387,0.223587,0.391382
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3327865,4016184,BOGOTA D.C.,BOGOTA,7873316,ESP,22.0,M,Comunitaria,Si,12 a 25 años,Leve,Leve,0.184348,0.211178,0.435427,0.169047
5346803,2160620,"BARRANQUILLA, DISTRITO ESPECIAL, INDUSTRIAL Y ...",BARRANQUILLA,1310163,ESP,27.0,F,Comunitaria,No,26 a 40 años,Leve,Leve,0.202357,0.166909,0.461741,0.168994
2594006,568068,BOGOTA D.C.,BOGOTA,7873316,ESP,59.0,F,Comunitaria,No,41 a 64 años,Leve,Leve,0.281633,0.225138,0.295208,0.198022
2748597,2223381,BOGOTA D.C.,BOGOTA,7873316,ESP,28.0,F,Comunitaria,No,26 a 40 años,Leve,Leve,0.202357,0.166909,0.461741,0.168994


In [19]:
#xd.to_csv("testing.csv")