In [1]:
import pandas as pd
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# Cargar datos preprocesados
covid_data = pd.read_csv("../covid19_clean_data.csv", index_col=0)
covid_data.shape

  covid_data = pd.read_csv("../covid19_clean_data.csv", index_col=0)


(6349701, 11)

In [3]:
# Separar las caracteristicas de las clases 
X = covid_data.drop("estado", axis=1)
y = covid_data["estado"]

In [4]:
# Forzar formateo de categoria_departamento como String
X['categoria_departamento'] = X['categoria_departamento'].astype('str')

In [5]:
# Crear datasets de entrenamiento(80%) y validación(20%). 
X_train, X_test, y_train, y_test = train_test_split(
   X, y, test_size=0.2, random_state=42
)

In [6]:
# Definición de variables categoricas a usar en el modelo
    # Se excluyen las variables departamento y municipio, se usaran las categorias
categorical_features = ['categoria_departamento','sexo', 'tipo_contagio', 'asintomatico','rango_edad']#,'departamento','municipio'

# Definición de variables númericas a usar en el modelo
    # Excluye asintomatico pues el modelo considera la variable no relevante
    # Excluye id_caso pues no infiere en la clasificacón del estado
numerical_features = ['poblacion_departamento','edad'] 

# Preprocesamiento de las caracteristicas categoricas y númericas definidas previamente
preprocessor = ColumnTransformer(
   transformers=[
       ("cat", OneHotEncoder(), categorical_features),
       ("num", StandardScaler(), numerical_features),
   ]
)

# Creación de flujo de trabajo que seguirá el modelo

In [7]:
pipeline = ImbPipeline(
   [
       ('undersampling', RandomUnderSampler(sampling_strategy = 'majority', random_state=42)),
       ("preprocessor", preprocessor),
       ("classifier", RandomForestClassifier(n_jobs=-1, random_state=42))
   ],
   verbose=True,
)

In [8]:
grid =  GridSearchCV(
    pipeline,
    param_grid = { 
        'classifier__n_estimators': [50,100,150], 
        'classifier__max_depth': [3, 6, 9], 
        'classifier__max_leaf_nodes': [3, 6, 9]},
    n_jobs=-1,
    cv=2,
    error_score='raise'
)

In [9]:
# Fit the grid on the training data
grid.fit(X_train, y_train)

[Pipeline] ..... (step 1 of 3) Processing undersampling, total=  31.3s
[Pipeline] ...... (step 2 of 3) Processing preprocessor, total=   0.4s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   4.1s


In [10]:
# Save pipeline for future use without fitting
joblib.dump(grid, 'RandomForest_GridSearch_RandomUnderSampling.joblib')

['RandomForest_GridSearch_RandomUnderSampling.joblib']

In [11]:
grid.predict(X_test)

array(['Fallecido', 'Fallecido', 'Fallecido', ..., 'Fallecido',
       'Fallecido', 'Fallecido'], dtype=object)

In [12]:
# Best parameters
print(grid.cv_results_)
print(grid.best_estimator_)

{'mean_fit_time': array([20.78472793, 35.32220459, 28.1670295 , 36.28584921, 20.05219018,
       32.11265481, 32.11969388, 32.2446053 , 38.11341131, 39.69944859,
       25.17485571, 31.84744728, 34.60735548, 31.03336394, 41.50060463,
       43.57589817, 28.92204976, 36.0120405 , 35.71635282, 29.80846775,
       38.8411988 , 38.97884107, 30.34528553, 34.98208773, 37.36979926,
       30.27296185, 35.50377178]), 'std_fit_time': array([2.07973397, 4.25058579, 0.95265663, 8.6304189 , 0.99789393,
       3.83177269, 4.67645776, 0.06995416, 1.18419278, 0.39010525,
       1.03674936, 1.80164564, 6.02750194, 0.92516482, 1.17577243,
       0.39903593, 0.37499452, 1.7407254 , 4.44175875, 1.70551407,
       2.12069762, 0.78052473, 1.31709015, 2.16713727, 4.33972132,
       2.63853431, 0.49175572]), 'mean_score_time': array([32.60112858, 36.33804917, 64.25947022, 37.8150202 , 49.71518207,
       65.90566361, 28.36364031, 43.23705232, 55.48078084, 29.67108274,
       44.98653972, 64.53813028, 26.0266

In [13]:
# Fit the tuned_pipeline on the training data
tuned_pipeline = grid.best_estimator_.fit(X_train, y_train)

[Pipeline] ..... (step 1 of 3) Processing undersampling, total=  30.0s
[Pipeline] ...... (step 2 of 3) Processing preprocessor, total=   0.3s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   3.6s


In [14]:
# Predict on the test set
y_pred = tuned_pipeline.predict(X_test)


In [15]:
# Generate classification report
report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Classification Report:
              precision    recall  f1-score   support

   Fallecido       0.02      1.00      0.05     28406
       Grave       0.00      0.00      0.00        20
        Leve       1.00      0.08      0.14   1241299
    Moderado       0.01      0.42      0.01       216

    accuracy                           0.10   1269941
   macro avg       0.26      0.37      0.05   1269941
weighted avg       0.98      0.10      0.14   1269941



  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
print(tuned_pipeline.named_steps.preprocessor.feature_names_in_)
print(tuned_pipeline.named_steps.preprocessor.get_feature_names_out())
print(tuned_pipeline.named_steps.classifier.n_features_in_)

['id_caso' 'departamento' 'municipio' 'poblacion_departamento'
 'categoria_departamento' 'edad' 'sexo' 'tipo_contagio' 'asintomatico'
 'rango_edad']
['cat__categoria_departamento_1' 'cat__categoria_departamento_2'
 'cat__categoria_departamento_3' 'cat__categoria_departamento_4'
 'cat__categoria_departamento_ESP' 'cat__sexo_F' 'cat__sexo_M'
 'cat__tipo_contagio_Comunitaria' 'cat__tipo_contagio_Importado'
 'cat__tipo_contagio_Relacionado' 'cat__asintomatico_No'
 'cat__asintomatico_Si' 'cat__rango_edad_0 a 4 años'
 'cat__rango_edad_12 a 25 años' 'cat__rango_edad_26 a 40 años'
 'cat__rango_edad_41 a 64 años' 'cat__rango_edad_5 a 11 años'
 'cat__rango_edad_65 años y más' 'num__poblacion_departamento' 'num__edad']
20


In [17]:
features=[]
for columns in X.columns:
    features.append(columns)
    
imp_features = tuned_pipeline.named_steps.classifier.feature_importances_

df_imp_features = pd.DataFrame({"features":features}).join(pd.DataFrame({"weights":imp_features}))
df_imp_features.sort_values(by=['weights'], ascending=False)

Unnamed: 0,features,weights
9,rango_edad,0.014394
7,tipo_contagio,0.007613
1,departamento,0.003797
4,categoria_departamento,0.003482
5,edad,0.002776
6,sexo,0.002205
0,id_caso,0.001768
8,asintomatico,0.001438
2,municipio,0.001053
3,poblacion_departamento,0.000817


In [18]:
y_proba= tuned_pipeline.predict_proba(X_test)
probabilities = pd.DataFrame(y_proba, index=y_test.index)
probabilities = probabilities.rename(
    columns={
        0:tuned_pipeline.classes_[0]
        ,1:tuned_pipeline.classes_[1]
        ,2:tuned_pipeline.classes_[2]
        ,3:tuned_pipeline.classes_[3]})
xd= pd.concat([X_test,y_test,pd.Series(y_pred, index=y_test.index),probabilities],axis=1)
xd

Unnamed: 0,id_caso,departamento,municipio,poblacion_departamento,categoria_departamento,edad,sexo,tipo_contagio,asintomatico,rango_edad,estado,0,Fallecido,Grave,Leve,Moderado
3738032,5922217,BOGOTA D.C.,BOGOTA,7873316,ESP,10.0,F,Comunitaria,No,5 a 11 años,Leve,Fallecido,0.698623,0.018164,0.006986,0.276227
1794956,5795099,DEPARTAMENTO DE SANTANDER,BUCARAMANGA,2335238,1,26.0,M,Comunitaria,No,26 a 40 años,Leve,Fallecido,0.989739,0.000777,0.002126,0.007358
718164,2546441,DEPARTAMENTO DE ANTIOQUIA,ITAGUI,6787846,ESP,23.0,F,Comunitaria,No,12 a 25 años,Leve,Fallecido,0.958456,0.004231,0.006869,0.030444
5907795,2276824,DEPARTAMENTO DE ATLANTICO,CANDELARIA,2774958,1,69.0,F,Comunitaria,No,65 años y más,Fallecido,Fallecido,0.994361,0.000517,0.000521,0.004600
4220823,6002612,DEPARTAMENTO DE CUNDINAMARCA,SOACHA,3334637,ESP,34.0,F,Relacionado,No,26 a 40 años,Leve,Fallecido,0.991030,0.000663,0.002502,0.005806
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6185378,1707625,DEPARTAMENTO DE BOYACA,MONIQUIRA,1285035,1,38.0,F,Comunitaria,No,26 a 40 años,Leve,Fallecido,0.989104,0.000960,0.001620,0.008316
4282155,3167828,DEPARTAMENTO DE CAQUETA,FLORENCIA,421797,4,42.0,M,Comunitaria,No,41 a 64 años,Leve,Fallecido,0.994637,0.000478,0.000707,0.004178
1280055,481515,DEPARTAMENTO DE ANTIOQUIA,MEDELLIN,6787846,ESP,41.0,M,Comunitaria,No,41 a 64 años,Leve,Fallecido,0.994620,0.000480,0.000708,0.004193
5115709,3754662,DEPARTAMENTO DE TOLIMA,IBAGUE,1367802,2,47.0,F,Relacionado,No,41 a 64 años,Leve,Fallecido,0.995244,0.000408,0.000874,0.003474


In [19]:
#xd.to_csv("testing.csv")