In [147]:
import pandas as pd
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import RandomOverSampler

In [125]:
# Cargar datos preprocesados
covid_data = pd.read_csv("../covid19_clean_data.csv", index_col=0).sample(100000, random_state=2)
covid_data.shape

(100000, 9)

In [126]:
# Separar las caracteristicas de las clases 
X = covid_data.drop("estado", axis=1)
y = covid_data["estado"]

In [127]:
# Crear datasets de entrenamiento(80%) y validación(20%). 
X_train, X_test, y_train, y_test = train_test_split(
   X, y, test_size=0.2, random_state=42
)

In [128]:
# Definición de variables categoricas a usar en el modelo
    # Se excluye la variable de municipio pues el modelo considera la variable no relevante
categorical_features = ['sexo', 'tipo_contagio', 'rango_edad','asintomatico']#,'departamento','municipio'

# Definición de variables númericas a usar en el modelo
    # Excluye asintomatico pues el modelo considera la variable no relevante
    # Excluye id_caso pues no infiere en la clasificacón del estado
numerical_features = ['edad'] 

# Preprocesamiento de las caracteristicas categoricas y númericas definidas previamente
preprocessor = ColumnTransformer(
   transformers=[
       ("cat", OneHotEncoder(), categorical_features),
       ("num", StandardScaler(), numerical_features),
   ]
)

# Creación de flujo de trabajo que seguirá el modelo

In [148]:
pipeline = ImbPipeline(
   [
       ("preprocessor", preprocessor),
       ('oversampling', RandomOverSampler()),
       ("classifier", RandomForestClassifier(random_state=42))
   ],
   verbose=True
)

In [151]:
grid =  GridSearchCV(
    pipeline,
    param_grid = { 
        'classifier__n_estimators': [50,100,150], 
        'classifier__max_depth': [3, 6, 9], 
        'classifier__max_leaf_nodes': [3, 6, 9]},
    n_jobs=-1,
    cv=3,
    error_score='raise'
    #iverbose=4
)

In [152]:
# Fit the grid on the training data
grid.fit(X_train, y_train)

[Pipeline] ...... (step 1 of 3) Processing preprocessor, total=   0.1s
[Pipeline] ...... (step 2 of 3) Processing oversampling, total=   0.4s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  16.3s


In [165]:
# Save pipeline for future use without fitting
joblib.dump(grid, 'RandomForest_GridSearch_RandomOverSampling.joblib')

['RandomForest_GridSearch_RandomOverSampling.joblib']

In [155]:
grid.predict(X_test)

array(['Leve', 'Leve', 'Leve', ..., 'Fallecido', 'Leve', 'Fallecido'],
      dtype=object)

In [164]:
# Best parameters
print(grid.cv_results_)
print(grid.best_estimator_)

{'mean_fit_time': array([11.27093561, 19.39559253, 25.72451901,  9.66670219, 19.64556265,
       25.21744061,  8.83514285, 16.25232673, 23.37138033,  8.20170331,
       13.44751159, 18.46709291,  8.06751744, 12.36587516, 17.54572399,
        6.9232897 , 12.82730039, 19.38763491,  5.30468718, 10.31738893,
       17.41604964,  8.97969302, 15.91175977, 20.29821253,  8.02693486,
       14.47627123, 16.89418523]), 'std_fit_time': array([0.19360144, 0.07574426, 0.16341651, 0.34617777, 0.14586345,
       1.28359847, 0.37301602, 0.11186837, 0.10434661, 0.10146147,
       0.19647205, 0.49740793, 0.18301008, 0.37279605, 0.1224132 ,
       0.25064217, 0.24934115, 0.71145657, 0.17070546, 0.265498  ,
       0.75057071, 0.70083587, 0.65433948, 0.51238989, 0.14758009,
       0.63923467, 1.67058906]), 'mean_score_time': array([0.4771467 , 0.85800354, 1.07475344, 0.62348541, 1.07168889,
       0.93537013, 0.41295147, 0.72724207, 0.8958226 , 0.4058884 ,
       0.58963124, 0.6420691 , 0.3059024 , 0.55856

In [158]:
# Fit the tuned_pipeline on the training data
tuned_pipeline = grid.best_estimator_.fit(X_train, y_train)

[Pipeline] ...... (step 1 of 3) Processing preprocessor, total=   0.1s
[Pipeline] ...... (step 2 of 3) Processing oversampling, total=   0.3s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  10.6s


In [159]:
# Predict on the test set
y_pred = tuned_pipeline.predict(X_test)


In [160]:
# Generate classification report
report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report)


  _warn_prf(average, modifier, msg_start, len(result))



Classification Report:
              precision    recall  f1-score   support

   Fallecido       0.13      0.64      0.22       453
       Grave       0.00      0.00      0.00         2
        Leve       0.99      0.85      0.92     19544
    Moderado       0.00      0.00      0.00         1

    accuracy                           0.85     20000
   macro avg       0.28      0.37      0.28     20000
weighted avg       0.97      0.85      0.90     20000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [161]:
features=[]
for columns in X.columns:
    features.append(columns)
    
imp_features = tuned_pipeline.named_steps.classifier.feature_importances_

df_imp_features = pd.DataFrame({"features":features}).join(pd.DataFrame({"weights":imp_features}))
df_imp_features.sort_values(by=['weights'], ascending=False)

Unnamed: 0,features,weights
5,tipo_contagio,0.18312
7,rango_edad,0.05956
4,sexo,0.050481
2,municipio,0.040383
6,asintomatico,0.02569
0,id_caso,0.001297
1,departamento,0.000241
3,edad,0.0


In [162]:
y_proba= tuned_pipeline.predict_proba(X_test)
probabilities = pd.DataFrame(y_proba, index=y_test.index)
probabilities = probabilities.rename(
    columns={
        0:tuned_pipeline.classes_[0]
        ,1:tuned_pipeline.classes_[1]
        ,2:tuned_pipeline.classes_[2]
        #,3:tuned_pipeline.classes_[3]
        })
xd= pd.concat([X_test,y_test,pd.Series(y_pred, index=y_test.index),probabilities],axis=1)
xd

Unnamed: 0,id_caso,departamento,municipio,edad,sexo,tipo_contagio,asintomatico,rango_edad,estado,0,Fallecido,Leve,Moderado
3738759,4450850,BOYACA,PAIPA,35.0,M,Comunitaria,0,26 a 40 años,Leve,Leve,0.260605,0.465365,0.274030
987160,3227837,ANTIOQUIA,BELLO,32.0,M,Relacionado,0,26 a 40 años,Leve,Leve,0.276237,0.531055,0.192707
6326184,5397980,ANTIOQUIA,MEDELLIN,44.0,F,Comunitaria,0,41 a 64 años,Leve,Leve,0.347886,0.438213,0.213901
5764593,5862162,BOGOTA,BOGOTA,25.0,F,Comunitaria,0,12 a 25 años,Leve,Leve,0.267206,0.433457,0.299337
3373837,4814022,PUTUMAYO,MOCOA,31.0,F,Relacionado,0,26 a 40 años,Leve,Leve,0.271883,0.532331,0.195786
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3260107,3171797,BARRANQUILLA,BARRANQUILLA,37.0,M,Comunitaria,0,26 a 40 años,Leve,Leve,0.260605,0.465365,0.274030
2946932,1611128,ATLANTICO,SOLEDAD,29.0,M,Comunitaria,0,26 a 40 años,Leve,Leve,0.260605,0.465365,0.274030
1255309,3822457,ANTIOQUIA,SAN JERONIMO,68.0,F,Comunitaria,0,65 años y más,Fallecido,Fallecido,0.523455,0.229296,0.247249
4439355,4075535,BOGOTA,BOGOTA,22.0,F,Comunitaria,1,12 a 25 años,Leve,Leve,0.227648,0.521831,0.250521


In [163]:
#xd.to_csv("testing.csv")