In [2]:
import pandas as pd
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.base import BaseEstimator, TransformerMixin

In [3]:
# Cargar datos preprocesados
covid_data = pd.read_csv("../covid19_clean_data.csv", index_col=0)
covid_data.shape

(6349701, 9)

In [4]:
# Separar las caracteristicas de las clases 
X = covid_data.drop("estado", axis=1)
y = covid_data["estado"]

In [5]:
# Crear datasets de entrenamiento(80%) y validación(20%). 
X_train, X_test, y_train, y_test = train_test_split(
   X, y, test_size=0.2, random_state=42
)

In [6]:
# Definición de variables categoricas a usar en el modelo
    # Se excluye la variable de municipio pues el modelo considera la variable no relevante
categorical_features = ['sexo', 'tipo_contagio', 'rango_edad','asintomatico']#,'departamento','municipio'

# Definición de variables númericas a usar en el modelo
    # Excluye asintomatico pues el modelo considera la variable no relevante
    # Excluye id_caso pues no infiere en la clasificacón del estado
numerical_features = ['edad'] 

# Preprocesamiento de las caracteristicas categoricas y númericas definidas previamente
preprocessor = ColumnTransformer(
   transformers=[
       ("cat", OneHotEncoder(), categorical_features),
       ("num", StandardScaler(), numerical_features),
   ]
)

# Creación de flujo de trabajo que seguirá el modelo

In [7]:
pipeline = ImbPipeline(
   [
       ('undersampling', RandomUnderSampler(sampling_strategy = 'majority', random_state=42)),
       ("preprocessor", preprocessor),
       ("classifier", RandomForestClassifier(random_state=42))
   ],
   verbose=True
)

In [8]:
grid =  GridSearchCV(
    pipeline,
    param_grid = { 
        'classifier__n_estimators': [50,100,150], 
        'classifier__max_depth': [3, 6, 9], 
        'classifier__max_leaf_nodes': [3, 6, 9]},
    n_jobs=-1,
    cv=3,
    error_score='raise'
    
)

In [9]:
# Fit the grid on the training data
grid.fit(X_train, y_train)

[Pipeline] ..... (step 1 of 3) Processing undersampling, total=  53.0s
[Pipeline] ...... (step 2 of 3) Processing preprocessor, total=   0.3s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   9.8s


In [26]:
# Save pipeline for future use without fitting
joblib.dump(grid, 'RandomForest_GridSearch_RandomUnderSampling.joblib')

['RandomForest_GridSearch_RandomUnderSampling.joblib']

In [11]:
grid.predict(X_test)

array(['Fallecido', 'Fallecido', 'Fallecido', ..., 'Fallecido',
       'Fallecido', 'Fallecido'], dtype=object)

In [27]:
# Best parameters
print(grid.cv_results_)
print(grid.best_estimator_)

{'mean_fit_time': array([36.11602354, 31.8939352 , 32.89264472, 32.0475409 , 35.4619085 ,
       39.68661253, 32.4645013 , 34.41636952, 30.58216516, 26.15989923,
       27.865237  , 29.92653433, 27.13145566, 28.71409233, 30.59904067,
       28.31371133, 31.16075333, 30.86129816, 29.17408403, 33.29485345,
       30.97092915, 28.73178458, 28.48341235, 32.81058423, 44.9598333 ,
       47.74826733, 72.33506497]), 'std_fit_time': array([0.39324906, 2.81790942, 1.7370958 , 1.28099884, 1.81406246,
       2.50600924, 1.19195304, 0.63013566, 1.26723385, 0.59292087,
       0.92630935, 0.28464507, 1.14044639, 1.9146066 , 0.5244003 ,
       0.77099364, 0.26833535, 0.9718227 , 1.96794885, 1.68489475,
       0.53777751, 2.64860191, 1.38820191, 1.26606131, 3.70700289,
       9.1714824 , 1.47619763]), 'mean_score_time': array([16.70767522, 25.76259637, 36.06296221, 17.55362781, 32.96679568,
       39.86087513, 18.26853434, 26.88131126, 34.84853522, 14.69566774,
       21.96897833, 31.22992524, 14.5121

In [13]:
# Fit the tuned_pipeline on the training data
tuned_pipeline = grid.best_estimator_.fit(X_train, y_train)

[Pipeline] ..... (step 1 of 3) Processing undersampling, total=  53.8s
[Pipeline] ...... (step 2 of 3) Processing preprocessor, total=   0.2s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   6.9s


In [14]:
# Predict on the test set
y_pred = tuned_pipeline.predict(X_test)


In [15]:
# Generate classification report
report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Classification Report:
              precision    recall  f1-score   support

   Fallecido       0.02      1.00      0.05     28487
       Grave       0.00      0.00      0.00        23
        Leve       1.00      0.08      0.15   1241208
    Moderado       0.01      0.39      0.01       223

    accuracy                           0.10   1269941
   macro avg       0.26      0.37      0.05   1269941
weighted avg       0.98      0.10      0.15   1269941



  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
features=[]
for columns in X.columns:
    features.append(columns)
    
imp_features = tuned_pipeline.named_steps.classifier.feature_importances_

df_imp_features = pd.DataFrame({"features":features}).join(pd.DataFrame({"weights":imp_features}))
df_imp_features.sort_values(by=['weights'], ascending=False)

Unnamed: 0,features,weights
5,tipo_contagio,0.385312
7,rango_edad,0.021997
2,municipio,0.014877
4,sexo,0.010568
6,asintomatico,0.010001
1,departamento,0.002068
0,id_caso,0.001695
3,edad,0.0


In [17]:
y_proba= pipeline.predict_proba(X_test)
probabilities = pd.DataFrame(y_proba, index=y_test.index)
probabilities = probabilities.rename(
    columns={
        0:pipeline.classes_[0]
        ,1:pipeline.classes_[1]
        ,2:pipeline.classes_[2]
        ,3:pipeline.classes_[3]})
xd= pd.concat([X_test,y_test,pd.Series(y_pred, index=y_test.index),probabilities],axis=1)
xd = xd.rename(columns={"estado":"y_test",0:"y_pred2"})

NotFittedError: This ColumnTransformer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
#xd.to_csv("testing.csv")