# Random Forest (Reproducible)

Importamos paqueterías

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

Importamos la base de datos

In [4]:
# Location of the combined antibiotic-resistance table (tab-separated values).
# NOTE(review): absolute, user-specific path; adjust it when running elsewhere.
filepath = '/home/jupyter-user7/CAMDA/Camda24_resistance/DataSets/group-2/data/combined_antibiotic_resistance.tsv'
# read_table uses a tab delimiter by default, which matches the .tsv layout.
df = pd.read_table(filepath)

  df = pd.read_csv(filepath, sep='\t')


Preprocesamiento

In [41]:
# Keep only the rows that have no missing value in any column.
df = df.dropna(axis=0, how='any')
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,antibiotic,accession,genus,species,phenotype,mic,3005053,3000830,3003838,3000508,...,3007751-D87Y,3003926-D87Y,3003709-G46S,3004851-A39T,3004832-A501P,3003381-R20H,3003926-S83I,3003381-G121D,3004832-T483S,3004832-A311V
0,meropenem,GCA_002947415,Acinetobacter,baumannii,Resistant,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,meropenem,GCA_002947845,Acinetobacter,baumannii,Resistant,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,meropenem,GCA_002948925,Acinetobacter,baumannii,Resistant,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,meropenem,GCA_002996805,Acinetobacter,baumannii,Resistant,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,meropenem,GCA_003006035,Acinetobacter,baumannii,Resistant,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Modelo de Random Forest

In [11]:
def train_random_forest(df):
    """
    Train a random forest classifier that predicts the 'mic' column.

    The 'phenotype', 'mic' and 'antibiotic' columns are removed from the
    feature matrix. Columns whose values are all in {0, 1} are passed through
    unchanged; any other column that has more than two distinct values, or
    that is not numeric, is one-hot encoded.

    Args:
        df: Input pandas DataFrame with features and labels.

    Returns:
        model: Fitted Pipeline (preprocessing + RandomForestClassifier).
        f1: Weighted F1 score of the model on the held-out 20% test split.
    """

    # 1. Separate features and labels
    X = df.drop(columns=['phenotype', 'mic', "antibiotic"])  # Drop label columns
    y = df['mic']  # Target label

    # 2. Encode 'mic' (target) as integer class ids (it is treated as multiclass)
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # 3. Detect binary and categorical features
    binary_cols = [col for col in X.columns if set(X[col].unique()) <= {0, 1}]
    binary_set = set(binary_cols)
    # A non-numeric column with one or two distinct values used to match neither
    # rule, fell into the passthrough remainder and reached the classifier as raw
    # strings. Treat every non-numeric, non-binary column as categorical.
    multiclass_cols = [
        col for col in X.columns
        if col not in binary_set
        and (len(X[col].unique()) > 2 or not pd.api.types.is_numeric_dtype(X[col]))
    ]

    # 4. Preprocessing: binary columns pass through, categorical ones are one-hot
    #    encoded. handle_unknown='ignore' makes the encoder skip categories that
    #    were not seen during fit instead of raising at prediction time.
    preprocessor = ColumnTransformer(
        transformers=[
            ('binary', 'passthrough', binary_cols),
            ('multiclass', OneHotEncoder(handle_unknown='ignore'), multiclass_cols)
        ], remainder='passthrough')  # Remaining (numeric) columns are passed through

    # 5. Pipeline: preprocessing followed by the random forest
    model_pipeline = Pipeline(steps=[
        ('preprocessing', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42))
    ])

    # 6. Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

    # 7. Train the model
    model_pipeline.fit(X_train, y_train)

    # 8. Predict on the test split and compute the weighted (multiclass) F1 score
    y_pred = model_pipeline.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')

    return model_pipeline, f1

In [12]:
# Fit the random forest on the cleaned frame, then report the fitted pipeline
# and its weighted F1 score on the test split.
model, f1 = train_random_forest(df)
print('Trained model:', model)
print('F1 score:', f1)

Trained model: Pipeline(steps=[('preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('binary', 'passthrough',
                                                  ['3000830', '3000206',
                                                   '3006880', '3000676',
                                                   '3003576', '3001216',
                                                   '3000237', '3003548',
                                                   '3001889', '3003652',
                                                   '3003899', '3006228',
                                                   '3003900', '3006881',
                                                   '3001866', '3003479',
                                                   '3000166', '3006878',
                                                   '3006874', '3000168',
                                                   '3004290', '3006875',
             

## Entrenamiento y evaluación de los clasificadores RandomForest, SVC y KNeighborsClassifier con preprocesamiento

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline

def train_classifiers(df):
    """
    Train RandomForest, SVM, and KNN classifiers that predict the 'mic' column.

    The 'phenotype', 'mic' and 'antibiotic' columns are removed from the
    feature matrix. Columns whose values are all in {0, 1} are passed through
    unchanged; any other column that has more than two distinct values, or
    that is not numeric, is one-hot encoded. Each classifier gets its own
    preprocessing step, so the returned pipelines do not share fitted state.

    Args:
        df: Input pandas DataFrame with features and labels.

    Returns:
        models: Dictionary mapping 'RandomForest', 'SVM' and 'KNN' to the fitted pipeline.
        f1_scores: Dictionary mapping the same keys to the weighted F1 score on the
            held-out 20% test split.
    """

    # 1. Separate features and labels
    X = df.drop(columns=['phenotype', 'mic', "antibiotic"])  # Drop label columns
    y = df['mic']  # Target label

    # 2. Encode 'mic' (target) as integer class ids (it is treated as multiclass)
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # 3. Detect binary and categorical features
    binary_cols = [col for col in X.columns if set(X[col].unique()) <= {0, 1}]
    binary_set = set(binary_cols)
    # A non-numeric column with one or two distinct values used to match neither
    # rule, fell into the passthrough remainder and reached the classifiers as raw
    # strings. Treat every non-numeric, non-binary column as categorical.
    multiclass_cols = [
        col for col in X.columns
        if col not in binary_set
        and (len(X[col].unique()) > 2 or not pd.api.types.is_numeric_dtype(X[col]))
    ]

    # 4. Preprocessing factory: Pipeline.fit fits its steps in place, so every
    #    pipeline receives a fresh ColumnTransformer instead of a shared instance.
    def _build_preprocessor():
        return ColumnTransformer(
            transformers=[
                ('binary', 'passthrough', binary_cols),
                ('multiclass', OneHotEncoder(handle_unknown='ignore'), multiclass_cols)
            ], remainder='passthrough')  # Remaining (numeric) columns are passed through

    # 5. Classifiers to compare (insertion order defines the order of the results)
    classifiers = {
        'RandomForest': RandomForestClassifier(random_state=42),
        'SVM': SVC(random_state=42),
        'KNN': KNeighborsClassifier(),
    }

    # 6. Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

    # 7. Train each pipeline and compute its weighted F1 score on the test split
    models = {}
    f1_scores = {}
    for name, classifier in classifiers.items():
        pipeline = Pipeline(steps=[
            ('preprocessing', _build_preprocessor()),
            ('classifier', classifier)
        ])
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        models[name] = pipeline
        f1_scores[name] = f1_score(y_test, y_pred, average='weighted')

    return models, f1_scores


In [12]:
# Fit the three baseline classifiers and report the fitted pipelines and
# their weighted F1 scores.
models, f1_scores = train_classifiers(df)
print('Trained models:', models)
print('F1 scores:', f1_scores)

Trained models: {'RandomForest': Pipeline(steps=[('preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('binary', 'passthrough',
                                                  ['3000830', '3000206',
                                                   '3006880', '3000676',
                                                   '3003576', '3001216',
                                                   '3000237', '3003548',
                                                   '3001889', '3003652',
                                                   '3003899', '3006228',
                                                   '3003900', '3006881',
                                                   '3001866', '3003479',
                                                   '3000166', '3002540',
                                                   '3006878', '3006874',
                                                   '3000168', '30042

## Entrenamiento de los tres modelos de clasificación con reajuste de hiperparámetros

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline

def train_classifiers_with_tuning(df):
    """
    Tune RandomForest, SVM, and KNN classifiers with a grid search on the 'mic' column.

    Only 'phenotype' and 'mic' are removed from the feature matrix; 'antibiotic'
    is kept as a feature. Columns whose values are all in {0, 1} are passed
    through unchanged; any other column that has more than two distinct values,
    or that is not numeric, is one-hot encoded.

    Args:
        df: Input pandas DataFrame with features and labels.

    Returns:
        best_models: Dictionary mapping 'RandomForest', 'SVM' and 'KNN' to the best
            pipeline found by the 3-fold grid search (refit on the training split).
        f1_scores: Dictionary mapping the same keys to the weighted F1 score of the
            tuned model on the held-out 20% test split.
    """

    # 1. Separate features and labels ('antibiotic' stays in X as a feature)
    X = df.drop(columns=['phenotype', 'mic'])
    y = df['mic']  # Target label

    # 2. Encode 'mic' (target) as integer class ids (it is treated as multiclass)
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # 3. Detect binary and categorical features
    binary_cols = [col for col in X.columns if set(X[col].unique()) <= {0, 1}]
    binary_set = set(binary_cols)
    # A non-numeric column with one or two distinct values used to match neither
    # rule, fell into the passthrough remainder and reached the classifiers as raw
    # strings. Treat every non-numeric, non-binary column as categorical.
    multiclass_cols = [
        col for col in X.columns
        if col not in binary_set
        and (len(X[col].unique()) > 2 or not pd.api.types.is_numeric_dtype(X[col]))
    ]

    # 4. Preprocessing pipeline (GridSearchCV clones the estimator, so one
    #    unfitted instance can serve as the template for every pipeline)
    preprocessor = ColumnTransformer(
        transformers=[
            ('binary', 'passthrough', binary_cols),
            ('multiclass', OneHotEncoder(handle_unknown='ignore'), multiclass_cols)
        ], remainder='passthrough')

    # 5. Classifiers and their hyperparameter grids
    #    RandomForest: number of trees, maximum depth, minimum samples to split a node.
    #    SVM: regularisation strength C, kernel and gamma.
    #    KNN: number of neighbours and neighbour weighting.
    search_space = {
        'RandomForest': (
            RandomForestClassifier(random_state=42),
            {
                'classifier__n_estimators': [50, 100, 200],
                'classifier__max_depth': [None, 10, 20],
                'classifier__min_samples_split': [2, 5, 10]
            },
        ),
        'SVM': (
            SVC(random_state=42),
            {
                'classifier__C': [0.1, 1, 10],
                'classifier__kernel': ['linear', 'rbf'],
                'classifier__gamma': ['scale', 'auto']
            },
        ),
        'KNN': (
            KNeighborsClassifier(),
            {
                'classifier__n_neighbors': [3, 5, 7],
                'classifier__weights': ['uniform', 'distance']
            },
        ),
    }

    # 6. Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

    # 7. Grid-search each classifier and score the refit best model on the test split
    best_models = {}
    f1_scores = {}
    for name, (classifier, param_grid) in search_space.items():
        pipeline = Pipeline(steps=[
            ('preprocessing', preprocessor),
            ('classifier', classifier)
        ])
        grid = GridSearchCV(pipeline, param_grid, cv=3, scoring='f1_weighted')
        grid.fit(X_train, y_train)
        y_pred = grid.predict(X_test)  # Uses the refit best estimator
        best_models[name] = grid.best_estimator_
        f1_scores[name] = f1_score(y_test, y_pred, average='weighted')

    return best_models, f1_scores

# Train the models with hyperparameter tuning and report the results.
best_models, f1_scores = train_classifiers_with_tuning(df)
print('Best models:', best_models)
print('F1 scores:', f1_scores)




Best models: {'RandomForest': Pipeline(steps=[('preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('binary', 'passthrough',
                                                  ['antibiotic', '3000830',
                                                   '3000206', '3006880',
                                                   '3000676', '3003576',
                                                   '3001216', '3000237',
                                                   '3003548', '3001889',
                                                   '3003652', '3003899',
                                                   '3006228', '3003900',
                                                   '3006881', '3001866',
                                                   '3003479', '3000166',
                                                   '3002540', '3006878',
                                                   '3006874', '30001

In [None]:
# Parámetros para RandomForestClassifier: Ajusta el número de árboles, la profundidad máxima y el número mínimo de muestras para dividir un nodo.
# Parámetros para SVC: Ajusta el parámetro de regularización C, el kernel y el parámetro gamma.
# Parámetros para KNeighborsClassifier: Ajusta el número de vecinos y el tipo de ponderación para los vecinos.

## Entrenamiento de los tres modelos de clasificación con reajuste de hiperparámetros con distintas métricas

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.pipeline import Pipeline
import pandas as pd

def train_classifiers_with_metrics(df):
    """
    Tune RandomForest, SVM, and KNN classifiers on the 'mic' column and report several metrics.

    Object-typed feature columns are one-hot encoded; all other columns are
    passed through unchanged. Each classifier is tuned with a 3-fold grid
    search scored by weighted F1.

    Args:
        df: Input pandas DataFrame with features and labels.

    Returns:
        best_models: Dictionary containing the best tuned pipeline for each classifier.
        metrics: Dictionary mapping each classifier name to a dict with 'accuracy',
            'precision', 'recall' and 'f1_score' (weighted averages) on the
            held-out 20% test split.
    """

    # 1. Separate features and labels
    X = df.drop(columns=['phenotype', 'mic'])  # Drop label columns
    y = df['mic']  # Target label

    # 2. Encode 'mic' (target) as integer class ids
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # 3. Split feature columns by dtype: object columns are categorical
    categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
    numeric_cols = X.select_dtypes(exclude=['object']).columns.tolist()

    # 4. Preprocessing pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', 'passthrough', numeric_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
        ]
    )

    # 5. Define parameter grids for hyperparameter tuning
    param_grids = {
        'RandomForest': {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__max_depth': [None, 10, 20],
            'classifier__min_samples_split': [2, 5, 10]
        },
        'SVM': {
            'classifier__C': [0.1, 1, 10],
            'classifier__kernel': ['linear', 'rbf'],
            'classifier__gamma': ['scale', 'auto']
        },
        'KNN': {
            'classifier__n_neighbors': [3, 5, 7],
            'classifier__weights': ['uniform', 'distance']
        }
    }

    # 6. Create pipelines for classifiers
    pipelines = {
        'RandomForest': Pipeline([
            ('preprocessing', preprocessor),
            ('classifier', RandomForestClassifier(random_state=42))
        ]),
        'SVM': Pipeline([
            ('preprocessing', preprocessor),
            ('classifier', SVC(random_state=42))
        ]),
        'KNN': Pipeline([
            ('preprocessing', preprocessor),
            ('classifier', KNeighborsClassifier())
        ])
    }

    # 7. Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

    # 8. Train models, perform hyperparameter tuning, and calculate metrics
    best_models = {}
    metrics = {}

    for name, pipeline in pipelines.items():
        print(f"Training and tuning {name}...")

        # Hyperparameter tuning; error_score='raise' surfaces fit failures immediately
        grid_search = GridSearchCV(pipeline, param_grids[name], cv=3, scoring='f1_weighted', error_score='raise')
        grid_search.fit(X_train, y_train)

        # Get the best model (already refit on the full training split)
        best_model = grid_search.best_estimator_
        best_models[name] = best_model

        # Predictions
        y_pred = best_model.predict(X_test)

        # Calculate metrics. Some classes are never predicted, which makes their
        # per-class scores undefined; zero_division=0 keeps the value scikit-learn
        # already substitutes (0) but without the UndefinedMetricWarning.
        metrics[name] = {
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred, average='weighted', zero_division=0),
            'recall': recall_score(y_test, y_pred, average='weighted', zero_division=0),
            'f1_score': f1_score(y_test, y_pred, average='weighted', zero_division=0)
        }

    return best_models, metrics

In [10]:
# Tune the three classifiers, then print every metric grouped by model.
best_models, metrics = train_classifiers_with_metrics(df)
for model_name in metrics:
    model_metrics = metrics[model_name]
    print(f"\nMetrics for {model_name}:")
    for metric in model_metrics:
        value = model_metrics[metric]
        print(f"{metric}: {value}")


Training and tuning RandomForest...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training and tuning SVM...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training and tuning KNN...

Metrics for RandomForest:
accuracy: 0.7022508038585209
precision: 0.6848024827565329
recall: 0.7022508038585209
f1_score: 0.6717188255019244

Metrics for SVM:
accuracy: 0.7022508038585209
precision: 0.6694115031871631
recall: 0.7022508038585209
f1_score: 0.6644823619560657

Metrics for KNN:
accuracy: 0.6527331189710611
precision: 0.6351292916155808
recall: 0.6527331189710611
f1_score: 0.6341902336519429


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Entrenamiento de los tres modelos de clasificación con reajuste de hiperparámetros con gridsearch, k-fold cross validation con distintas métricas

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.pipeline import Pipeline
import pandas as pd

def train_classifiers_with_metrics(df):
    """
    Tune RandomForest, SVM, and KNN classifiers with stratified k-fold cross-validation.

    NOTE: this reuses the name of a function defined in an earlier cell and
    replaces it once this cell runs.

    Object-typed feature columns are one-hot encoded; all other columns are
    passed through unchanged. Each classifier is tuned with a grid search over
    a shuffled, stratified 5-fold split, scored by weighted F1.

    Args:
        df: Input pandas DataFrame with features and labels.

    Returns:
        best_models: Dictionary containing the best tuned pipeline for each classifier.
        metrics: Dictionary mapping each classifier name to a dict with 'accuracy',
            'precision', 'recall' and 'f1_score' (weighted averages) on the
            held-out 20% test split.
    """

    # 1. Separate features and labels
    X = df.drop(columns=['phenotype', 'mic'])  # Drop label columns
    y = df['mic']  # Target label

    # 2. Encode 'mic' (target) as integer class ids
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # 3. Split feature columns by dtype: object columns are categorical
    categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
    numeric_cols = X.select_dtypes(exclude=['object']).columns.tolist()

    # 4. Preprocessing pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', 'passthrough', numeric_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
        ]
    )

    # 5. Define parameter grids for hyperparameter tuning
    param_grids = {
        'RandomForest': {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__max_depth': [None, 10, 20],
            'classifier__min_samples_split': [2, 5, 10]
        },
        'SVM': {
            'classifier__C': [0.1, 1, 10],
            'classifier__kernel': ['linear', 'rbf'],
            'classifier__gamma': ['scale', 'auto']
        },
        'KNN': {
            'classifier__n_neighbors': [3, 5, 7],
            'classifier__weights': ['uniform', 'distance']
        }
    }

    # 6. Create pipelines for classifiers
    pipelines = {
        'RandomForest': Pipeline([
            ('preprocessing', preprocessor),
            ('classifier', RandomForestClassifier(random_state=42))
        ]),
        'SVM': Pipeline([
            ('preprocessing', preprocessor),
            ('classifier', SVC(random_state=42))
        ]),
        'KNN': Pipeline([
            ('preprocessing', preprocessor),
            ('classifier', KNeighborsClassifier())
        ])
    }

    # 7. Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

    # 8. Initialize k-fold cross-validation (stratified, shuffled, seeded)
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # 9. Train models, perform hyperparameter tuning, and calculate metrics
    best_models = {}
    metrics = {}

    for name, pipeline in pipelines.items():
        print(f"Training and tuning {name}...")

        # Hyperparameter tuning using GridSearchCV with k-fold cross-validation;
        # error_score='raise' surfaces fit failures immediately
        grid_search = GridSearchCV(pipeline, param_grids[name], cv=kf, scoring='f1_weighted', error_score='raise')
        grid_search.fit(X_train, y_train)

        # Get the best model (already refit on the full training split)
        best_model = grid_search.best_estimator_
        best_models[name] = best_model

        # Predictions on the test set
        y_pred = best_model.predict(X_test)

        # Calculate metrics. Some classes are never predicted, which makes their
        # per-class scores undefined; zero_division=0 keeps the value scikit-learn
        # already substitutes (0) but without the UndefinedMetricWarning.
        metrics[name] = {
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred, average='weighted', zero_division=0),
            'recall': recall_score(y_test, y_pred, average='weighted', zero_division=0),
            'f1_score': f1_score(y_test, y_pred, average='weighted', zero_division=0)
        }

    return best_models, metrics

In [15]:
# Tune the three classifiers with stratified k-fold CV, then list the selected
# pipelines followed by their test-set metrics.
best_models, metrics = train_classifiers_with_metrics(df)

print("Best Models:")
for model_name in best_models:
    model = best_models[model_name]
    print(f"{model_name}: {model}")

print("\nMetrics:")
for model_name in metrics:
    metric = metrics[model_name]
    print(f"{model_name}: {metric}")


Training and tuning RandomForest...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training and tuning SVM...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training and tuning KNN...
Best Models:
RandomForest: Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('num', 'passthrough',
                                                  ['3005053', '3000830',
                                                   '3003838', '3000508',
                                                   '3003890', '3000491',
                                                   '3000833', '3000832',
                                                   '3000206', '3000254',
                                                   '3006880', '3000502',
                                                   '3000499', '3000656',
                                                   '3000676', '3004039',
                                                   '3000516', '3003578',
                                                   '3000027', '3000074',
                                                   '3003378', '3000263',
                                     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Display the cleaned DataFrame.
df

In [None]:
# Print the largest and smallest value of the 'mic' column.
print(df["mic"].max(),df["mic"].min())

In [None]:
# Print the distinct values of the 'mic' column (these are the classes the models predict).
print(df['mic'].unique())

In [None]:
# Count how many rows carry each 'mic' value, to inspect class balance.
df["mic"].value_counts()

In [20]:
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline

def train_classifiers_with_stacking(df):
    """
    Train a stacking ensemble of RandomForest, SVM, and KNN that predicts the 'mic' column.

    The base models use their default hyperparameters (no tuning is performed);
    a LogisticRegression meta-classifier combines them, with 5-fold
    cross-validation used inside the StackingClassifier. Only 'phenotype' and
    'mic' are removed from the feature matrix, so 'antibiotic' is kept as a
    feature. Columns whose values are all in {0, 1} are passed through
    unchanged; any other column that has more than two distinct values, or
    that is not numeric, is one-hot encoded.

    Args:
        df: Input pandas DataFrame with features and labels.

    Returns:
        stacking_model: Trained stacking model.
        f1_stacking: Weighted F1 score of the stacking model on the held-out 20% test split.
    """

    # 1. Separate features and labels
    X = df.drop(columns=['phenotype', 'mic'])  # Drop label columns but keep 'antibiotic'
    y = df['mic']  # Target label

    # 2. Encode 'mic' (target) as integer class ids (it is treated as multiclass)
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # 3. Detect binary and categorical features
    binary_cols = [col for col in X.columns if set(X[col].unique()) <= {0, 1}]
    binary_set = set(binary_cols)
    # A non-numeric column with one or two distinct values used to match neither
    # rule, fell into the passthrough remainder and reached the classifiers as raw
    # strings. Treat every non-numeric, non-binary column as categorical.
    multiclass_cols = [
        col for col in X.columns
        if col not in binary_set
        and (len(X[col].unique()) > 2 or not pd.api.types.is_numeric_dtype(X[col]))
    ]

    # 4. Preprocessing pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('binary', 'passthrough', binary_cols),  # Pass through binary columns
            ('multiclass', OneHotEncoder(handle_unknown='ignore'), multiclass_cols)  # One-hot encode categorical columns
        ], remainder='passthrough')

    # 5. Create base models (without hyperparameter tuning)
    rf_model = Pipeline(steps=[
        ('preprocessing', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42))
    ])

    svm_model = Pipeline(steps=[
        ('preprocessing', preprocessor),
        ('classifier', SVC(random_state=42, probability=True))  # probability=True exposes predict_proba for stacking
    ])

    knn_model = Pipeline(steps=[
        ('preprocessing', preprocessor),
        ('classifier', KNeighborsClassifier())
    ])

    # 6. Meta-classifier for stacking (Logistic Regression)
    meta_classifier = LogisticRegression()

    # 7. StackingClassifier: combines base models
    stacking_model = StackingClassifier(
        estimators=[
            ('rf', rf_model),
            ('svm', svm_model),
            ('knn', knn_model)
        ],
        final_estimator=meta_classifier,
        cv=5  # Cross-validation for stacking
    )

    # 8. Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

    # 9. Train the stacking model
    stacking_model.fit(X_train, y_train)

    # 10. Make predictions and calculate F1 score
    y_pred_stacking = stacking_model.predict(X_test)
    f1_stacking = f1_score(y_test, y_pred_stacking, average='weighted')

    return stacking_model, f1_stacking

In [21]:
# Fit the stacking ensemble and report its weighted F1 score on the test split.
stacking_model, f1_stacking = train_classifiers_with_stacking(df)
print('Stacking Model F1 Score:', f1_stacking)




Stacking Model F1 Score: 0.704428213409905


In [27]:
def mic_transform(mic_values):
    """
    Convert MIC values to rounded base-2 exponents.

    Each value is replaced by log2(value) rounded to the nearest integer, and
    the result is limited to the range -7 to 7 (inclusive).
    """
    exponents = np.round(np.log2(mic_values))
    # Limit the exponents to powers between -7 and 7
    return np.clip(exponents, -7, 7)

# FunctionTransformer is not imported by any of the import cells visible in this
# notebook, so import it here to keep the cell runnable on a fresh kernel.
from sklearn.preprocessing import FunctionTransformer

# Wrap mic_transform so it can be applied through the scikit-learn transformer API.
mic_transformer = FunctionTransformer(mic_transform)

In [28]:
def train_classifiers_with_stacking(df):
    """
    Train a stacking ensemble of RandomForest, SVM, and KNN on the log2-transformed 'mic' column.

    NOTE: this reuses the name of a function defined in an earlier cell and
    replaces it once this cell runs.

    The target is transformed with the module-level `mic_transformer` (rounded
    log2 exponent, clipped) instead of being label-encoded. The base models use
    their default hyperparameters (no tuning is performed); a
    LogisticRegression meta-classifier combines them, with 5-fold
    cross-validation used inside the StackingClassifier. Columns whose values
    are all in {0, 1} are passed through unchanged; any other column that has
    more than two distinct values, or that is not numeric, is one-hot encoded.

    Args:
        df: Input pandas DataFrame with features and labels.

    Returns:
        stacking_model: Trained stacking model.
        f1_stacking: Weighted F1 score of the stacking model on the held-out 20% test split.
    """

    # 1. Separate features and labels
    X = df.drop(columns=['phenotype', 'mic'])  # Drop label columns but keep 'antibiotic'
    y = df['mic']  # Target label (mic)

    # 2. Apply log2 transformation and rounding to `mic` using FunctionTransformer.
    #    The column is reshaped to 2-D for the transformer and flattened afterwards.
    y_transformed = mic_transformer.fit_transform(y.values.reshape(-1, 1)).ravel()

    # 3. Detect binary and categorical features in X
    binary_cols = [col for col in X.columns if set(X[col].unique()) <= {0, 1}]
    binary_set = set(binary_cols)
    # A non-numeric column with one or two distinct values used to match neither
    # rule, fell into the passthrough remainder and reached the classifiers as raw
    # strings. Treat every non-numeric, non-binary column as categorical.
    multiclass_cols = [
        col for col in X.columns
        if col not in binary_set
        and (len(X[col].unique()) > 2 or not pd.api.types.is_numeric_dtype(X[col]))
    ]

    # 4. Preprocessing pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('binary', 'passthrough', binary_cols),  # Pass through binary columns
            ('multiclass', OneHotEncoder(handle_unknown='ignore'), multiclass_cols)  # One-hot encode categorical columns
        ], remainder='passthrough')

    # 5. Create base models (without hyperparameter tuning)
    rf_model = Pipeline(steps=[
        ('preprocessing', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42))
    ])

    svm_model = Pipeline(steps=[
        ('preprocessing', preprocessor),
        ('classifier', SVC(random_state=42, probability=True))  # probability=True exposes predict_proba for stacking
    ])

    knn_model = Pipeline(steps=[
        ('preprocessing', preprocessor),
        ('classifier', KNeighborsClassifier())
    ])

    # 6. Meta-classifier for stacking (Logistic Regression)
    meta_classifier = LogisticRegression()

    # 7. StackingClassifier: combines base models
    stacking_model = StackingClassifier(
        estimators=[
            ('rf', rf_model),
            ('svm', svm_model),
            ('knn', knn_model)
        ],
        final_estimator=meta_classifier,
        cv=5  # Cross-validation for stacking
    )

    # 8. Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)

    # 9. Train the stacking model
    stacking_model.fit(X_train, y_train)

    # 10. Make predictions and calculate F1 score
    y_pred_stacking = stacking_model.predict(X_test)
    f1_stacking = f1_score(y_test, y_pred_stacking, average='weighted')

    return stacking_model, f1_stacking

# Train the stacking model on the log2-transformed mic target and report its F1 score.
stacking_model, f1_stacking = train_classifiers_with_stacking(df)
print('Stacking Model F1 Score:', f1_stacking)



Stacking Model F1 Score: 0.7101268174047273


In [46]:
    # 1. Convert 'phenotype' values (Resistant, Susceptible) into binary labels.
    #    The identity entries (1 -> 1, 0 -> 0) make the cell safe to re-run:
    #    without them, mapping an already-converted column would turn every
    #    value into NaN. Any other value is still mapped to NaN, as before.
    df['phenotype'] = df['phenotype'].map({'Resistant': 1, 'Susceptible': 0, 1: 1, 0: 0})

In [30]:
#multioutput

In [34]:
def train_classifiers_with_multioutput(df):
    """
    Train a stacked RandomForest/SVM/KNN ensemble that predicts both 'mic' and 'phenotype'.

    The three base pipelines are combined by a StackingClassifier (LogisticRegression
    meta-classifier, 5-fold cross-validation), which is then wrapped in a
    MultiOutputClassifier so that both targets are predicted from the same features.

    Notes:
        * The 'mic' target is transformed with the notebook-level `mic_transformer`,
          which must already be defined when this function is called (presumably a
          log2 transformer -- TODO confirm against the cell that defines it).
        * 'phenotype' is used exactly as stored in `df`; it is not re-encoded here.

    Args:
        df: Input pandas DataFrame with the feature columns plus the 'mic' and
            'phenotype' label columns.

    Returns:
        multioutput_model: Fitted MultiOutputClassifier wrapping the stacking ensemble.
        f1_scores: Dictionary with the weighted F1 score on the 20% test split for
            'mic' and 'phenotype'.
    """

    # 1. Separate features and labels
    X = df.drop(columns=['mic', 'phenotype'])  # Features
    y_mic = df['mic']  # Target label 1 (mic)
    y_phenotype = df['phenotype']  # Target label 2 (phenotype)

    # 2. Transform the `mic` column with the shared notebook-level transformer
    y_mic_transformed = mic_transformer.fit_transform(y_mic.values.reshape(-1, 1)).ravel()

    # 3. Detect binary and categorical features in X.
    #    Binary: every value is 0 or 1 -> passed through unchanged.
    #    Categorical: more than two distinct values, OR any non-numeric column that is
    #    not already binary. The dtype check closes a gap: a text column with only one
    #    or two distinct values would otherwise match neither list, fall into
    #    `remainder='passthrough'` and reach the classifiers as raw strings.
    binary_cols = [col for col in X.columns if set(X[col].unique()) <= {0, 1}]
    binary_lookup = set(binary_cols)
    multiclass_cols = [
        col for col in X.columns
        if col not in binary_lookup
        and (len(X[col].unique()) > 2 or not pd.api.types.is_numeric_dtype(X[col]))
    ]

    # 4. Preprocessing pipeline (numeric columns in neither list are passed through)
    preprocessor = ColumnTransformer(
        transformers=[
            ('binary', 'passthrough', binary_cols),  # Pass through binary columns
            ('multiclass', OneHotEncoder(handle_unknown='ignore'), multiclass_cols)  # One-hot encode categorical columns
        ], remainder='passthrough')

    # 5. Create base models (the same preprocessor definition is used by all three)
    rf_model = Pipeline(steps=[
        ('preprocessing', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42))
    ])

    svm_model = Pipeline(steps=[
        ('preprocessing', preprocessor),
        ('classifier', SVC(random_state=42, probability=True))  # We need probability=True for stacking
    ])

    knn_model = Pipeline(steps=[
        ('preprocessing', preprocessor),
        ('classifier', KNeighborsClassifier())
    ])

    # 6. Meta-classifier for stacking
    meta_classifier = LogisticRegression()

    # 7. StackingClassifier: combines base models
    stacking_model = StackingClassifier(
        estimators=[
            ('rf', rf_model),
            ('svm', svm_model),
            ('knn', knn_model)
        ],
        final_estimator=meta_classifier,
        cv=5
    )

    # 8. Use MultiOutputClassifier to predict both mic and phenotype
    multioutput_model = MultiOutputClassifier(stacking_model)

    # 9. Split data into training and testing sets for both mic and phenotype.
    #    Column 0 of the combined target is mic, column 1 is phenotype.
    y_combined = np.column_stack([y_mic_transformed, y_phenotype])
    X_train, X_test, y_train, y_test = train_test_split(X, y_combined, test_size=0.2, random_state=42)

    # 10. Train the multi-output model
    multioutput_model.fit(X_train, y_train)

    # 11. Make predictions and calculate weighted F1 scores for both targets
    y_pred = multioutput_model.predict(X_test)
    f1_mic = f1_score(y_test[:, 0], y_pred[:, 0], average='weighted')  # F1 score for mic
    f1_phenotype = f1_score(y_test[:, 1], y_pred[:, 1], average='weighted')  # F1 score for phenotype

    f1_scores = {
        'mic': f1_mic,
        'phenotype': f1_phenotype
    }

    return multioutput_model, f1_scores


In [35]:
# Train the multi-output stacking ensemble on `df` and print the F1 score of each target
multioutput_model, f1_scores = train_classifiers_with_multioutput(df)
print(f'F1 scores: {f1_scores}')



F1 scores: {'mic': 0.7101268174047273, 'phenotype': 0.9396797964022365}


In [36]:
# Inspect the current state of the working DataFrame
df

Unnamed: 0,antibiotic,genus,species,phenotype,mic,3005053,3000830,3003838,3000508,3003890,...,3007751-D87Y,3003926-D87Y,3003709-G46S,3004851-A39T,3004832-A501P,3003381-R20H,3003926-S83I,3003381-G121D,3004832-T483S,3004832-A311V
0,0,Acinetobacter,baumannii,1,8.000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,Acinetobacter,baumannii,1,8.000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,Acinetobacter,baumannii,1,8.000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,Acinetobacter,baumannii,1,8.000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,Acinetobacter,baumannii,1,8.000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6704,1,Salmonella,enterica,0,0.015,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6705,1,Salmonella,enterica,0,0.015,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6706,1,Salmonella,enterica,0,0.015,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6707,1,Salmonella,enterica,0,0.015,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
def train_random_forest_multioutput(df):
    """
    Train a multi-output random forest that predicts both the 'mic' and 'phenotype' columns.

    'mic' is label-encoded, so every distinct MIC value becomes its own class.
    'phenotype' is used exactly as stored in `df`. The 'antibiotic' column is dropped
    from the features together with the two label columns.

    Args:
        df: Input pandas DataFrame with the feature columns plus the 'mic',
            'phenotype' and 'antibiotic' columns.

    Returns:
        model: Fitted Pipeline (preprocessing + MultiOutputClassifier over a
            RandomForestClassifier).
        f1_scores: Dictionary with the weighted F1 score on the 20% test split for
            'mic' and 'phenotype'.
    """

    # 1. Separate features and labels
    X = df.drop(columns=['mic', 'phenotype', 'antibiotic'])
    y_mic = df['mic']
    y_phenotype = df['phenotype']

    # 2. Encode 'mic' using LabelEncoder (one integer class per distinct value)
    label_encoder = LabelEncoder()
    y_mic_encoded = label_encoder.fit_transform(y_mic)

    # 3. Detect binary and categorical features in X.
    #    Binary: every value is 0 or 1 -> passed through unchanged.
    #    Categorical: more than two distinct values, OR any non-numeric column that is
    #    not already binary. The dtype check closes a gap: a text column with only one
    #    or two distinct values would otherwise match neither list, fall into
    #    `remainder='passthrough'` and reach the forest as raw strings.
    binary_cols = [col for col in X.columns if set(X[col].unique()) <= {0, 1}]
    binary_lookup = set(binary_cols)
    multiclass_cols = [
        col for col in X.columns
        if col not in binary_lookup
        and (len(X[col].unique()) > 2 or not pd.api.types.is_numeric_dtype(X[col]))
    ]

    # 4. Preprocessing pipeline (numeric columns in neither list are passed through)
    preprocessor = ColumnTransformer(
        transformers=[
            ('binary', 'passthrough', binary_cols),
            ('multiclass', OneHotEncoder(handle_unknown='ignore'), multiclass_cols)
        ], remainder='passthrough')

    # 5. Create a RandomForest pipeline for multi-output classification
    rf_pipeline = Pipeline(steps=[
        ('preprocessing', preprocessor),
        ('classifier', MultiOutputClassifier(RandomForestClassifier(random_state=42)))
    ])

    # 6. Combine both targets into a single array: column 0 is mic, column 1 is phenotype
    y_combined = np.column_stack([y_mic_encoded, y_phenotype])

    # 7. Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y_combined, test_size=0.2, random_state=42)

    # 8. Train the multi-output model
    rf_pipeline.fit(X_train, y_train)

    # 9. Make predictions and calculate the weighted F1 scores for both targets
    y_pred = rf_pipeline.predict(X_test)
    f1_mic = f1_score(y_test[:, 0], y_pred[:, 0], average='weighted')  # F1 score for mic
    f1_phenotype = f1_score(y_test[:, 1], y_pred[:, 1], average='weighted')  # F1 score for phenotype

    f1_scores = {
        'mic': f1_mic,
        'phenotype': f1_phenotype
    }

    return rf_pipeline, f1_scores



In [38]:
# Train the multi-output random forest on `df` and print the F1 score of each target.
# NOTE(review): this rebinds `f1_scores`, replacing the value from the stacking run above.
rf_model, f1_scores = train_random_forest_multioutput(df)
print(f'F1 scores: {f1_scores}')

F1 scores: {'mic': 0.7135263475674756, 'phenotype': 0.9338607851184337}
