# Random Forest (Reproducible)

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [3]:
filepath = '/home/jupyter-user5/Camda24_resistance/DataSets/group-2/data/combined_antibiotic_resistance.tsv'
df1 = pd.read_csv(filepath, sep='\t')

  df1 = pd.read_csv(filepath, sep='\t')


In [4]:
df1 = df1.dropna()
df1.head()

Unnamed: 0,antibiotic,accession,genus,species,phenotype,mic,3005053,3000830,3003838,3000508,...,3007751-D87Y,3003926-D87Y,3003709-G46S,3004851-A39T,3004832-A501P,3003381-R20H,3003926-S83I,3003381-G121D,3004832-T483S,3004832-A311V
0,meropenem,GCA_002947415,Acinetobacter,baumannii,Resistant,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,meropenem,GCA_002947845,Acinetobacter,baumannii,Resistant,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,meropenem,GCA_002948925,Acinetobacter,baumannii,Resistant,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,meropenem,GCA_002996805,Acinetobacter,baumannii,Resistant,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,meropenem,GCA_003006035,Acinetobacter,baumannii,Resistant,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
df1.shape

(5952, 881)

In [6]:
df1 = df1.drop('accession', axis=1)
df1.head()

Unnamed: 0,antibiotic,genus,species,phenotype,mic,3005053,3000830,3003838,3000508,3003890,...,3007751-D87Y,3003926-D87Y,3003709-G46S,3004851-A39T,3004832-A501P,3003381-R20H,3003926-S83I,3003381-G121D,3004832-T483S,3004832-A311V
0,meropenem,Acinetobacter,baumannii,Resistant,8.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,meropenem,Acinetobacter,baumannii,Resistant,8.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,meropenem,Acinetobacter,baumannii,Resistant,8.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,meropenem,Acinetobacter,baumannii,Resistant,8.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,meropenem,Acinetobacter,baumannii,Resistant,8.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
df1.shape

(5952, 880)

In [8]:
def train_random_forest(df):
    """
    Function to train a random forest classifier on the 'mic' column of the dataframe.
    Args:
        df: Input pandas DataFrame with features and labels.
        
    Returns:
        model: Trained RandomForestClassifier model.
        f1: F1 score of the model on test data.
    """
    
    # 1. Separate features and labels
    X = df.drop(columns=['phenotype', 'mic',"antibiotic"])  # Drop label columns
    y = df['mic']  # Target label

    # 2. Encode 'mic' (target) using LabelEncoder (since it's multiclass)
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # 3. Detect binary and multiclass features
    binary_cols = [col for col in X.columns if set(X[col].unique()) <= {0, 1}]  # Binary features
    multiclass_cols = [col for col in X.columns if len(X[col].unique()) > 2]  # Multiclass features

    # 4. Preprocessing pipeline
    # Binary columns don't need much processing, multiclass columns need one-hot encoding
    preprocessor = ColumnTransformer(
        transformers=[
            ('binary', 'passthrough', binary_cols),  # Pass through binary columns
            ('multiclass', OneHotEncoder(handle_unknown='ignore'), multiclass_cols)  # One-hot encode multiclass columns Con handle_unknown='ignore', el codificador ignorará las categorías desconocidas y no producirá un error durante la predicción.
        ], remainder='passthrough')  # Any remaining columns are passed through (if any)

    # 5. Create a pipeline that first applies preprocessing, then trains a random forest
    model_pipeline = Pipeline(steps=[
        ('preprocessing', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42))
    ])

    # 6. Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
    
    # 7. Train the model
    model_pipeline.fit(X_train, y_train)

    # 8. Make predictions and calculate the F1 score
    y_pred = model_pipeline.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')  # Use 'weighted' for multiclass F1

    return model_pipeline, f1

In [9]:
model, f1 = train_random_forest(df1)
print(f'Trained model: {model}')
print(f'F1 score: {f1}')

Trained model: Pipeline(steps=[('preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('binary', 'passthrough',
                                                  ['3000830', '3000206',
                                                   '3006880', '3000676',
                                                   '3003576', '3001216',
                                                   '3000237', '3003548',
                                                   '3001889', '3003652',
                                                   '3003899', '3006228',
                                                   '3003900', '3006881',
                                                   '3001866', '3003479',
                                                   '3000166', '3002540',
                                                   '3006878', '3006874',
                                                   '3000168', '3004290',
             

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline

def train_classifiers(df):
    """
    Function to train RandomForest, SVM, and KNN classifiers on the 'mic' column of the dataframe.
    Args:
        df: Input pandas DataFrame with features and labels.
        
    Returns:
        models: Dictionary containing trained models for each classifier.
        f1_scores: Dictionary containing F1 scores for each classifier on test data.
    """
    
    # 1. Separate features and labels
    X = df.drop(columns=['phenotype', 'mic', "antibiotic"])  # Drop label columns
    y = df['mic']  # Target label

    # 2. Encode 'mic' (target) using LabelEncoder (since it's multiclass)
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # 3. Detect binary and multiclass features
    binary_cols = [col for col in X.columns if set(X[col].unique()) <= {0, 1}]  # Binary features
    multiclass_cols = [col for col in X.columns if len(X[col].unique()) > 2]  # Multiclass features

    # 4. Preprocessing pipeline
    # Binary columns don't need much processing, multiclass columns need one-hot encoding
    preprocessor = ColumnTransformer(
        transformers=[
            ('binary', 'passthrough', binary_cols),  # Pass through binary columns
            ('multiclass', OneHotEncoder(handle_unknown='ignore'), multiclass_cols)  # One-hot encode multiclass columns
        ], remainder='passthrough')  # Any remaining columns are passed through (if any)

    # 5. Create pipelines for different classifiers
    rf_pipeline = Pipeline(steps=[
        ('preprocessing', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42))
    ])
    
    svm_pipeline = Pipeline(steps=[
        ('preprocessing', preprocessor),
        ('classifier', SVC(random_state=42))
    ])
    
    knn_pipeline = Pipeline(steps=[
        ('preprocessing', preprocessor),
        ('classifier', KNeighborsClassifier())
    ])

    # 6. Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

    # 7. Train models and calculate F1 scores
    models = {}
    f1_scores = {}

    # Random Forest
    rf_pipeline.fit(X_train, y_train)
    y_pred_rf = rf_pipeline.predict(X_test)
    f1_rf = f1_score(y_test, y_pred_rf, average='weighted')
    models['RandomForest'] = rf_pipeline
    f1_scores['RandomForest'] = f1_rf

    # SVM
    svm_pipeline.fit(X_train, y_train)
    y_pred_svm = svm_pipeline.predict(X_test)
    f1_svm = f1_score(y_test, y_pred_svm, average='weighted')
    models['SVM'] = svm_pipeline
    f1_scores['SVM'] = f1_svm

    # KNN
    knn_pipeline.fit(X_train, y_train)
    y_pred_knn = knn_pipeline.predict(X_test)
    f1_knn = f1_score(y_test, y_pred_knn, average='weighted')
    models['KNN'] = knn_pipeline
    f1_scores['KNN'] = f1_knn

    return models, f1_scores

# Entrenar los modelos
models, f1_scores = train_classifiers(df1)
print(f'Trained models: {models}')
print(f'F1 scores: {f1_scores}')


Trained models: {'RandomForest': Pipeline(steps=[('preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('binary', 'passthrough',
                                                  ['3000830', '3000206',
                                                   '3006880', '3000676',
                                                   '3003576', '3001216',
                                                   '3000237', '3003548',
                                                   '3001889', '3003652',
                                                   '3003899', '3006228',
                                                   '3003900', '3006881',
                                                   '3001866', '3003479',
                                                   '3000166', '3002540',
                                                   '3006878', '3006874',
                                                   '3000168', '30042

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline

def train_classifiers_with_tuning(df):
    """
    Function to train RandomForest, SVM, and KNN classifiers with hyperparameter tuning on the 'mic' column.
    Args:
        df: Input pandas DataFrame with features and labels.
        
    Returns:
        best_models: Dictionary containing the best tuned models for each classifier.
        f1_scores: Dictionary containing F1 scores for each best-tuned classifier on test data.
    """
    
    # 1. Separate features and labels
    X = df.drop(columns=['phenotype', 'mic'])  # Drop label columns "antibiotic"
    y = df['mic']  # Target label

    # 2. Encode 'mic' (target) using LabelEncoder (since it's multiclass)
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # 3. Detect binary and multiclass features
    binary_cols = [col for col in X.columns if set(X[col].unique()) <= {0, 1}]  # Binary features
    multiclass_cols = [col for col in X.columns if len(X[col].unique()) > 2]  # Multiclass features

    # 4. Preprocessing pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('binary', 'passthrough', binary_cols),  # Pass through binary columns
            ('multiclass', OneHotEncoder(handle_unknown='ignore'), multiclass_cols)  # One-hot encode multiclass columns
        ], remainder='passthrough')

    # 5. Define parameter grids for hyperparameter tuning
    param_grid_rf = {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 10, 20],
        'classifier__min_samples_split': [2, 5, 10]
    }

    param_grid_svm = {
        'classifier__C': [0.1, 1, 10],
        'classifier__kernel': ['linear', 'rbf'],
        'classifier__gamma': ['scale', 'auto']
    }

    param_grid_knn = {
        'classifier__n_neighbors': [3, 5, 7],
        'classifier__weights': ['uniform', 'distance']
    }

    # 6. Create pipelines for different classifiers
    rf_pipeline = Pipeline(steps=[
        ('preprocessing', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42))
    ])
    
    svm_pipeline = Pipeline(steps=[
        ('preprocessing', preprocessor),
        ('classifier', SVC(random_state=42))
    ])
    
    knn_pipeline = Pipeline(steps=[
        ('preprocessing', preprocessor),
        ('classifier', KNeighborsClassifier())
    ])

    # 7. Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

    # 8. Hyperparameter tuning with GridSearchCV
    rf_grid = GridSearchCV(rf_pipeline, param_grid_rf, cv=3, scoring='f1_weighted')
    svm_grid = GridSearchCV(svm_pipeline, param_grid_svm, cv=3, scoring='f1_weighted')
    knn_grid = GridSearchCV(knn_pipeline, param_grid_knn, cv=3, scoring='f1_weighted')

    # 9. Train models with best parameters and calculate F1 scores
    best_models = {}
    f1_scores = {}

    # Random Forest
    rf_grid.fit(X_train, y_train)
    y_pred_rf = rf_grid.predict(X_test)
    f1_rf = f1_score(y_test, y_pred_rf, average='weighted')
    best_models['RandomForest'] = rf_grid.best_estimator_
    f1_scores['RandomForest'] = f1_rf

    # SVM
    svm_grid.fit(X_train, y_train)
    y_pred_svm = svm_grid.predict(X_test)
    f1_svm = f1_score(y_test, y_pred_svm, average='weighted')
    best_models['SVM'] = svm_grid.best_estimator_
    f1_scores['SVM'] = f1_svm

    # KNN
    knn_grid.fit(X_train, y_train)
    y_pred_knn = knn_grid.predict(X_test)
    f1_knn = f1_score(y_test, y_pred_knn, average='weighted')
    best_models['KNN'] = knn_grid.best_estimator_
    f1_scores['KNN'] = f1_knn

    return best_models, f1_scores

# Entrenar los modelos con ajuste de hiperparámetros
best_models, f1_scores = train_classifiers_with_tuning(df1)
print(f'Best models: {best_models}')
print(f'F1 scores: {f1_scores}')




ValueError: 
All the 81 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
27 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/envs/umap/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/envs/umap/lib/python3.9/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/opt/conda/envs/umap/lib/python3.9/site-packages/sklearn/pipeline.py", line 476, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/opt/conda/envs/umap/lib/python3.9/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/opt/conda/envs/umap/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 363, in fit
    X, y = self._validate_data(
  File "/opt/conda/envs/umap/lib/python3.9/site-packages/sklearn/base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/opt/conda/envs/umap/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1273, in check_X_y
    X = check_array(
  File "/opt/conda/envs/umap/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1007, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "/opt/conda/envs/umap/lib/python3.9/site-packages/sklearn/utils/_array_api.py", line 746, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
ValueError: could not convert string to float: 'meropenem'

--------------------------------------------------------------------------------
54 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/envs/umap/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/envs/umap/lib/python3.9/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/opt/conda/envs/umap/lib/python3.9/site-packages/sklearn/pipeline.py", line 476, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/opt/conda/envs/umap/lib/python3.9/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/opt/conda/envs/umap/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 363, in fit
    X, y = self._validate_data(
  File "/opt/conda/envs/umap/lib/python3.9/site-packages/sklearn/base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/opt/conda/envs/umap/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1273, in check_X_y
    X = check_array(
  File "/opt/conda/envs/umap/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1007, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "/opt/conda/envs/umap/lib/python3.9/site-packages/sklearn/utils/_array_api.py", line 746, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
ValueError: could not convert string to float: 'ciprofloxacin'


In [12]:
# Parámetros para RandomForestClassifier: Ajusta el número de árboles, la profundidad máxima y el número mínimo de muestras para dividir un nodo.
# Parámetros para SVC: Ajusta el parámetro de regularización C, el kernel y el parámetro gamma.
# Parámetros para KNeighborsClassifier: Ajusta el número de vecinos y el tipo de ponderación para los vecinos.