# Random Forest (Reproducible)

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [2]:
filepath = '/home/jupyter-user7/CAMDA/Camda24_resistance/DataSets/ResistanceCiprofloxacinStrict.tsv.gz'
df1 = pd.read_csv(filepath, sep='\t', compression='gzip')

  df1 = pd.read_csv(filepath, sep='\t', compression='gzip')


In [3]:
df1 = df1.dropna()
df1.head()

Unnamed: 0,accession,genus,species,phenotype,mic,3005053,3000830,3003838,3000508,3003890,...,3007751-D87Y,3003926-D87Y,3003709-G46S,3004851-A39T,3004832-A501P,3003381-R20H,3003926-S83I,3003381-G121D,3004832-T483S,3004832-A311V
0,SRR3138666,Campylobacter,jejuni,Susceptible,0.12,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,SRR3138667,Campylobacter,jejuni,Susceptible,0.06,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,SRR3138668,Campylobacter,jejuni,Susceptible,0.06,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,SRR3138669,Campylobacter,jejuni,Susceptible,0.06,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,SRR3138670,Campylobacter,jejuni,Susceptible,0.06,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
df1.shape

(2818, 880)

In [7]:
df1 = df1.drop('accession', axis=1)
df1.head()

Unnamed: 0,genus,species,phenotype,mic,3005053,3000830,3003838,3000508,3003890,3000491,...,3007751-D87Y,3003926-D87Y,3003709-G46S,3004851-A39T,3004832-A501P,3003381-R20H,3003926-S83I,3003381-G121D,3004832-T483S,3004832-A311V
0,Campylobacter,jejuni,Susceptible,0.12,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Campylobacter,jejuni,Susceptible,0.06,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Campylobacter,jejuni,Susceptible,0.06,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Campylobacter,jejuni,Susceptible,0.06,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Campylobacter,jejuni,Susceptible,0.06,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
df1.shape

(2818, 879)

In [5]:
def train_random_forest(df):
    """
    Function to train a random forest classifier on the 'mic' column of the dataframe.
    Args:
        df: Input pandas DataFrame with features and labels.
        
    Returns:
        model: Trained RandomForestClassifier model.
        f1: F1 score of the model on test data.
    """
    
    # 1. Separate features and labels
    X = df.drop(columns=['phenotype', 'mic'])  # Drop label columns
    y = df['mic']  # Target label

    # 2. Encode 'mic' (target) using LabelEncoder (since it's multiclass)
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # 3. Detect binary and multiclass features
    binary_cols = [col for col in X.columns if set(X[col].unique()) <= {0, 1}]  # Binary features
    multiclass_cols = [col for col in X.columns if len(X[col].unique()) > 2]  # Multiclass features

    # 4. Preprocessing pipeline
    # Binary columns don't need much processing, multiclass columns need one-hot encoding
    preprocessor = ColumnTransformer(
        transformers=[
            ('binary', 'passthrough', binary_cols),  # Pass through binary columns
            ('multiclass', OneHotEncoder(), multiclass_cols)  # One-hot encode multiclass columns
        ], remainder='passthrough')  # Any remaining columns are passed through (if any)

    # 5. Create a pipeline that first applies preprocessing, then trains a random forest
    model_pipeline = Pipeline(steps=[
        ('preprocessing', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42))
    ])

    # 6. Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
    
    # 7. Train the model
    model_pipeline.fit(X_train, y_train)

    # 8. Make predictions and calculate the F1 score
    y_pred = model_pipeline.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')  # Use 'weighted' for multiclass F1

    return model_pipeline, f1

In [8]:
model, f1 = train_random_forest(df1)
print(f'Trained model: {model}')
print(f'F1 score: {f1}')

ValueError: Found unknown categories [1.0] in column 3 during transform

In [None]:

# Cambiar hiperparámetros