# Random Forest (Reproducible)

In [42]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [43]:
filepath = '/home/jupyter-user7/CAMDA/Camda24_resistance/DataSets/group-2/data/combined_antibiotic_resistance.tsv'
df = pd.read_csv(filepath, sep='\t')

  df = pd.read_csv(filepath, sep='\t')


In [44]:
df.head()

Unnamed: 0,antibiotic,accession,genus,species,phenotype,mic,3005053,3000830,3003838,3000508,...,3007751-D87Y,3003926-D87Y,3003709-G46S,3004851-A39T,3004832-A501P,3003381-R20H,3003926-S83I,3003381-G121D,3004832-T483S,3004832-A311V
0,meropenem,GCA_002947415,Acinetobacter,baumannii,Resistant,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,meropenem,GCA_002947845,Acinetobacter,baumannii,Resistant,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,meropenem,GCA_002948925,Acinetobacter,baumannii,Resistant,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,meropenem,GCA_002996805,Acinetobacter,baumannii,Resistant,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,meropenem,GCA_003006035,Acinetobacter,baumannii,Resistant,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
df.shape

(7772, 881)

In [37]:
df.isnull().value_counts().sum()

7772

In [45]:
df = df.drop('accession', axis=1)
df.head()

Unnamed: 0,antibiotic,genus,species,phenotype,mic,3005053,3000830,3003838,3000508,3003890,...,3007751-D87Y,3003926-D87Y,3003709-G46S,3004851-A39T,3004832-A501P,3003381-R20H,3003926-S83I,3003381-G121D,3004832-T483S,3004832-A311V
0,meropenem,Acinetobacter,baumannii,Resistant,8.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,meropenem,Acinetobacter,baumannii,Resistant,8.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,meropenem,Acinetobacter,baumannii,Resistant,8.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,meropenem,Acinetobacter,baumannii,Resistant,8.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,meropenem,Acinetobacter,baumannii,Resistant,8.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
df.shape

(7772, 880)

In [49]:
def train_random_forest(df):
    """
    Function to train a random forest classifier on the 'mic' column of the dataframe.
    Args:
        df: Input pandas DataFrame with features and labels.
        
    Returns:
        model: Trained RandomForestClassifier model.
        f1: F1 score of the model on test data.
    """
    # Delete na valued 
    df = df.dropna()   

    # Convert antibiotic column to binary values
    df['antibiotic'] = df['antibiotic'].map({'meropenem': 0, 'ciprofloxacin': 1})
    
    # 1. Separate features and labels
    X = df.drop(columns=['mic', 'phenotype'])  # Drop label columns
    y = df['mic']  # Target label

    # 2. Encode 'mic' (target) using LabelEncoder (since it's multiclass)
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # 3. Detect binary and multiclass features
    binary_cols = [col for col in X.columns if set(X[col].unique()) <= {0, 1}]  # Binary features
    multiclass_cols = [col for col in X.columns if len(X[col].unique()) > 2]  # Multiclass features

    # 4. Preprocessing pipeline
    # Binary columns don't need much processing, multiclass columns need one-hot encoding
    preprocessor = ColumnTransformer(
        transformers=[
            ('binary', 'passthrough', binary_cols),  # Pass through binary columns
            ('multiclass', OneHotEncoder(handle_unknown='ignore'), multiclass_cols)  # One-hot encode multiclass columns
        ], remainder='passthrough')  # Any remaining columns are passed through (if any)

    # 5. Create a pipeline that first applies preprocessing, then trains a random forest
    model_pipeline = Pipeline(steps=[
        ('preprocessing', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42))
    ])

    # 6. Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

    # 7. Train the model
    model_pipeline.fit(X_train, y_train)

    # 8. Make predictions and calculate the F1 score
    y_pred = model_pipeline.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')  # Use 'weighted' for multiclass F1

    return model_pipeline, f1

In [50]:
model, f1 = train_random_forest(df)
print(f'Trained model: {model}')
print(f'F1 score: {f1}')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['antibiotic'] = df['antibiotic'].map({'meropenem': 0, 'ciprofloxacin': 1})


Trained model: Pipeline(steps=[('preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('binary', 'passthrough',
                                                  ['antibiotic', '3000830',
                                                   '3000206', '3006880',
                                                   '3000676', '3003576',
                                                   '3001216', '3000237',
                                                   '3003548', '3001889',
                                                   '3003652', '3003899',
                                                   '3006228', '3003900',
                                                   '3006881', '3001866',
                                                   '3003479', '3000166',
                                                   '3002540', '3006878',
                                                   '3006874', '3000168',
          

In [20]:
df1.head()

Unnamed: 0,genus,species,phenotype,mic,3005053,3000830,3003838,3000508,3003890,3000491,...,3007751-D87Y,3003926-D87Y,3003709-G46S,3004851-A39T,3004832-A501P,3003381-R20H,3003926-S83I,3003381-G121D,3004832-T483S,3004832-A311V
0,Salmonella,enterica,Susceptible,0.015,1.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Salmonella,enterica,Susceptible,0.015,1.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Neisseria,gonorrhoeae,Resistant,8.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Salmonella,enterica,Susceptible,0.015,1.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Salmonella,enterica,Susceptible,0.015,1.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Random projections + Logistic Regression

In [17]:
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd

def apply_random_projection_logistic_regression(df, target_col='mic', n_components=50, test_size=0.2, random_state=42):
    """
    Apply logistic multinomial regression after performing Gaussian and Sparse random projections, 
    with preprocessing for binary and multiclass features.
    
    Parameters:
    df (DataFrame): Input DataFrame containing features and labels.
    target_col (str): Name of the target column to encode and predict.
    n_components (int): Number of components to keep after random projection.
    test_size (float): Proportion of the dataset to include in the test split.
    random_state (int): Random seed for reproducibility.

    Returns:
    None
    """
    
    # Step 1: Drop NA values
    df = df.dropna()

    # Step 2: Separate features and labels
    X = df.drop(columns=[target_col, 'phenotype'])  # Drop target and other irrelevant columns
    y = df[target_col]  # Target label
    
    # Step 3: Encode the target label (e.g., 'mic') using LabelEncoder
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # Step 4: Detect binary and multiclass features
    binary_cols = [col for col in X.columns if set(X[col].unique()) <= {0, 1}]  # Binary features (0 or 1)
    multiclass_cols = [col for col in X.columns if len(X[col].unique()) > 2]  # Multiclass features
    
    print(f"Binary columns: {binary_cols}")
    print(f"Multiclass columns: {multiclass_cols}")

    # Step 5: Preprocessing pipeline
    # Binary columns will pass through without modification, multiclass columns will be one-hot encoded
    preprocessor = ColumnTransformer(
        transformers=[
            ('binary', 'passthrough', binary_cols),  # Pass through binary columns
            ('multiclass', OneHotEncoder(), multiclass_cols)  # One-hot encode multiclass columns
        ], remainder='passthrough')  # Pass through any remaining columns (if any)

    # Step 6: Create a preprocessing pipeline that first applies transformations
    pipeline = Pipeline(steps=[('preprocessing', preprocessor)])

    # Apply the preprocessing pipeline to transform X
    X_transformed = pipeline.fit_transform(X)

    # Step 7: Perform Gaussian Random Projection
    gaussian_rp = GaussianRandomProjection(n_components=n_components)
    X_projected_gaussian = gaussian_rp.fit_transform(X_transformed)

    # Step 8: Train-Test Split for Gaussian Random Projection
    X_train_gaussian, X_test_gaussian, y_train_gaussian, y_test_gaussian = train_test_split(
        X_projected_gaussian, y_encoded, test_size=test_size, random_state=random_state)

    # Step 9: Apply Multinomial Logistic Regression for Gaussian RP
    logreg_gaussian = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
    logreg_gaussian.fit(X_train_gaussian, y_train_gaussian)
    
    # Step 10: Make predictions and evaluate for Gaussian RP
    y_pred_gaussian = logreg_gaussian.predict(X_test_gaussian)
    print("Gaussian Random Projection Logistic Regression Performance:")
    print(classification_report(y_test_gaussian, y_pred_gaussian))

    # Step 11: Perform Sparse Random Projection
    sparse_rp = SparseRandomProjection(n_components=n_components)
    X_projected_sparse = sparse_rp.fit_transform(X_transformed)

    # Step 12: Train-Test Split for Sparse Random Projection
    X_train_sparse, X_test_sparse, y_train_sparse, y_test_sparse = train_test_split(
        X_projected_sparse, y_encoded, test_size=test_size, random_state=random_state)

    # Step 13: Apply Multinomial Logistic Regression for Sparse RP
    logreg_sparse = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
    logreg_sparse.fit(X_train_sparse, y_train_sparse)
    
    # Step 14: Make predictions and evaluate for Sparse RP
    y_pred_sparse = logreg_sparse.predict(X_test_sparse)
    print("Sparse Random Projection Logistic Regression Performance:")
    print(classification_report(y_test_sparse, y_pred_sparse))

In [19]:
apply_random_projection_logistic_regression(df1, target_col='mic', n_components=50)

Binary columns: ['3000830', '3003838', '3003890', '3000491', '3000833', '3000832', '3000206', '3006880', '3000502', '3000676', '3000027', '3000074', '3000165', '3002660', '3002639', '3000412', '3000410', '3005010', '3002605', '3002860', '3000316', '3003839', '3000796', '3000795', '3003576', '3003952', '3000794', '3003843', '3001216', '3003950', '3000237', '3003308', '3002909', '3000216', '3003550', '3003548', '3001889', '3001877', '3003889', '3003652', '3003899', '3000873', '3004623', '3001878', '3000777', '3001917', '3004597', '3006228', '3003900', '3006881', '3001866', '3004621', '3003479', '3000166', '3002540', '3006878', '3006874', '3002734', '3000168', '3002683', '3004290', '3006875', '3007014', '3003923', '3003209', '3005059', '3003922', '3004588', '3005047', '3007509', '3002861', '3007453', '3002872', '3002132', '3004111', '3002718', '3002641', '3005116', '3001396', '3002676', '3002616', '3004550', '3004122', '3001070', '3005044', '3007682', '3002848', '3001059', '3003307', '300

ValueError: could not convert string to float: 'Salmonella'