In [2]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.12.4 imblearn-0.0


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, cross_val_score
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputClassifier
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import FunctionTransformer
from scipy.stats import randint, uniform

def mic_transform(mic_values):
    log_mic = np.log2(mic_values)
    rounded_mic = np.round(log_mic).clip(-7, 7)
    return rounded_mic

mic_transformer = FunctionTransformer(mic_transform)



def apply_smote_for_multioutput(
    X: pd.DataFrame, 
    y_mic: np.ndarray, 
    y_phenotype: np.ndarray, 
    preprocessor: ColumnTransformer, 
    label_encoder_mic: LabelEncoder, 
) -> tuple:
    """
    Applies SMOTE to resample X and multi-output targets (y_mic and y_phenotype).
    
    Args:
        X (pd.DataFrame): Features.
        y_mic (np.ndarray): Encoded mic target.
        y_phenotype (np.ndarray): Encoded phenotype target.
        preprocessor (ColumnTransformer): Preprocessing pipeline for X.
        label_encoder_mic (LabelEncoder): Label encoder for mic.

    Returns:
        tuple: Resampled X and y as (X_resampled, y_resampled) with y_resampled split into columns.
    """
    # Combine `y_mic` and `y_phenotype` into a single target for SMOTE
    y_combined = np.array([f"{mic}_{phenotype}" for mic, phenotype in zip(y_mic, y_phenotype)])

    # Apply SMOTE to resample
    smote = SMOTE(random_state=42)
    X_resampled, y_combined_resampled = smote.fit_resample(preprocessor.fit_transform(X), y_combined)

    # Split `y_combined_resampled` back into separate `mic` and `phenotype` labels
    y_mic_resampled = [int(label.split("_")[0]) for label in y_combined_resampled]
    y_phenotype_resampled = [int(label.split("_")[1]) for label in y_combined_resampled]

    # Convert back to original format if encoded
    y_mic_resampled = label_encoder_mic.inverse_transform(y_mic_resampled)
    y_phenotype_resampled = label_encoder_phenotype.inverse_transform(y_phenotype_resampled)

    # Create the final resampled target array with two columns
    y_resampled = np.column_stack([y_mic_resampled, y_phenotype_resampled])

    return X_resampled, y_resampled

def train_classifiers_with_multioutput(df, mic_as_continuous=True):
    """
    Function to train KNN, RandomForest, and SVM classifiers for multi-output prediction (predicting both mic and phenotype),
    and combine them using StackingClassifier with a neural network as the meta-model.
    
    Args:
        df: Input pandas DataFrame with features and labels (mic and phenotype).
        
    Returns:
        metrics_df: DataFrame containing precision, recall, F1 score, and accuracy for each fold.
    """
    # Separate features and labels
    X = df.drop(columns=['mic', 'phenotype', 'antibiotic'])
    y_mic = df['mic']
    y_phenotype = df['phenotype']

    # Apply log2 transformation to 'mic'
    y_mic_transformed = mic_transformer.fit_transform(y_mic.values.reshape(-1, 1)).ravel()

    if not mic_as_continuous:
        # Encode 'mic' using LabelEncoder
        label_encoder = LabelEncoder()
        y_mic_transformed = label_encoder.fit_transform(y_mic_transformed)

    # Detect binary and multiclass features in X
    binary_cols = [col for col in X.columns if set(X[col].unique()) <= {0, 1}]
    multiclass_cols = [col for col in X.columns if len(X[col].unique()) > 2]

    # Preprocessing pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('binary', 'passthrough', binary_cols),
            ('multiclass', OneHotEncoder(handle_unknown='ignore'), multiclass_cols)
        ], remainder='passthrough')

    # Create base models
    rf_model = Pipeline(steps=[
        ('preprocessing', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42))
    ])

    svm_model = Pipeline(steps=[
        ('preprocessing', preprocessor),
        ('classifier', SVC(random_state=42, probability=True))
    ])

    knn_model = Pipeline(steps=[
        ('preprocessing', preprocessor),
        ('classifier', KNeighborsClassifier())
    ])

    # Meta-classifier: neural network
    meta_classifier = MLPClassifier(random_state=42)

    # StackingClassifier: combines base models
    stacking_model = StackingClassifier(
        estimators=[
            ('rf', rf_model),
            ('svm', svm_model),
            ('knn', knn_model)
        ],
        final_estimator=meta_classifier,
        cv=StratifiedKFold(n_splits=5)
    )

    # Hyperparameter tuning for base models and meta-model
    param_grid = {
        'rf__classifier__n_estimators': randint(50, 200),
        'rf__classifier__max_depth': randint(3, 10),
        'svm__classifier__C': uniform(0.1, 10),
        'svm__classifier__gamma': ['scale', 'auto'],
        'knn__classifier__n_neighbors': randint(3, 20),
        'final_estimator__hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'final_estimator__alpha': uniform(0.0001, 0.01)
    }

    # MultiOutputClassifier to handle multi-output prediction
    multioutput_model = MultiOutputClassifier(stacking_model)

    # Combine both mic and phenotype into a single target array
    y_combined = np.column_stack([y_mic_transformed, y_phenotype])

    # Apply SMOTE for oversampling
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(preprocessor.fit_transform(X), y_combined)

    # Nested cross-validation
    outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Hyperparameter tuning with RandomizedSearchCV
    search = RandomizedSearchCV(multioutput_model, param_distributions=param_grid, n_iter=50, cv=inner_cv, random_state=42, n_jobs=-1)

    # Custom scorer for multi-output metrics
    def multioutput_metrics(estimator, X, y):
        y_pred = estimator.predict(X)
        precision_mic = precision_score(y[:, 0], y_pred[:, 0], average='weighted')
        recall_mic = recall_score(y[:, 0], y_pred[:, 0], average='weighted')
        f1_mic = f1_score(y[:, 0], y_pred[:, 0], average='weighted')
        accuracy_mic = accuracy_score(y[:, 0], y_pred[:, 0])
        
        precision_phenotype = precision_score(y[:, 1], y_pred[:, 1], average='weighted')
        recall_phenotype = recall_score(y[:, 1], y_pred[:, 1], average='weighted')
        f1_phenotype = f1_score(y[:, 1], y_pred[:, 1], average='weighted')
        accuracy_phenotype = accuracy_score(y[:, 1], y_pred[:, 1])
        
        return {
            'precision_mic': precision_mic,
            'recall_mic': recall_mic,
            'f1_mic': f1_mic,
            'accuracy_mic': accuracy_mic,
            'precision_phenotype': precision_phenotype,
            'recall_phenotype': recall_phenotype,
            'f1_phenotype': f1_phenotype,
            'accuracy_phenotype': accuracy_phenotype
        }

    # Perform nested cross-validation and collect metrics
    metrics_list = []
    for train_idx, test_idx in outer_cv.split(X_resampled, y_resampled[:, 1]):
        X_train, X_test = X_resampled[train_idx], X_resampled[test_idx]
        y_train, y_test = y_resampled[train_idx], y_resampled[test_idx]
        
        search.fit(X_train, y_train)
        best_model = search.best_estimator_
        
        metrics = multioutput_metrics(best_model, X_test, y_test)
        metrics_list.append(metrics)

    # Convert metrics to DataFrame and write to CSV
    metrics_df = pd.DataFrame(metrics_list)
    metrics_df.to_csv('model_metrics.csv', index=False)

    return metrics_df

# Example usage
# df = pd.read_csv('your_dataset.csv')
# metrics_df = train_classifiers_with_multioutput(df)
# print(metrics_df)

In [26]:
df = pd.read_csv('./../../DataSets/group-2/data/combined_antibiotic_resistance.tsv', sep='\t')

  df = pd.read_csv('./../../DataSets/group-2/data/combined_antibiotic_resistance.tsv', sep='\t')


In [27]:
df.head()

Unnamed: 0,antibiotic,accession,genus,species,phenotype,mic,3005053,3000830,3003838,3000508,...,3007751-D87Y,3003926-D87Y,3003709-G46S,3004851-A39T,3004832-A501P,3003381-R20H,3003926-S83I,3003381-G121D,3004832-T483S,3004832-A311V
0,meropenem,GCA_002947415,Acinetobacter,baumannii,Resistant,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,meropenem,GCA_002947845,Acinetobacter,baumannii,Resistant,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,meropenem,GCA_002948925,Acinetobacter,baumannii,Resistant,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,meropenem,GCA_002996805,Acinetobacter,baumannii,Resistant,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,meropenem,GCA_003006035,Acinetobacter,baumannii,Resistant,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
df['species'].describe()

count         7772
unique           8
top       enterica
freq          2841
Name: species, dtype: object

In [31]:
df.species.value_counts()

species
enterica       2841
pneumoniae     2638
aeruginosa      652
jejuni          547
coli            480
gonorrhoeae     277
baumannii       269
cloacae          68
Name: count, dtype: int64

In [None]:
train_classifiers_with_multioutput(df)

  df = pd.read_csv(df_path, sep='\t')


TypeError: '<' not supported between instances of 'str' and 'float'