# PCA

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC

In [3]:
# Load the "loose" joined, bio-filtered resistance table (gzip-compressed TSV).
df = pd.read_csv(
    "/files/camda2024/resistence/git/DataSets/SelectedVariables/ResistanceJoinedLooseBiofiltered.tsv.gz",
    sep="\t",
    compression="gzip",
)
df

  df = pd.read_csv("/files/camda2024/resistence/git/DataSets/SelectedVariables/ResistanceJoinedLooseBiofiltered.tsv.gz", sep="\t", compression="gzip")


Unnamed: 0.1,Unnamed: 0,accession,genus,species,antibiotic,phenotype,measurement_value,3000502,3000813,3000378,...,3007751-S91I,3007751-T86I,3007751-P116A,3007751-T87I,3007751-S80I,3007751-N87I,3004562-M593T,3003294-D105E,3003304-E540V,3003304-E504V
0,0,GCA_002947415,Acinetobacter,baumannii,meropenem,Resistant,8.0,6,10,6,...,0,0,0,0,0,0,0,0,0,0
1,1,GCA_002947845,Acinetobacter,baumannii,meropenem,Resistant,8.0,5,8,5,...,0,0,0,0,0,0,0,0,0,0
2,2,GCA_002948925,Acinetobacter,baumannii,meropenem,Resistant,8.0,4,9,5,...,0,0,0,0,0,0,0,0,0,0
3,3,GCA_002996805,Acinetobacter,baumannii,meropenem,Resistant,8.0,5,8,6,...,0,0,0,0,0,0,0,0,0,0
4,4,GCA_003006035,Acinetobacter,baumannii,meropenem,Resistant,8.0,7,11,4,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7767,7767,SRR3242343,Salmonella,enterica,ciprofloxacin,,,6,10,7,...,0,0,0,0,0,0,0,0,0,0
7768,7768,SRR2082856,Salmonella,enterica,ciprofloxacin,,,9,13,4,...,0,0,0,0,0,0,0,0,0,0
7769,7769,SRR3242362,Salmonella,enterica,ciprofloxacin,,,14,11,5,...,0,0,0,0,0,0,0,0,0,0
7770,7770,SRR1257300,Salmonella,enterica,ciprofloxacin,,,11,13,8,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Benchmark a panel of classifiers on PCA-compressed features of the global `df`
# (the "loose" joined table loaded by the previous cell).
#
# Steps: split labelled / unlabelled rows -> standardise -> PCA keeping enough
# components for VARIANCE_TARGET of the variance -> 5-fold stratified CV per
# classifier -> confusion-matrix PNG, predictions CSV for the unlabelled rows,
# and one summary CSV.  All artefacts are written to `output_folder`.
#
# NOTE(review): the scaler and PCA are fitted on the whole labelled set before
# cross-validation, so each CV fold has already "seen" its held-out rows during
# preprocessing.  The reported scores may be slightly optimistic; wrapping
# scaler + PCA + classifier in a Pipeline would remove this.

# --- Tunable settings -------------------------------------------------------
SEED = 42
N_SPLITS = 5
VARIANCE_TARGET = 0.95  # fraction of variance the kept components must explain

# Define the algorithms
algorithms = {
    "Random Forest (1200 trees)": RandomForestClassifier(n_estimators=1200, random_state=SEED),
    "Random Forest (500 trees)": RandomForestClassifier(n_estimators=500, random_state=SEED),
    "KNN (k=1)": KNeighborsClassifier(n_neighbors=1),
    "KNN (k=3)": KNeighborsClassifier(n_neighbors=3),
    "KNN (k=5)": KNeighborsClassifier(n_neighbors=5),
    "SVM (RBF Kernel)": SVC(kernel='rbf', gamma='auto', C=1, random_state=SEED),
    "SVM (Linear Kernel)": SVC(kernel='linear', gamma='auto', C=1, random_state=SEED),
    "SVM (Polynomial Kernel)": SVC(kernel='poly', gamma='auto', C=1, random_state=SEED),
    "MLP (1 Hidden Layer, 200 neurons)": MLPClassifier(hidden_layer_sizes=(200,), max_iter=1000, random_state=SEED),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=SEED),
    "Gaussian NB": GaussianNB()
}

results = {}

# Define the output folder
output_folder = "/home/haydeeperuyero/camda24/hackaton/output_pca"
os.makedirs(output_folder, exist_ok=True)

# The metadata columns come first and end with 'measurement_value' (see the
# table displayed above); every column after it is a feature.  Deriving the
# boundary from the column name avoids a hard-coded offset that silently breaks
# when a table has a different number of leading columns.
n_meta_cols = df.columns.get_loc('measurement_value') + 1

# Labelled rows are used for training; rows without a phenotype are the test set
df_training = df[df['phenotype'].isin(['Susceptible', 'Resistant'])]
df_test = df[df['phenotype'].isna()]

# Feature columns for both subsets
df_numeric_training = df_training.iloc[:, n_meta_cols:]
df_numeric_test = df_test.iloc[:, n_meta_cols:]

# Scale the training data
scaler = StandardScaler()
X_scaled_training = scaler.fit_transform(df_numeric_training)

# Fit a full PCA once to inspect the cumulative explained variance
pca = PCA()
pca.fit(X_scaled_training)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# Smallest number of components whose cumulative variance reaches the target.
# searchsorted returns the first index with cumulative_variance >= target; the
# clamp keeps the value valid even if rounding leaves the total just below it.
n_components = min(
    int(np.searchsorted(cumulative_variance, VARIANCE_TARGET)) + 1,
    len(cumulative_variance),
)

# Refit with the chosen size; the seed makes the fit reproducible when
# scikit-learn selects the randomized SVD solver
pca = PCA(n_components=n_components, random_state=SEED)
X_pca_training = pca.fit_transform(X_scaled_training)
X_pca_df_training = pd.DataFrame(X_pca_training, index=df_training.index)

# Metadata + principal components, kept for inspection in later cells
df_pca_training = pd.concat([df_training.iloc[:, :n_meta_cols], X_pca_df_training], axis=1)

# Project the test rows with the scaler and PCA fitted on the training rows
X_scaled_test = scaler.transform(df_numeric_test)
X_pca_test = pca.transform(X_scaled_test)
X_pca_df_test = pd.DataFrame(X_pca_test, index=df_test.index)
df_pca_test = pd.concat([df_test.iloc[:, :n_meta_cols], X_pca_df_test], axis=1)

# Model inputs are exactly the principal-component frames
df_filtrado_training = X_pca_df_training
df_filtrado_test = X_pca_df_test

# Encode the labels
labels = df_pca_training['phenotype']
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Configure cross-validation.  With an integer random_state the splitter yields
# the same folds on every call, so the held-out indices can be listed once and
# reused to score the out-of-fold predictions.
cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
fold_test_indices = [test_idx for _, test_idx in cv.split(df_filtrado_training, encoded_labels)]

for algo_name, algo in algorithms.items():
    # One cross-validation pass: out-of-fold predictions for every training row.
    # Per-fold accuracy / F1 and the confusion matrix are all computed from
    # these predictions instead of re-training the model for each metric.
    y_pred = cross_val_predict(algo, df_filtrado_training, encoded_labels, cv=cv)
    accuracy_scores = np.array([
        accuracy_score(encoded_labels[idx], y_pred[idx]) for idx in fold_test_indices
    ])
    f1_scores = np.array([
        f1_score(encoded_labels[idx], y_pred[idx], average='weighted') for idx in fold_test_indices
    ])
    cm = confusion_matrix(encoded_labels, y_pred)

    # Save the results
    results[algo_name] = {
        'accuracy_mean': accuracy_scores.mean(),
        'accuracy_std': accuracy_scores.std(),
        'f1_score_mean': f1_scores.mean(),
        'f1_score_std': f1_scores.std(),
        'confusion_matrix': cm
    }

    # Plot and save the confusion matrix
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_, ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title(f'Confusion Matrix - {algo_name}')
    fig.savefig(os.path.join(output_folder, algo_name + "_confusion_matrix.png"))
    plt.close(fig)  # release the figure; one is created per classifier

    # Train the final model on all training data and predict the unlabelled rows
    algo.fit(df_filtrado_training, encoded_labels)
    test_predictions = algo.predict(df_filtrado_test)

    # Save the predictions to a CSV file
    test_predictions_labels = label_encoder.inverse_transform(test_predictions)
    test_predictions_df = pd.DataFrame({
        'genus': df_test.genus,
        'species': df_test.species,
        'accession': df_test.accession,
        'antibiotic': df_test.antibiotic,
        'status': test_predictions_labels
    })
    test_predictions_df.to_csv(os.path.join(output_folder, algo_name + "_test_predictions.csv"), index=False)

# Save overall results to CSV
results_df = pd.DataFrame.from_dict(results, orient='index')
results_df.to_csv(os.path.join(output_folder, "overall_results_PCA.csv"))

results_df


Unnamed: 0,accuracy_mean,accuracy_std,f1_score_mean,f1_score_std,confusion_matrix
Random Forest (1200 trees),0.908435,0.011228,0.906299,0.011696,"[[1023, 355], [190, 4384]]"
Random Forest (500 trees),0.906923,0.011106,0.904792,0.011804,"[[1021, 357], [197, 4377]]"
KNN (k=1),0.894153,0.011404,0.893253,0.011257,"[[1030, 348], [282, 4292]]"
KNN (k=3),0.90121,0.008765,0.89933,0.009493,"[[1017, 361], [227, 4347]]"
KNN (k=5),0.901715,0.005609,0.8997,0.006144,"[[1013, 365], [220, 4354]]"
SVM (RBF Kernel),0.886764,0.00997,0.881276,0.010803,"[[883, 495], [179, 4395]]"
SVM (Linear Kernel),0.882731,0.009142,0.879788,0.010724,"[[944, 434], [264, 4310]]"
SVM (Polynomial Kernel),0.888611,0.010535,0.885478,0.010961,"[[948, 430], [233, 4341]]"
"MLP (1 Hidden Layer, 200 neurons)",0.894155,0.011662,0.893724,0.011677,"[[1048, 330], [300, 4274]]"
Logistic Regression,0.882899,0.007528,0.880789,0.008555,"[[966, 412], [285, 4289]]"


# Joined strict

In [9]:
# Load the "strict" joined, bio-filtered resistance table (gzip-compressed TSV).
df = pd.read_csv(
    "/files/camda2024/resistence/git/DataSets/SelectedVariables/ResistanceJoinedStrictBiofiltered.tsv.gz",
    sep="\t",
    compression="gzip",
)
df

  df = pd.read_csv("/files/camda2024/resistence/git/DataSets/SelectedVariables/ResistanceJoinedStrictBiofiltered.tsv.gz", sep="\t", compression="gzip")


Unnamed: 0.1,Unnamed: 0,accession,genus,species,antibiotic,phenotype,measurement_value,3000502,3000499,3000656,...,3007751-S91I,3007751-T86I,3007751-P116A,3007751-T87I,3007751-S80I,3007751-N87I,3004562-M593T,3003294-D105E,3003304-E540V,3003304-E504V
0,0,GCA_002947415,Acinetobacter,baumannii,meropenem,Resistant,8.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,GCA_002947845,Acinetobacter,baumannii,meropenem,Resistant,8.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,GCA_002948925,Acinetobacter,baumannii,meropenem,Resistant,8.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,GCA_002996805,Acinetobacter,baumannii,meropenem,Resistant,8.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,GCA_003006035,Acinetobacter,baumannii,meropenem,Resistant,8.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7767,7767,SRR3242343,Salmonella,enterica,ciprofloxacin,,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7768,7768,SRR2082856,Salmonella,enterica,ciprofloxacin,,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7769,7769,SRR3242362,Salmonella,enterica,ciprofloxacin,,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7770,7770,SRR1257300,Salmonella,enterica,ciprofloxacin,,,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Benchmark a panel of classifiers on PCA-compressed features of the global `df`
# (the "strict" joined table loaded by the previous cell).
#
# Steps: split labelled / unlabelled rows -> standardise -> PCA keeping enough
# components for VARIANCE_TARGET of the variance -> 5-fold stratified CV per
# classifier -> confusion-matrix PNG, predictions CSV for the unlabelled rows,
# and one summary CSV.  All artefacts are written to `output_folder`.
#
# NOTE(review): the scaler and PCA are fitted on the whole labelled set before
# cross-validation, so each CV fold has already "seen" its held-out rows during
# preprocessing.  The reported scores may be slightly optimistic; wrapping
# scaler + PCA + classifier in a Pipeline would remove this.

# --- Tunable settings -------------------------------------------------------
SEED = 42
N_SPLITS = 5
VARIANCE_TARGET = 0.95  # fraction of variance the kept components must explain

# Define the algorithms
algorithms = {
    "Random Forest (1200 trees)": RandomForestClassifier(n_estimators=1200, random_state=SEED),
    "Random Forest (500 trees)": RandomForestClassifier(n_estimators=500, random_state=SEED),
    "KNN (k=1)": KNeighborsClassifier(n_neighbors=1),
    "KNN (k=3)": KNeighborsClassifier(n_neighbors=3),
    "KNN (k=5)": KNeighborsClassifier(n_neighbors=5),
    "SVM (RBF Kernel)": SVC(kernel='rbf', gamma='auto', C=1, random_state=SEED),
    "SVM (Linear Kernel)": SVC(kernel='linear', gamma='auto', C=1, random_state=SEED),
    "SVM (Polynomial Kernel)": SVC(kernel='poly', gamma='auto', C=1, random_state=SEED),
    "MLP (1 Hidden Layer, 200 neurons)": MLPClassifier(hidden_layer_sizes=(200,), max_iter=1000, random_state=SEED),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=SEED),
    "Gaussian NB": GaussianNB()
}

results = {}

# Define the output folder
output_folder = "/home/haydeeperuyero/camda24/hackaton/output_pca_strict"
os.makedirs(output_folder, exist_ok=True)

# The metadata columns come first and end with 'measurement_value' (see the
# table displayed above); every column after it is a feature.  Deriving the
# boundary from the column name avoids a hard-coded offset that silently breaks
# when a table has a different number of leading columns.
n_meta_cols = df.columns.get_loc('measurement_value') + 1

# Labelled rows are used for training; rows without a phenotype are the test set
df_training = df[df['phenotype'].isin(['Susceptible', 'Resistant'])]
df_test = df[df['phenotype'].isna()]

# Feature columns for both subsets
df_numeric_training = df_training.iloc[:, n_meta_cols:]
df_numeric_test = df_test.iloc[:, n_meta_cols:]

# Scale the training data
scaler = StandardScaler()
X_scaled_training = scaler.fit_transform(df_numeric_training)

# Fit a full PCA once to inspect the cumulative explained variance
pca = PCA()
pca.fit(X_scaled_training)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# Smallest number of components whose cumulative variance reaches the target.
# searchsorted returns the first index with cumulative_variance >= target; the
# clamp keeps the value valid even if rounding leaves the total just below it.
n_components = min(
    int(np.searchsorted(cumulative_variance, VARIANCE_TARGET)) + 1,
    len(cumulative_variance),
)

# Refit with the chosen size; the seed makes the fit reproducible when
# scikit-learn selects the randomized SVD solver
pca = PCA(n_components=n_components, random_state=SEED)
X_pca_training = pca.fit_transform(X_scaled_training)
X_pca_df_training = pd.DataFrame(X_pca_training, index=df_training.index)

# Metadata + principal components, kept for inspection in later cells
df_pca_training = pd.concat([df_training.iloc[:, :n_meta_cols], X_pca_df_training], axis=1)

# Project the test rows with the scaler and PCA fitted on the training rows
X_scaled_test = scaler.transform(df_numeric_test)
X_pca_test = pca.transform(X_scaled_test)
X_pca_df_test = pd.DataFrame(X_pca_test, index=df_test.index)
df_pca_test = pd.concat([df_test.iloc[:, :n_meta_cols], X_pca_df_test], axis=1)

# Model inputs are exactly the principal-component frames
df_filtrado_training = X_pca_df_training
df_filtrado_test = X_pca_df_test

# Encode the labels
labels = df_pca_training['phenotype']
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Configure cross-validation.  With an integer random_state the splitter yields
# the same folds on every call, so the held-out indices can be listed once and
# reused to score the out-of-fold predictions.
cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
fold_test_indices = [test_idx for _, test_idx in cv.split(df_filtrado_training, encoded_labels)]

for algo_name, algo in algorithms.items():
    # One cross-validation pass: out-of-fold predictions for every training row.
    # Per-fold accuracy / F1 and the confusion matrix are all computed from
    # these predictions instead of re-training the model for each metric.
    y_pred = cross_val_predict(algo, df_filtrado_training, encoded_labels, cv=cv)
    accuracy_scores = np.array([
        accuracy_score(encoded_labels[idx], y_pred[idx]) for idx in fold_test_indices
    ])
    f1_scores = np.array([
        f1_score(encoded_labels[idx], y_pred[idx], average='weighted') for idx in fold_test_indices
    ])
    cm = confusion_matrix(encoded_labels, y_pred)

    # Save the results
    results[algo_name] = {
        'accuracy_mean': accuracy_scores.mean(),
        'accuracy_std': accuracy_scores.std(),
        'f1_score_mean': f1_scores.mean(),
        'f1_score_std': f1_scores.std(),
        'confusion_matrix': cm
    }

    # Plot and save the confusion matrix
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_, ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title(f'Confusion Matrix - {algo_name}')
    fig.savefig(os.path.join(output_folder, algo_name + "_confusion_matrix.png"))
    plt.close(fig)  # release the figure; one is created per classifier

    # Train the final model on all training data and predict the unlabelled rows
    algo.fit(df_filtrado_training, encoded_labels)
    test_predictions = algo.predict(df_filtrado_test)

    # Save the predictions to a CSV file
    test_predictions_labels = label_encoder.inverse_transform(test_predictions)
    test_predictions_df = pd.DataFrame({
        'genus': df_test.genus,
        'species': df_test.species,
        'accession': df_test.accession,
        'antibiotic': df_test.antibiotic,
        'status': test_predictions_labels
    })
    test_predictions_df.to_csv(os.path.join(output_folder, algo_name + "_test_predictions.csv"), index=False)

# Save overall results to CSV
results_df = pd.DataFrame.from_dict(results, orient='index')
results_df.to_csv(os.path.join(output_folder, "overall_results_PCA.csv"))

results_df


Unnamed: 0,accuracy_mean,accuracy_std,f1_score_mean,f1_score_std,confusion_matrix
Random Forest (1200 trees),0.870299,0.00768,0.863992,0.01003,"[[838, 540], [232, 4342]]"
Random Forest (500 trees),0.869795,0.008469,0.86316,0.011215,"[[831, 547], [228, 4346]]"
KNN (k=1),0.815025,0.018649,0.82565,0.016508,"[[1154, 224], [877, 3697]]"
KNN (k=3),0.830985,0.012763,0.83926,0.011406,"[[1133, 245], [761, 3813]]"
KNN (k=5),0.843251,0.014887,0.850008,0.013774,"[[1128, 250], [683, 3891]]"
SVM (RBF Kernel),0.868114,0.008991,0.859591,0.011594,"[[789, 589], [196, 4378]]"
SVM (Linear Kernel),0.867105,0.006193,0.859532,0.009307,"[[807, 571], [220, 4354]]"
SVM (Polynomial Kernel),0.868954,0.007931,0.861111,0.010264,"[[803, 575], [205, 4369]]"
"MLP (1 Hidden Layer, 200 neurons)",0.87265,0.006442,0.8678,0.007077,"[[871, 507], [251, 4323]]"
Logistic Regression,0.869458,0.00694,0.862669,0.009587,"[[826, 552], [225, 4349]]"


# Pangenome Meropenem

In [25]:
# Read the meropenem pangenome count table (tab-separated) with polars.
# `pl` is imported in the notebook's top import cell; the next cell converts
# this frame to pandas.
df_polar = pl.read_csv(
    "/files/camda2024/resistence/git/DataSets/PangenomeCountMeropenem.tsv",
    separator='\t',
)
df_polar

accession,genus,species,antibiotic,phenotype,measurement_value,ERR1204814_CDS_0064,ERR1204814_CDS_0160,ERR1204814_CDS_0320,ERR1204814_CDS_0448,ERR1204814_CDS_0576,ERR1204814_CDS_0672,ERR1204814_CDS_0800,ERR1204814_CDS_0832,ERR1204814_CDS_0896,ERR1204814_CDS_1056,ERR1204814_CDS_1120,ERR1204814_CDS_1184,ERR1204814_CDS_1216,ERR1204814_CDS_1376,ERR1204814_CDS_1568,ERR1204814_CDS_1696,ERR1204814_CDS_1728,ERR1204814_CDS_1792,ERR1204814_CDS_1856,ERR1204814_CDS_1888,ERR1204814_CDS_1920,ERR1204814_CDS_1984,ERR1204814_CDS_2016,ERR1204814_CDS_2048,ERR1204814_CDS_2080,ERR1204814_CDS_2112,ERR1204814_CDS_2176,ERR1204814_CDS_2208,ERR1204814_CDS_2304,ERR1204814_CDS_2496,ERR1204814_CDS_2560,…,SRR1056064_CDS_0250,SRR1056064_CDS_1338,SRR1056064_CDS_2458,SRR1056064_CDS_2490,SRR1056064_CDS_2842,SRR1056099_CDS_2752,SRR1056099_CDS_4288,SRR1056110_CDS_5171,SRR1056110_CDS_5747,SRR1056110_CDS_5779,SRR1056111_CDS_0933,SRR1056143_CDS_3153,SRR1056143_CDS_3281,SRR1056143_CDS_3345,SRR1056143_CDS_3473,SRR1056143_CDS_3537,SRR1056143_CDS_3601,SRR1056143_CDS_3665,SRR1056143_CDS_3697,SRR1056143_CDS_3825,SRR1056143_CDS_3857,SRR1056143_CDS_3953,SRR1056144_CDS_0752,SRR1056144_CDS_0880,SRR1056164_CDS_3874,SRR1056164_CDS_4290,SRR1056164_CDS_4322,SRR1056164_CDS_4770,SRR1056164_CDS_5442,SRR1056164_CDS_5666,SRR1056164_CDS_5698,SRR1056164_CDS_5730,SRR1056169_CDS_5064,SRR1056169_CDS_5160,SRR1056169_CDS_5544,SRR1056169_CDS_5736,SRR1056169_CDS_5832
str,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""GCA_002947415""","""Acinetobacter""","""baumannii""","""meropenem""","""Resistant""",8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""GCA_002947845""","""Acinetobacter""","""baumannii""","""meropenem""","""Resistant""",8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""GCA_002948925""","""Acinetobacter""","""baumannii""","""meropenem""","""Resistant""",8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""GCA_002996805""","""Acinetobacter""","""baumannii""","""meropenem""","""Resistant""",8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""GCA_003006035""","""Acinetobacter""","""baumannii""","""meropenem""","""Resistant""",8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""SRR1056175""","""Pseudomonas""","""aeruginosa""","""meropenem""",,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""SRR1056177""","""Pseudomonas""","""aeruginosa""","""meropenem""",,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""SRR1056178""","""Pseudomonas""","""aeruginosa""","""meropenem""",,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
"""SRR1056179""","""Pseudomonas""","""aeruginosa""","""meropenem""",,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Convertir a pandas
df = df_polar.to_pandas()
df

Unnamed: 0,accession,genus,species,antibiotic,phenotype,measurement_value,ERR1204814_CDS_0064,ERR1204814_CDS_0160,ERR1204814_CDS_0320,ERR1204814_CDS_0448,...,SRR1056164_CDS_4770,SRR1056164_CDS_5442,SRR1056164_CDS_5666,SRR1056164_CDS_5698,SRR1056164_CDS_5730,SRR1056169_CDS_5064,SRR1056169_CDS_5160,SRR1056169_CDS_5544,SRR1056169_CDS_5736,SRR1056169_CDS_5832
0,GCA_002947415,Acinetobacter,baumannii,meropenem,Resistant,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,GCA_002947845,Acinetobacter,baumannii,meropenem,Resistant,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,GCA_002948925,Acinetobacter,baumannii,meropenem,Resistant,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,GCA_002996805,Acinetobacter,baumannii,meropenem,Resistant,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,GCA_003006035,Acinetobacter,baumannii,meropenem,Resistant,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3886,SRR1056175,Pseudomonas,aeruginosa,meropenem,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3887,SRR1056177,Pseudomonas,aeruginosa,meropenem,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3888,SRR1056178,Pseudomonas,aeruginosa,meropenem,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3889,SRR1056179,Pseudomonas,aeruginosa,meropenem,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Benchmark a panel of classifiers on PCA-compressed features of the global `df`
# (the meropenem pangenome count table converted to pandas by the previous cell).
#
# Steps: split labelled / unlabelled rows -> standardise -> PCA keeping enough
# components for VARIANCE_TARGET of the variance -> 5-fold stratified CV per
# classifier -> confusion-matrix PNG, predictions CSV for the unlabelled rows,
# and one summary CSV.  All artefacts are written to `output_folder`.
#
# NOTE(review): the scaler and PCA are fitted on the whole labelled set before
# cross-validation, so each CV fold has already "seen" its held-out rows during
# preprocessing.  The reported scores may be slightly optimistic; wrapping
# scaler + PCA + classifier in a Pipeline would remove this.

# --- Tunable settings -------------------------------------------------------
SEED = 42
N_SPLITS = 5
VARIANCE_TARGET = 0.95  # fraction of variance the kept components must explain

# Define the algorithms
algorithms = {
    "Random Forest (1200 trees)": RandomForestClassifier(n_estimators=1200, random_state=SEED),
    "Random Forest (500 trees)": RandomForestClassifier(n_estimators=500, random_state=SEED),
    "KNN (k=1)": KNeighborsClassifier(n_neighbors=1),
    "KNN (k=3)": KNeighborsClassifier(n_neighbors=3),
    "KNN (k=5)": KNeighborsClassifier(n_neighbors=5),
    "SVM (RBF Kernel)": SVC(kernel='rbf', gamma='auto', C=1, random_state=SEED),
    "SVM (Linear Kernel)": SVC(kernel='linear', gamma='auto', C=1, random_state=SEED),
    "SVM (Polynomial Kernel)": SVC(kernel='poly', gamma='auto', C=1, random_state=SEED),
    "MLP (1 Hidden Layer, 200 neurons)": MLPClassifier(hidden_layer_sizes=(200,), max_iter=1000, random_state=SEED),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=SEED),
    "Gaussian NB": GaussianNB()
}

results = {}

# Define the output folder
output_folder = "/home/haydeeperuyero/camda24/hackaton/output_pca_meropenem"
os.makedirs(output_folder, exist_ok=True)

# The metadata columns come first and end with 'measurement_value' (see the
# table displayed above); every column after it is a feature.  This table has
# six leading metadata columns, not the seven of the joined tables, so the
# boundary is derived from the column name rather than hard-coded.
n_meta_cols = df.columns.get_loc('measurement_value') + 1

# Labelled rows are used for training; rows without a phenotype are the test set
df_training = df[df['phenotype'].isin(['Susceptible', 'Resistant'])]
df_test = df[df['phenotype'].isna()]

# Feature columns for both subsets
df_numeric_training = df_training.iloc[:, n_meta_cols:]
df_numeric_test = df_test.iloc[:, n_meta_cols:]

# Scale the training data
scaler = StandardScaler()
X_scaled_training = scaler.fit_transform(df_numeric_training)

# Fit a full PCA once to inspect the cumulative explained variance
pca = PCA()
pca.fit(X_scaled_training)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# Smallest number of components whose cumulative variance reaches the target.
# searchsorted returns the first index with cumulative_variance >= target; the
# clamp keeps the value valid even if rounding leaves the total just below it.
n_components = min(
    int(np.searchsorted(cumulative_variance, VARIANCE_TARGET)) + 1,
    len(cumulative_variance),
)

# Refit with the chosen size; the seed makes the fit reproducible when
# scikit-learn selects the randomized SVD solver
pca = PCA(n_components=n_components, random_state=SEED)
X_pca_training = pca.fit_transform(X_scaled_training)
X_pca_df_training = pd.DataFrame(X_pca_training, index=df_training.index)

# Metadata + principal components, kept for inspection in later cells
df_pca_training = pd.concat([df_training.iloc[:, :n_meta_cols], X_pca_df_training], axis=1)

# Project the test rows with the scaler and PCA fitted on the training rows
X_scaled_test = scaler.transform(df_numeric_test)
X_pca_test = pca.transform(X_scaled_test)
X_pca_df_test = pd.DataFrame(X_pca_test, index=df_test.index)
df_pca_test = pd.concat([df_test.iloc[:, :n_meta_cols], X_pca_df_test], axis=1)

# Model inputs are exactly the principal-component frames
df_filtrado_training = X_pca_df_training
df_filtrado_test = X_pca_df_test

# Encode the labels
labels = df_pca_training['phenotype']
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Configure cross-validation.  With an integer random_state the splitter yields
# the same folds on every call, so the held-out indices can be listed once and
# reused to score the out-of-fold predictions.
cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
fold_test_indices = [test_idx for _, test_idx in cv.split(df_filtrado_training, encoded_labels)]

for algo_name, algo in algorithms.items():
    # One cross-validation pass: out-of-fold predictions for every training row.
    # Per-fold accuracy / F1 and the confusion matrix are all computed from
    # these predictions instead of re-training the model for each metric.
    y_pred = cross_val_predict(algo, df_filtrado_training, encoded_labels, cv=cv)
    accuracy_scores = np.array([
        accuracy_score(encoded_labels[idx], y_pred[idx]) for idx in fold_test_indices
    ])
    f1_scores = np.array([
        f1_score(encoded_labels[idx], y_pred[idx], average='weighted') for idx in fold_test_indices
    ])
    cm = confusion_matrix(encoded_labels, y_pred)

    # Save the results
    results[algo_name] = {
        'accuracy_mean': accuracy_scores.mean(),
        'accuracy_std': accuracy_scores.std(),
        'f1_score_mean': f1_scores.mean(),
        'f1_score_std': f1_scores.std(),
        'confusion_matrix': cm
    }

    # Plot and save the confusion matrix
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_, ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title(f'Confusion Matrix - {algo_name}')
    fig.savefig(os.path.join(output_folder, algo_name + "_confusion_matrix.png"))
    plt.close(fig)  # release the figure; one is created per classifier

    # Train the final model on all training data and predict the unlabelled rows
    algo.fit(df_filtrado_training, encoded_labels)
    test_predictions = algo.predict(df_filtrado_test)

    # Save the predictions to a CSV file
    test_predictions_labels = label_encoder.inverse_transform(test_predictions)
    test_predictions_df = pd.DataFrame({
        'genus': df_test.genus,
        'species': df_test.species,
        'accession': df_test.accession,
        'antibiotic': df_test.antibiotic,
        'status': test_predictions_labels
    })
    test_predictions_df.to_csv(os.path.join(output_folder, algo_name + "_test_predictions.csv"), index=False)

# Save overall results to CSV
results_df = pd.DataFrame.from_dict(results, orient='index')
results_df.to_csv(os.path.join(output_folder, "overall_results_PCA.csv"))

results_df


# Pangenome Ciprofloxacin

In [15]:
# Read the ciprofloxacin pangenome count table (tab-separated) with polars.
df_polar = pl.read_csv(
    "/files/camda2024/resistence/git/DataSets/PangenomeCountCiprofloxacin.tsv",
    separator='\t',
)
df_polar

accession,genus,species,antibiotic,phenotype,measurement_value,ERR2044062_CDS_4717,ERR2044062_CDS_4845,ERR2044062_CDS_5037,ERR2044062_CDS_5581,ERR2044062_CDS_5677,ERR2044062_CDS_5741,ERR2044062_CDS_5837,ERR2060042_CDS_5277,ERR2060042_CDS_5309,ERR2060054_CDS_3426,ERR2060054_CDS_4258,ERR2060054_CDS_4386,ERR2060054_CDS_4834,ERR2060054_CDS_5314,ERR2060069_CDS_4083,ERR2060089_CDS_4054,ERR2060089_CDS_4246,ERR2060089_CDS_4406,ERR2060095_CDS_0500,ERR349962_CDS_2499,ERR352722_CDS_2561,ERR449460_CDS_2640,SRR10257458_CDS_1510,SRR10257459_CDS_0682,SRR10257459_CDS_0778,SRR10257459_CDS_0874,SRR10257459_CDS_1066,SRR10257459_CDS_1866,SRR10257519_CDS_0268,SRR10257519_CDS_1740,SRR1752844_CDS_4825,…,SRR4420895_CDS_4835,DRR148171_CDS_1233,DRR148171_CDS_2769,DRR148274_CDS_5026,DRR148300_CDS_5388,DRR148331_CDS_3778,DRR148331_CDS_4322,DRR148331_CDS_4514,DRR148331_CDS_4962,DRR148333_CDS_0444,ERR1995431_CDS_3873,ERR1995431_CDS_4609,ERR1995442_CDS_0900,ERR2044118_CDS_5137,ERR2044118_CDS_5617,ERR2044118_CDS_5713,ERR2044126_CDS_0104,ERR2044126_CDS_0200,SRR10257457_CDS_0542,SRR10257495_CDS_0731,SRR10257495_CDS_1755,SRR10257495_CDS_1883,SRR1300638_CDS_4645,SRR1557002_CDS_4631,SRR1910773_CDS_4859,SRR1910773_CDS_4923,SRR1910773_CDS_4955,SRR2239579_CDS_2505,SRR2239579_CDS_2633,SRR2239600_CDS_3411,SRR3056915_CDS_5005,SRR3271894_CDS_0070,SRR3295785_CDS_4561,SRR3295803_CDS_0359,SRR4025989_CDS_3336,SRR4025989_CDS_4488,SRR4025989_CDS_4776
str,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""SRR3138666""","""Campylobacter""","""jejuni""","""ciprofloxacin""","""Susceptible""",0.12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""SRR3138667""","""Campylobacter""","""jejuni""","""ciprofloxacin""","""Susceptible""",0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""SRR3138668""","""Campylobacter""","""jejuni""","""ciprofloxacin""","""Susceptible""",0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""SRR3138669""","""Campylobacter""","""jejuni""","""ciprofloxacin""","""Susceptible""",0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""SRR3138670""","""Campylobacter""","""jejuni""","""ciprofloxacin""","""Susceptible""",0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""SRR3242343""","""Salmonella""","""enterica""","""ciprofloxacin""",,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""SRR2082856""","""Salmonella""","""enterica""","""ciprofloxacin""",,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""SRR3242362""","""Salmonella""","""enterica""","""ciprofloxacin""",,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""SRR1257300""","""Salmonella""","""enterica""","""ciprofloxacin""",,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Convert the Polars frame to a pandas DataFrame so the pandas/scikit-learn code below can use it.
# NOTE(review): this rebinds `df`, a name the first cells of the notebook already use for
# another table; re-running earlier cells afterwards will see whichever assignment ran last.
df = df_polar.to_pandas()
df

Unnamed: 0,accession,genus,species,antibiotic,phenotype,measurement_value,ERR2044062_CDS_4717,ERR2044062_CDS_4845,ERR2044062_CDS_5037,ERR2044062_CDS_5581,...,SRR2239579_CDS_2505,SRR2239579_CDS_2633,SRR2239600_CDS_3411,SRR3056915_CDS_5005,SRR3271894_CDS_0070,SRR3295785_CDS_4561,SRR3295803_CDS_0359,SRR4025989_CDS_3336,SRR4025989_CDS_4488,SRR4025989_CDS_4776
0,SRR3138666,Campylobacter,jejuni,ciprofloxacin,Susceptible,0.12,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,SRR3138667,Campylobacter,jejuni,ciprofloxacin,Susceptible,0.06,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,SRR3138668,Campylobacter,jejuni,ciprofloxacin,Susceptible,0.06,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,SRR3138669,Campylobacter,jejuni,ciprofloxacin,Susceptible,0.06,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,SRR3138670,Campylobacter,jejuni,ciprofloxacin,Susceptible,0.06,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3876,SRR3242343,Salmonella,enterica,ciprofloxacin,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3877,SRR2082856,Salmonella,enterica,ciprofloxacin,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3878,SRR3242362,Salmonella,enterica,ciprofloxacin,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3879,SRR1257300,Salmonella,enterica,ciprofloxacin,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Display the training frame (metadata columns followed by the PCA component columns).
# NOTE(review): in the visible part of the notebook `df_pca_training` is only assigned in the
# following cell (In [21]), and the saved output here shows a 7th leading column
# (ERR2044062_CDS_4717) whereas that cell keeps 6 -- this output appears to come from an
# earlier run; confirm the notebook survives Restart & Run All.
df_pca_training

Unnamed: 0,accession,genus,species,antibiotic,phenotype,measurement_value,ERR2044062_CDS_4717,0,1,2,...,1135,1136,1137,1138,1139,1140,1141,1142,1143,1144
0,SRR3138666,Campylobacter,jejuni,ciprofloxacin,Susceptible,0.120,0.0,-131.842585,3.102263,-69.817847,...,0.842845,-0.580427,-0.721620,0.711847,-0.024470,0.242287,1.362052,0.861218,1.196600,1.520058
1,SRR3138667,Campylobacter,jejuni,ciprofloxacin,Susceptible,0.060,0.0,-127.900150,2.897969,-64.721184,...,-0.308756,-0.747376,-0.285192,0.843814,0.466940,0.034582,-0.244550,-0.101264,0.551659,0.023565
2,SRR3138668,Campylobacter,jejuni,ciprofloxacin,Susceptible,0.060,0.0,-130.060913,3.010356,-67.523587,...,0.727880,-0.881287,0.904585,-0.129399,0.476932,-1.142547,0.859351,1.331297,-0.278336,-1.370365
3,SRR3138669,Campylobacter,jejuni,ciprofloxacin,Susceptible,0.060,0.0,-128.769795,2.944487,-65.884614,...,0.564534,-0.164454,0.331654,-0.346521,0.571657,-0.902049,0.434882,0.739398,0.210470,-0.402332
4,SRR3138670,Campylobacter,jejuni,ciprofloxacin,Susceptible,0.060,0.0,-135.085195,3.269450,-73.981375,...,-7.551632,2.305598,3.350566,-2.143093,3.630018,1.484714,3.059793,4.777714,2.348456,1.716259
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2813,SRR4449966,Salmonella,enterica,ciprofloxacin,Susceptible,0.015,0.0,43.867576,-4.306802,2.418075,...,0.211638,-1.512910,-1.558491,1.057103,1.332191,-0.843790,-0.240554,-1.436466,-0.649796,-1.457776
2814,SRR4449967,Salmonella,enterica,ciprofloxacin,Susceptible,0.015,0.0,42.057996,-3.182689,2.122539,...,0.506787,-0.097819,-0.870149,-0.114540,-0.240254,-0.891130,-0.792517,0.795252,-0.195830,0.319058
2815,SRR4449972,Salmonella,enterica,ciprofloxacin,Susceptible,0.015,0.0,42.726811,-3.745513,2.247876,...,-0.227895,-0.262304,-0.108402,0.401062,0.438727,-0.428125,-0.313163,-0.495947,-0.189048,-0.087641
2816,SRR4449973,Salmonella,enterica,ciprofloxacin,Susceptible,0.015,0.0,42.974373,-2.823206,2.175999,...,-0.559978,0.373316,-0.539559,-0.148558,-0.720063,0.105569,-0.339514,-0.395064,-0.037477,0.260920


In [21]:
# Candidate classifiers, keyed by the label that is later used in the results table
# and in the output file names. Insertion order is the evaluation order.
# NOTE(review): the saved output of this cell shows lbfgs convergence warnings that point
# at the logistic-regression docs, so max_iter=1000 was apparently not enough on these inputs.
_seed = 42

algorithms = dict([
    ("Random Forest (1200 trees)", RandomForestClassifier(random_state=_seed, n_estimators=1200)),
    ("Random Forest (500 trees)", RandomForestClassifier(random_state=_seed, n_estimators=500)),
    ("KNN (k=1)", KNeighborsClassifier(n_neighbors=1)),
    ("KNN (k=3)", KNeighborsClassifier(n_neighbors=3)),
    ("KNN (k=5)", KNeighborsClassifier(n_neighbors=5)),
    ("SVM (RBF Kernel)", SVC(C=1, kernel='rbf', gamma='auto', random_state=_seed)),
    ("SVM (Linear Kernel)", SVC(C=1, kernel='linear', gamma='auto', random_state=_seed)),
    ("SVM (Polynomial Kernel)", SVC(C=1, kernel='poly', gamma='auto', random_state=_seed)),
    (
        "MLP (1 Hidden Layer, 200 neurons)",
        MLPClassifier(hidden_layer_sizes=(200,), max_iter=1000, random_state=_seed),
    ),
    ("Logistic Regression", LogisticRegression(max_iter=1000, random_state=_seed)),
    ("Gaussian NB", GaussianNB()),
])

results = {}

# Define the output folder
output_folder = "/home/haydeeperuyero/camda24/hackaton/output_pca_ciprofloxacin"
os.makedirs(output_folder, exist_ok=True)

# The first N_META_COLS columns (accession, genus, species, antibiotic, phenotype,
# measurement_value) are metadata; every column after them is treated as a numeric feature.
N_META_COLS = 6

# Labelled rows are the training set; rows with a missing phenotype are the ones to predict.
df_training = df[df['phenotype'].isin(['Susceptible', 'Resistant'])]
df_test = df[df['phenotype'].isna()]

df_numeric_training = df_training.iloc[:, N_META_COLS:]
df_numeric_test = df_test.iloc[:, N_META_COLS:]

# Standardise the features; the scaler is fitted on the training rows only.
scaler = StandardScaler()
X_scaled_training = scaler.fit_transform(df_numeric_training)

# Keep the leading principal components that explain 95% of the variance.
# A single exact fit replaces the previous "fit everything, count, refit truncated" sequence:
# the truncated refit used svd_solver='auto' without a random_state, which may select the
# randomized solver and then yields slightly different components on every run.
pca = PCA(n_components=0.95, svd_solver="full")
X_pca_training = pca.fit_transform(X_scaled_training)

# Kept under their previous names for any later cell that reads them.
# cumulative_variance now covers only the retained components.
n_components = pca.n_components_
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# NOTE(review): scaler and PCA are fitted on every labelled row before cross-validation,
# so each validation fold has already influenced the preprocessing. If unbiased CV
# estimates are required, fit them inside each fold (e.g. with a Pipeline).

# Re-attach the metadata columns to the PCA scores, preserving the original row index.
X_pca_df_training = pd.DataFrame(X_pca_training, index=df_training.index)
df_pca_training = pd.concat([df_training.iloc[:, :N_META_COLS], X_pca_df_training], axis=1)

# Project the unlabelled rows with the scaler and PCA fitted on the training rows.
X_scaled_test = scaler.transform(df_numeric_test)
X_pca_test = pca.transform(X_scaled_test)
X_pca_df_test = pd.DataFrame(X_pca_test, index=df_test.index)
df_pca_test = pd.concat([df_test.iloc[:, :N_META_COLS], X_pca_df_test], axis=1)

# Model inputs: only the PCA component columns.
df_filtrado_training = df_pca_training.iloc[:, N_META_COLS:]
df_filtrado_test = df_pca_test.iloc[:, N_META_COLS:]

# Encode the phenotype labels as integers (classes are kept in label_encoder.classes_).
labels = df_pca_training['phenotype']
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Stratified 5-fold CV; the fixed random_state makes the shuffled folds reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# The CV splitter has a fixed random_state, so split() yields the same folds on every call.
# Materialise the validation indices once and reuse them for every model.
fold_test_indices = [
    test_idx for _, test_idx in cv.split(df_filtrado_training, encoded_labels)
]

for algo_name, algo in algorithms.items():
    # A single cross-validated pass gives one out-of-fold prediction per training row.
    # Per-fold accuracy/F1 and the confusion matrix are all derived from it, instead of
    # re-running the same 5-fold fit three times (2x cross_val_score + cross_val_predict).
    y_pred = cross_val_predict(algo, df_filtrado_training, encoded_labels, cv=cv)

    accuracy_scores = np.asarray([
        accuracy_score(encoded_labels[idx], y_pred[idx]) for idx in fold_test_indices
    ])
    f1_scores = np.asarray([
        f1_score(encoded_labels[idx], y_pred[idx], average='weighted')
        for idx in fold_test_indices
    ])
    cm = confusion_matrix(encoded_labels, y_pred)

    # Train the final model on all training data (cross_val_predict only fitted clones).
    algo.fit(df_filtrado_training, encoded_labels)

    # Save the results
    results[algo_name] = {
        'accuracy_mean': accuracy_scores.mean(),
        'accuracy_std': accuracy_scores.std(),
        'f1_score_mean': f1_scores.mean(),
        'f1_score_std': f1_scores.std(),
        'confusion_matrix': cm
    }

    # Plot and save the confusion matrix; the figure is closed so figures do not pile up.
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_, ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title(f'Confusion Matrix - {algo_name}')
    fig.savefig(os.path.join(output_folder, algo_name + "_confusion_matrix.png"))
    plt.close(fig)

    # Predict the unlabelled rows and map the integer classes back to phenotype names.
    test_predictions = algo.predict(df_filtrado_test)
    test_predictions_labels = label_encoder.inverse_transform(test_predictions)

    # Save the predictions to a CSV file
    test_predictions_df = pd.DataFrame({
        'genus': df_test.genus,
        'species': df_test.species,
        'accession': df_test.accession,
        'antibiotic': df_test.antibiotic,
        'status': test_predictions_labels
    })
    test_predictions_df.to_csv(os.path.join(output_folder, algo_name + "_test_predictions.csv"), index=False)

# Save overall results to CSV (one row per algorithm)
results_df = pd.DataFrame.from_dict(results, orient='index')
results_df.to_csv(os.path.join(output_folder, "overall_results_PCA.csv"))

results_df


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,accuracy_mean,accuracy_std,f1_score_mean,f1_score_std,confusion_matrix
Random Forest (1200 trees),0.966646,0.007546,0.962553,0.009855,"[[124, 88], [6, 2600]]"
Random Forest (500 trees),0.965226,0.006957,0.960867,0.009208,"[[121, 91], [7, 2599]]"
KNN (k=1),0.952094,0.009253,0.954228,0.008469,"[[167, 45], [90, 2516]]"
KNN (k=3),0.953159,0.004559,0.954928,0.004121,"[[165, 47], [85, 2521]]"
KNN (k=5),0.953158,0.005332,0.95474,0.005146,"[[163, 49], [83, 2523]]"
SVM (RBF Kernel),0.928672,0.003448,0.900533,0.006729,"[[16, 196], [5, 2601]]"
SVM (Linear Kernel),0.933285,0.007759,0.938458,0.00601,"[[162, 50], [138, 2468]]"
SVM (Polynomial Kernel),0.954221,0.007405,0.955808,0.006985,"[[165, 47], [82, 2524]]"
"MLP (1 Hidden Layer, 200 neurons)",0.946768,0.007638,0.946309,0.008209,"[[134, 78], [72, 2534]]"
Logistic Regression,0.931509,0.004189,0.937255,0.004127,"[[165, 47], [146, 2460]]"


# Pangenome Pseudomonas

In [28]:
# Load the Pseudomonas pangenome count table (tab-separated) and display it.
# NOTE(review): `pl` is not imported in the notebook's first import cell; presumably
# `import polars as pl` happens in an earlier cell -- confirm.
df_polar = pl.read_csv("/files/camda2024/resistence/git/DataSets/PangenomeCountPseudomonas.tsv", separator='\t')
df_polar

accession,genus,species,antibiotic,phenotype,measurement_value,GCA_000793885_CDS_2713,GCA_000793885_CDS_4409,GCA_000794965_CDS_3490,GCA_000794985_CDS_0460,GCA_000795565_CDS_1713,GCA_000795955_CDS_1420,GCA_000796165_CDS_5201,GCA_000797225_CDS_5220,GCA_000797225_CDS_5796,GCA_003000695_CDS_1742,GCA_003193645_CDS_1281,GCA_003193645_CDS_1345,GCA_003193645_CDS_2689,GCA_003324385_CDS_2972,GCA_003968125_CDS_3740,GCA_003968315_CDS_5323,GCA_003969475_CDS_2283,GCA_003969475_CDS_4811,GCA_003969695_CDS_0638,SRR4417542_CDS_2183,SRR1056051_CDS_3487,SRR1056051_CDS_3647,SRR1056051_CDS_3711,SRR1056051_CDS_4735,SRR1056051_CDS_4767,SRR1056051_CDS_5087,SRR1056051_CDS_5151,SRR1056051_CDS_5279,SRR1056051_CDS_5343,SRR1056071_CDS_3748,SRR1056071_CDS_3908,…,GCA_003204335_CDS_1546,GCA_003324275_CDS_0532,GCA_003324275_CDS_1076,GCA_003968015_CDS_5528,GCA_003968015_CDS_5560,GCA_003968015_CDS_5592,GCA_003968015_CDS_5624,GCA_003968015_CDS_5656,GCA_003968015_CDS_5784,GCA_003968015_CDS_5848,GCA_003968015_CDS_6264,GCA_003968015_CDS_6648,GCA_003968015_CDS_6904,GCA_003968445_CDS_5463,GCA_003968445_CDS_6039,GCA_003968445_CDS_6135,GCA_003968445_CDS_6615,GCA_003968475_CDS_0746,GCA_003968475_CDS_1226,GCA_003968475_CDS_1418,GCA_003968475_CDS_1450,GCA_003968475_CDS_1482,GCA_003969165_CDS_4601,GCA_003969165_CDS_4665,GCA_003969165_CDS_6041,GCA_003969165_CDS_6425,GCA_003969165_CDS_6649,GCA_003969165_CDS_6681,GCA_003969165_CDS_6713,GCA_003969365_CDS_5048,GCA_003969365_CDS_5176,GCA_003969365_CDS_5304,GCA_003969575_CDS_5708,GCA_003969785_CDS_4477,GCA_003977935_CDS_0006,GCF_001874795_CDS_4959,GCF_001874795_CDS_4991
str,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""GCA_000789485""","""Pseudomonas""","""aeruginosa""","""meropenem""","""Susceptible""",0.12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""GCA_000789495""","""Pseudomonas""","""aeruginosa""","""meropenem""","""Resistant""",8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""GCA_000789525""","""Pseudomonas""","""aeruginosa""","""meropenem""","""Resistant""",8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""GCA_000789535""","""Pseudomonas""","""aeruginosa""","""meropenem""","""Resistant""",32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""GCA_000789545""","""Pseudomonas""","""aeruginosa""","""meropenem""","""Susceptible""",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""SRR1056175""","""Pseudomonas""","""aeruginosa""","""meropenem""",,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
"""SRR1056177""","""Pseudomonas""","""aeruginosa""","""meropenem""",,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""SRR1056178""","""Pseudomonas""","""aeruginosa""","""meropenem""",,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""SRR1056179""","""Pseudomonas""","""aeruginosa""","""meropenem""",,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# Convert the Polars frame loaded in the previous cell to a pandas DataFrame.
# NOTE(review): this rebinds `df`, which earlier sections of the notebook use for other
# tables; cells re-run out of order will see whichever assignment ran last.
df = df_polar.to_pandas()
df

Unnamed: 0,accession,genus,species,antibiotic,phenotype,measurement_value,GCA_000793885_CDS_2713,GCA_000793885_CDS_4409,GCA_000794965_CDS_3490,GCA_000794985_CDS_0460,...,GCA_003969165_CDS_6681,GCA_003969165_CDS_6713,GCA_003969365_CDS_5048,GCA_003969365_CDS_5176,GCA_003969365_CDS_5304,GCA_003969575_CDS_5708,GCA_003969785_CDS_4477,GCA_003977935_CDS_0006,GCF_001874795_CDS_4959,GCF_001874795_CDS_4991
0,GCA_000789485,Pseudomonas,aeruginosa,meropenem,Susceptible,0.12,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,GCA_000789495,Pseudomonas,aeruginosa,meropenem,Resistant,8.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,GCA_000789525,Pseudomonas,aeruginosa,meropenem,Resistant,8.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,GCA_000789535,Pseudomonas,aeruginosa,meropenem,Resistant,32.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,GCA_000789545,Pseudomonas,aeruginosa,meropenem,Susceptible,1.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
647,SRR1056175,Pseudomonas,aeruginosa,meropenem,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
648,SRR1056177,Pseudomonas,aeruginosa,meropenem,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
649,SRR1056178,Pseudomonas,aeruginosa,meropenem,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
650,SRR1056179,Pseudomonas,aeruginosa,meropenem,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# Candidate classifiers, keyed by the label that is later used in the results table
# and in the output file names. Insertion order is the evaluation order.
_seed = 42

algorithms = dict([
    ("Random Forest (1200 trees)", RandomForestClassifier(random_state=_seed, n_estimators=1200)),
    ("Random Forest (500 trees)", RandomForestClassifier(random_state=_seed, n_estimators=500)),
    ("KNN (k=1)", KNeighborsClassifier(n_neighbors=1)),
    ("KNN (k=3)", KNeighborsClassifier(n_neighbors=3)),
    ("KNN (k=5)", KNeighborsClassifier(n_neighbors=5)),
    ("SVM (RBF Kernel)", SVC(C=1, kernel='rbf', gamma='auto', random_state=_seed)),
    ("SVM (Linear Kernel)", SVC(C=1, kernel='linear', gamma='auto', random_state=_seed)),
    ("SVM (Polynomial Kernel)", SVC(C=1, kernel='poly', gamma='auto', random_state=_seed)),
    (
        "MLP (1 Hidden Layer, 200 neurons)",
        MLPClassifier(hidden_layer_sizes=(200,), max_iter=1000, random_state=_seed),
    ),
    ("Logistic Regression", LogisticRegression(max_iter=1000, random_state=_seed)),
    ("Gaussian NB", GaussianNB()),
])

results = {}

# Define the output folder
output_folder = "/home/haydeeperuyero/camda24/hackaton/output_pca_Pseudomonas"
os.makedirs(output_folder, exist_ok=True)

# The first N_META_COLS columns (accession, genus, species, antibiotic, phenotype,
# measurement_value) are metadata; every column after them is treated as a numeric feature.
N_META_COLS = 6

# Labelled rows are the training set; rows with a missing phenotype are the ones to predict.
df_training = df[df['phenotype'].isin(['Susceptible', 'Resistant'])]
df_test = df[df['phenotype'].isna()]

df_numeric_training = df_training.iloc[:, N_META_COLS:]
df_numeric_test = df_test.iloc[:, N_META_COLS:]

# Standardise the features; the scaler is fitted on the training rows only.
scaler = StandardScaler()
X_scaled_training = scaler.fit_transform(df_numeric_training)

# Keep the leading principal components that explain 95% of the variance.
# A single exact fit replaces the previous "fit everything, count, refit truncated" sequence:
# the truncated refit used svd_solver='auto' without a random_state, which may select the
# randomized solver and then yields slightly different components on every run.
pca = PCA(n_components=0.95, svd_solver="full")
X_pca_training = pca.fit_transform(X_scaled_training)

# Kept under their previous names for any later cell that reads them.
# cumulative_variance now covers only the retained components.
n_components = pca.n_components_
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# NOTE(review): scaler and PCA are fitted on every labelled row before cross-validation,
# so each validation fold has already influenced the preprocessing. If unbiased CV
# estimates are required, fit them inside each fold (e.g. with a Pipeline).

# Re-attach the metadata columns to the PCA scores, preserving the original row index.
X_pca_df_training = pd.DataFrame(X_pca_training, index=df_training.index)
df_pca_training = pd.concat([df_training.iloc[:, :N_META_COLS], X_pca_df_training], axis=1)

# Project the unlabelled rows with the scaler and PCA fitted on the training rows.
X_scaled_test = scaler.transform(df_numeric_test)
X_pca_test = pca.transform(X_scaled_test)
X_pca_df_test = pd.DataFrame(X_pca_test, index=df_test.index)
df_pca_test = pd.concat([df_test.iloc[:, :N_META_COLS], X_pca_df_test], axis=1)

# Model inputs: only the PCA component columns.
df_filtrado_training = df_pca_training.iloc[:, N_META_COLS:]
df_filtrado_test = df_pca_test.iloc[:, N_META_COLS:]

# Encode the phenotype labels as integers (classes are kept in label_encoder.classes_).
labels = df_pca_training['phenotype']
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Stratified 5-fold CV; the fixed random_state makes the shuffled folds reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# The CV splitter has a fixed random_state, so split() yields the same folds on every call.
# Materialise the validation indices once and reuse them for every model.
fold_test_indices = [
    test_idx for _, test_idx in cv.split(df_filtrado_training, encoded_labels)
]

for algo_name, algo in algorithms.items():
    # A single cross-validated pass gives one out-of-fold prediction per training row.
    # Per-fold accuracy/F1 and the confusion matrix are all derived from it, instead of
    # re-running the same 5-fold fit three times (2x cross_val_score + cross_val_predict).
    y_pred = cross_val_predict(algo, df_filtrado_training, encoded_labels, cv=cv)

    accuracy_scores = np.asarray([
        accuracy_score(encoded_labels[idx], y_pred[idx]) for idx in fold_test_indices
    ])
    f1_scores = np.asarray([
        f1_score(encoded_labels[idx], y_pred[idx], average='weighted')
        for idx in fold_test_indices
    ])
    cm = confusion_matrix(encoded_labels, y_pred)

    # Train the final model on all training data (cross_val_predict only fitted clones).
    algo.fit(df_filtrado_training, encoded_labels)

    # Save the results
    results[algo_name] = {
        'accuracy_mean': accuracy_scores.mean(),
        'accuracy_std': accuracy_scores.std(),
        'f1_score_mean': f1_scores.mean(),
        'f1_score_std': f1_scores.std(),
        'confusion_matrix': cm
    }

    # Plot and save the confusion matrix; the figure is closed so figures do not pile up.
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_, ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title(f'Confusion Matrix - {algo_name}')
    fig.savefig(os.path.join(output_folder, algo_name + "_confusion_matrix.png"))
    plt.close(fig)

    # Predict the unlabelled rows and map the integer classes back to phenotype names.
    test_predictions = algo.predict(df_filtrado_test)
    test_predictions_labels = label_encoder.inverse_transform(test_predictions)

    # Save the predictions to a CSV file
    test_predictions_df = pd.DataFrame({
        'genus': df_test.genus,
        'species': df_test.species,
        'accession': df_test.accession,
        'antibiotic': df_test.antibiotic,
        'status': test_predictions_labels
    })
    test_predictions_df.to_csv(os.path.join(output_folder, algo_name + "_test_predictions.csv"), index=False)

# Save overall results to CSV (one row per algorithm)
results_df = pd.DataFrame.from_dict(results, orient='index')
results_df.to_csv(os.path.join(output_folder, "overall_results_PCA.csv"))

results_df


Unnamed: 0,accuracy_mean,accuracy_std,f1_score_mean,f1_score_std,confusion_matrix
Random Forest (1200 trees),0.649238,0.035392,0.647184,0.035477,"[[150, 111], [77, 198]]"
Random Forest (500 trees),0.656663,0.035129,0.655172,0.034868,"[[155, 106], [78, 197]]"
KNN (k=1),0.634233,0.033994,0.632234,0.032011,"[[165, 96], [100, 175]]"
KNN (k=3),0.6099,0.049573,0.604973,0.044224,"[[157, 104], [105, 170]]"
KNN (k=5),0.645552,0.043561,0.631544,0.049669,"[[123, 138], [52, 223]]"
SVM (RBF Kernel),0.535462,0.013114,0.409485,0.016289,"[[17, 244], [5, 270]]"
SVM (Linear Kernel),0.555988,0.025797,0.552151,0.027314,"[[143, 118], [120, 155]]"
SVM (Polynomial Kernel),0.552111,0.050403,0.515751,0.045807,"[[184, 77], [163, 112]]"
"MLP (1 Hidden Layer, 200 neurons)",0.572655,0.054045,0.571104,0.054264,"[[139, 122], [107, 168]]"
Logistic Regression,0.621288,0.008598,0.621153,0.008749,"[[159, 102], [101, 174]]"
