In [1]:
import os
from pathlib import Path

In [2]:
yeast_names_path = os.path.join("yeast_names.txt")
yeast_data_path = os.path.join("yeast_data.txt")

In [3]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler, label_binarize
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, classification_report,
    confusion_matrix, f1_score,
    roc_curve, auc
)
from sklearn.pipeline import Pipeline

# Models
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
)
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA

# Ensure output directory
os.makedirs("plots", exist_ok=True)

# --- 1. LOAD DATA ---
YEAST_PATH = yeast_data_path  # adjust as needed
columns = [
    "Sequence_Name", "mcg", "gvh", "alm",
    "mit", "erl", "pox", "vac", "nuc",
    "Localization_Site"
]
df = pd.read_csv(YEAST_PATH, sep=r"\s+", names=columns)

# --- 2. ENCODE TARGET ---
X = df.drop(columns=["Sequence_Name", "Localization_Site"] )
y = df["Localization_Site"]
le = LabelEncoder()
y_enc = le.fit_transform(y)

# --- 3. SPLIT DATA ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.2, random_state=42, stratify=y_enc
)

# --- 4. BASIC EDA (save plots) ---
# Class distribution
plt.figure(figsize=(8,4))
sns.countplot(x=y, order=y.value_counts().index)
plt.title("Class Distribution")
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig("plots/class_distribution.png")
plt.close()

# Feature distributions
plt.figure(figsize=(10,6))
X.hist(bins=15, figsize=(10,6))
plt.suptitle("Feature Distributions")
plt.tight_layout()
plt.savefig("plots/feature_distributions.png")
plt.close()

# Correlation matrix
plt.figure(figsize=(8,6))
sns.heatmap(X.corr(), annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Feature Correlation Matrix")
plt.tight_layout()
plt.savefig("plots/feature_correlation.png")
plt.close()

# PCA scatter
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
pca = PCA(n_components=2, random_state=42)
comps = pca.fit_transform(X_scaled)
plt.figure(figsize=(8,6))
sns.scatterplot(x=comps[:,0], y=comps[:,1], hue=y, palette='tab10', legend=False)
plt.title("PCA (2 Components)")
plt.tight_layout()
plt.savefig("plots/pca_scatter.png")
plt.close()

# --- 5. DEFINE MODELS ---
seed = 42
models = {
    "RandomForest": RandomForestClassifier(
        n_estimators=1500, random_state=seed, class_weight="balanced"),
    "GradientBoosting": GradientBoostingClassifier(random_state=seed),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=seed, tree_method='hist', n_jobs=-1,         predictor="cpu_predictor",),
    "AdaBoost": AdaBoostClassifier(random_state=seed),
    "SVM": SVC(probability=True, random_state=seed, class_weight="balanced"),
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=seed, class_weight="balanced"),
    "KNN": KNeighborsClassifier(),
    "NaiveBayes": GaussianNB()
}

# --- 6. TRAIN & PREDICT ---
results = []
for name, model in models.items():
    print(f"=== {name} ===")
    pipe = Pipeline([("scaler", StandardScaler()), ("clf", model)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")
    print(f"Accuracy: {acc:.4f}, F1: {f1:.4f}\n")
    results.append({"Model": name, "Accuracy": acc, "F1": f1, "Pipeline": pipe})

# --- 7. METRICS & COMPARISON ---
res_df = pd.DataFrame(results).sort_values("F1", ascending=False)
print("\nModel Comparison:\n", res_df[["Model","Accuracy","F1"]])

# Save comparison plots
plt.figure(figsize=(8,4))
sns.barplot(x="F1", y="Model", data=res_df, palette="vlag")
plt.title("Model F1 Comparison")
plt.tight_layout()
plt.savefig("plots/model_f1_comparison.png")
plt.close()

plt.figure(figsize=(8,4))
sns.barplot(x="Accuracy", y="Model", data=res_df, palette="rocket")
plt.title("Model Accuracy Comparison")
plt.tight_layout()
plt.savefig("plots/model_accuracy_comparison.png")
plt.close()

# --- 8. CONFUSION & ROC for BEST MODEL ---
best = res_df.iloc[0]
best_name, best_pipe = best["Model"], best["Pipeline"]
y_best = best_pipe.predict(X_test)

# Confusion matrix
cm = confusion_matrix(y_test, y_best)
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt="d", xticklabels=le.classes_, yticklabels=le.classes_)
plt.title(f"Confusion Matrix: {best_name}")
plt.xlabel("Predicted"); plt.ylabel("True")
plt.tight_layout()
plt.savefig("plots/confusion_matrix_best.png")
plt.close()

# ROC curves (multiclass)
y_test_bin = label_binarize(y_test, classes=range(len(le.classes_)))
y_score = best_pipe.predict_proba(X_test)

fpr = dict(); tpr = dict(); roc_auc = dict()
for i in range(len(le.classes_)):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Macro-average
all_fpr = sorted({x for vals in fpr.values() for x in vals})
mean_tpr = np.zeros_like(all_fpr)
for i in fpr:
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
mean_tpr /= len(le.classes_)
fpr["macro"], tpr["macro"] = all_fpr, mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

plt.figure(figsize=(8,6))
for i, cls in enumerate(le.classes_):
    plt.plot(fpr[i], tpr[i], lw=1,
             label=f"{cls} (AUC = {roc_auc[i]:.2f})")
plt.plot(fpr["macro"], tpr["macro"], color='navy', lw=2,
         label=f"macro-average (AUC = {roc_auc['macro']:.2f})")
plt.plot([0,1], [0,1], linestyle='--', lw=1, color='gray')
plt.xlim([0.0,1.0]); plt.ylim([0.0,1.05])
plt.xlabel('False Positive Rate'); plt.ylabel('True Positive Rate')
plt.title(f"ROC Curves: {best_name}")
plt.legend(loc='lower right', fontsize='small')
plt.tight_layout()
plt.savefig("plots/roc_curves_best.png")
plt.close()

# --- 9. SAVE BEST MODEL ---
with open("best_model.pkl", "wb") as f:
    pickle.dump(best_pipe, f)
print(f"Best model ({best_name}) saved to best_model.pkl")


=== RandomForest ===
Accuracy: 0.6364, F1: 0.6291

=== GradientBoosting ===
Accuracy: 0.5926, F1: 0.5879

=== XGBoost ===


Parameters: { "predictor", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.5791, F1: 0.5724

=== AdaBoost ===
Accuracy: 0.4209, F1: 0.3987

=== SVM ===
Accuracy: 0.5892, F1: 0.5939

=== LogisticRegression ===
Accuracy: 0.5253, F1: 0.5412

=== KNN ===
Accuracy: 0.5589, F1: 0.5510

=== NaiveBayes ===
Accuracy: 0.1279, F1: 0.1405


Model Comparison:
                 Model  Accuracy        F1
0        RandomForest  0.636364  0.629075
4                 SVM  0.589226  0.593859
1    GradientBoosting  0.592593  0.587898
2             XGBoost  0.579125  0.572432
6                 KNN  0.558923  0.550994
5  LogisticRegression  0.525253  0.541239
3            AdaBoost  0.420875  0.398721
7          NaiveBayes  0.127946  0.140455



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x="F1", y="Model", data=res_df, palette="vlag")

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x="Accuracy", y="Model", data=res_df, palette="rocket")


Best model (RandomForest) saved to best_model.pkl


<Figure size 1000x600 with 0 Axes>

In [17]:
# --- 10. INFERENCE FUNCTIONS ---
from typing import List

def predict_from_input(feature_list: List[List[float]]) -> List[str]:
    """
    Predict localization sites for provided feature vectors.

    Args:
        feature_list: List of feature lists in the order [mcg, gvh, alm, mit, erl, pox, vac, nuc].
    Returns:
        List of predicted localization site labels.
    """
    # Load model and label encoder
    with open("best_model.pkl", "rb") as f:
        model = pickle.load(f)
    preds_enc = model.predict(feature_list)
    return le.inverse_transform(preds_enc)

import random


def predict_random(n: int = 1) -> pd.DataFrame:
    """
    Sample n random rows from the original dataset and predict their localization.

    Args:
        n: Number of random samples to predict.
    Returns:
        DataFrame containing original features, true labels, and predicted labels.
    """
    # draw n different random samples each call
    samples = df.sample(n=n).reset_index(drop=True)
    X_rand = samples.drop(columns=["Sequence_Name", "Localization_Site"])
    preds_enc = best_pipe.predict(X_rand)
    samples["Predicted_Site"] = le.inverse_transform(preds_enc)
    return samples

In [20]:
# # Example usages:
# # 1) Custom input prediction:
# custom = [[0.58, 0.61, 0.47, 0.13, 0.50, 0.00, 0.48, 0.22]]
# print(predict_from_input(custom))

# 2) Random sampling prediction:
print(predict_random(n=2))


  Sequence_Name   mcg  gvh   alm   mit  erl  pox   vac   nuc  \
0    NAB1_YEAST  0.49  0.6  0.55  0.17  0.5  0.0  0.44  0.22   
1    DBF4_YEAST  0.35  0.4  0.59  0.22  0.5  0.0  0.47  0.37   

  Localization_Site Predicted_Site  
0               CYT            CYT  
1               NUC            NUC  
