<a href="https://colab.research.google.com/github/amar9929-3/Bioinformatics_Project/blob/main/Notebooks/ML_Christian.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Colab Setup and Imports

In [None]:
# If running on Colab, install anything missing
try:
    import umap
except Exception:
    %pip -q install umap-learn scikit-learn matplotlib pandas numpy

In [None]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, RocCurveDisplay, ConfusionMatrixDisplay, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder

# One seed shared by the stochastic steps of the notebook (UMAP, t-SNE,
# the classifiers and the CV splitter all receive RANDOM_STATE explicitly).
RANDOM_STATE = 42
# Also seed NumPy's legacy global generator.
np.random.seed(seed=RANDOM_STATE)

## Simulated Data

In [None]:
# Load the simulated data set into `df`.
#
# An upload is requested only when the CSV is not already in the working
# directory AND the Colab helper is importable, so the cell also runs outside
# Colab and on a re-run does not prompt again. To force a fresh upload,
# delete the existing file first.
import os

import pandas as pd

SIM_CSV = 'Sim_Anne.csv'

if not os.path.exists(SIM_CSV):
    try:
        from google.colab import files
    except ImportError:
        # Not on Colab: the file has to be placed next to the notebook;
        # read_csv below raises FileNotFoundError if it is missing.
        pass
    else:
        uploaded = files.upload()

df = pd.read_csv(SIM_CSV)
df.head()

## Normalizing Data

In [None]:
# Build `df_log_scaled`: a copy of `df` whose int64/float64 columns are
# log(x + 1)-transformed and then z-scored; all other columns are unchanged.
# NOTE(review): no later cell visible in this notebook reads `df_log_scaled`
# (the split is taken from `df`) -- confirm whether that is intended.
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

# log(x + 1) of the numeric columns
log_values = np.log1p(df[numeric_cols])

# Z-score each log-transformed column (mean and std taken over all rows)
scaler = StandardScaler()
z_values = scaler.fit_transform(log_values)

# Write the transformed values into a copy so `df` itself stays untouched
df_log_scaled = df.copy()
df_log_scaled[numeric_cols] = z_values

df_log_scaled.head()

## Train/Test Split

In [None]:
# Train/test split of the simulated data.
#   X: every column of `df` except the ones listed below
#   y: the 'Community' column (classification target)
import warnings

X = df.drop(['@#', 'River', 'Population', 'Community', 'Lengthmm', 'Weightg'], axis=1)
y = df['Community']

# Use the notebook-wide seed instead of a repeated literal.
split_kwargs = dict(test_size=0.15, random_state=RANDOM_STATE)

# Stratify so the 15% test set keeps the class proportions of `y`; this
# matches the StratifiedKFold used for tuning and keeps classes from
# disappearing from the test set.
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, **split_kwargs
    )
except ValueError as err:
    # Raised e.g. when a class has too few members to be stratified;
    # fall back to a plain random split rather than failing.
    warnings.warn(f"Stratified split not possible ({err}); using an unstratified split.")
    X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

# Running Unsupervised Methods for Visualization

## PCA

In [None]:
# 2-D PCA view of the training features, coloured by Community.
# Leaves `X`, `X_scaled`, `le` and `y_numeric` defined for the cells below.

# Integer-encode the class labels for colouring
le = LabelEncoder()
y_numeric = le.fit_transform(y_train)
n_classes = len(le.classes_)

# One-hot encode any categorical feature columns, then standardize
X = pd.get_dummies(X_train, drop_first=True)
X_scaled = StandardScaler().fit_transform(X)

# Project onto the first two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Scatter plot with one colour-bar tick per class
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=y_numeric, cmap='tab10', s=20)
fig.colorbar(points, ax=ax, ticks=range(n_classes), label='Community')
# Colour limits at half-integers put each integer label in the middle of its band
points.set_clim(-0.5, n_classes - 0.5)
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_title("PCA Projection by Community")
plt.show()

## UMAP

In [None]:
# UMAP embedding of the training set, coloured by Community.
# Uses `X_scaled`, `y_numeric` and `le` from the PCA cell.
#
# The embedding is fitted on the standardized matrix `X_scaled` (not the raw
# `X`), so that features with large numeric ranges do not dominate the
# neighbour distances and the view is consistent with the PCA cell and with
# the final UMAP cell, which both standardize first.
um = umap.UMAP(n_components=2, random_state=RANDOM_STATE, n_neighbors=15, min_dist=0.1)
UM = um.fit_transform(X_scaled)

# Plot
n_classes = len(le.classes_)
fig, ax = plt.subplots()
scatter = ax.scatter(UM[:, 0], UM[:, 1], c=y_numeric, cmap='tab10', s=10)
fig.colorbar(scatter, ax=ax, ticks=range(n_classes), label='Community')
# Same colour limits as the PCA plot, so each class keeps the same colour
# across figures and every colour-bar tick sits in the middle of its band.
scatter.set_clim(-0.5, n_classes - 0.5)
ax.set_title("UMAP (train set)")
ax.set_xlabel("UMAP1")
ax.set_ylabel("UMAP2")
plt.show()

## t-SNE

In [None]:
# t-SNE embedding of the training set, coloured by Community.
# Uses `X_scaled`, `y_numeric` and `le` from the PCA cell.
#
# Fitted on the standardized matrix `X_scaled` (not the raw `X`) so the
# pairwise distances are not dominated by large-valued features, consistent
# with the PCA cell.
# scikit-learn requires perplexity < n_samples; cap it so a small training
# set does not raise. With more than 30 samples this is the original value 30.
perplexity = min(30, X_scaled.shape[0] - 1)
ts = TSNE(n_components=2, random_state=RANDOM_STATE, init="pca", learning_rate="auto", perplexity=perplexity)
TS = ts.fit_transform(X_scaled)

# Plot
n_classes = len(le.classes_)
fig, ax = plt.subplots()
points = ax.scatter(TS[:, 0], TS[:, 1], c=y_numeric, cmap='tab10', s=5)
fig.colorbar(points, ax=ax, ticks=range(n_classes), label='Community')
# Same colour limits as the PCA plot, so each class keeps the same colour
# across figures and every colour-bar tick sits in the middle of its band.
points.set_clim(-0.5, n_classes - 0.5)
ax.set_title("t-SNE (train set)")
ax.set_xlabel("tSNE1")
ax.set_ylabel("tSNE2")
plt.show()

# Training Supervised Methods on Simulated Data

In [None]:
# Fit three baseline classifiers on the training split and score them on the
# held-out test split (accuracy and one-vs-rest ROC AUC).
# Requires RANDOM_STATE, X_train, X_test, y_train, y_test from earlier cells.
# Leaves `models` (fitted pipelines), `results` and `results_df` defined.
import warnings

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score
import pandas as pd
import numpy as np

# LogReg and SVM get a StandardScaler step inside the pipeline, so the scaler
# is fitted on training data only. `multi_class` is no longer passed: "auto"
# was the default, and the parameter is deprecated since scikit-learn 1.5.
models = {
    "LogReg": Pipeline([("scaler", StandardScaler()),
                        ("clf", LogisticRegression(max_iter=500, random_state=RANDOM_STATE))]),
    "RF":     Pipeline([("clf", RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE))]),
    "SVM":    Pipeline([("scaler", StandardScaler()),
                        ("clf", SVC(kernel="rbf", probability=True, random_state=RANDOM_STATE))]),
}


results = {}
for name, pipe in models.items():
    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_test)
    proba = pipe.predict_proba(X_test)
    acc = accuracy_score(y_test, pred)
    try:
        if proba.shape[1] == 2:
            # Two classes: roc_auc_score expects the 1-D score of the class
            # with the greater label and raises for a 2-column matrix, which
            # previously ended up as a silent NaN.
            auc = roc_auc_score(y_test, proba[:, 1])
        else:
            # `labels` ties the probability columns to the classes seen in training.
            auc = roc_auc_score(y_test, proba, multi_class="ovr", labels=pipe.classes_)
    except ValueError as err:
        # AUC can be undefined (e.g. a class is absent from y_test);
        # record NaN but say why instead of hiding the reason.
        warnings.warn(f"{name}: ROC AUC could not be computed ({err}); recording NaN.")
        auc = np.nan
    results[name] = {"accuracy": acc, "roc_auc_ovr": auc}

# One row per model; the last expression gives the notebook's rich table display
results_df = pd.DataFrame(results).T
results_df

# Cross-Validate Training

In [None]:
# Tune each pipeline in `models` with a 5-fold stratified grid search on the
# training split, scored by accuracy. Results are collected in `tuned`:
#   tuned[name] = {"best_score", "best_params", "estimator"}

# Shuffled, seeded folds so the search is repeatable
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Search space per model; keys address the "clf" step of each pipeline
param_grid = {
    "LogReg": {"clf__C": [0.1, 1, 10]},
    "RF": {
        "clf__n_estimators": [200, 400],
        "clf__max_depth": [None, 10],
    },
    "SVM": {
        "clf__C": [0.5, 1, 2],
        "clf__gamma": ["scale", "auto"],
    },
}

tuned = {}
for name, pipe in models.items():
    grid = GridSearchCV(
        estimator=pipe,
        param_grid=param_grid[name],
        cv=cv,
        scoring="accuracy",
        n_jobs=-1,
    )
    grid.fit(X_train, y_train)
    tuned[name] = dict(
        best_score=grid.best_score_,
        best_params=grid.best_params_,
        estimator=grid.best_estimator_,
    )

# Summary table: one row per model with its best CV accuracy and chosen parameters
pd.DataFrame({
    name: dict(cv_best_acc=info["best_score"], **info["best_params"])
    for name, info in tuned.items()
}).T

# Final Analysis with Most Accurate Classifier

## Real Data

In [None]:
# Load the real data set into `df` (this replaces the simulated frame that
# `df` referred to until now).
#
# An upload is requested only when the CSV is not already in the working
# directory AND the Colab helper is importable, so the cell also runs outside
# Colab and on a re-run does not prompt again. To force a fresh upload,
# delete the existing file first.
import os

import pandas as pd

REAL_CSV = 'Anne.csv'

if not os.path.exists(REAL_CSV):
    try:
        from google.colab import files
    except ImportError:
        # Not on Colab: the file has to be placed next to the notebook;
        # read_csv below raises FileNotFoundError if it is missing.
        pass
    else:
        uploaded = files.upload()

df = pd.read_csv(REAL_CSV)
df.head()

## Normalizing Real Data

In [None]:
# Rebuild `df_log_scaled` for the data currently in `df`: int64/float64
# columns are log(x + 1)-transformed and z-scored, other columns are copied
# unchanged.
# NOTE(review): no later cell visible in this notebook reads `df_log_scaled`
# (the split is taken from `df`) -- confirm whether that is intended.
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Step 1: log(x + 1)
logged = np.log1p(df[numeric_cols])

# Step 2: z-score every log-transformed column over all rows
scaler = StandardScaler()
standardized = scaler.fit_transform(logged)

# Step 3: store the result in a copy, leaving `df` as loaded
df_log_scaled = df.copy()
df_log_scaled[numeric_cols] = standardized

df_log_scaled.head()

## Train/Test Split (Real Data)

In [None]:
# Train/test split of the data currently held in `df`.
#   X: every column of `df` except the ones listed below
#   y: the 'Community' column (classification target)
import warnings

X = df.drop(['@#', 'River', 'Population', 'Community', 'Lengthmm', 'Weightg'], axis=1)
y = df['Community']

# Use the notebook-wide seed instead of a repeated literal.
split_kwargs = dict(test_size=0.15, random_state=RANDOM_STATE)

# Stratify so the 15% test set keeps the class proportions of `y` and
# classes do not disappear from the test set.
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, **split_kwargs
    )
except ValueError as err:
    # Raised e.g. when a class has too few members to be stratified;
    # fall back to a plain random split rather than failing.
    warnings.warn(f"Stratified split not possible ({err}); using an unstratified split.")
    X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

## Classifier Name

In [None]:
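# TODO: choose the most accurate classifier (e.g. the entry of `tuned` with the highest "best_score") and use it for the final analysis; this cell is still a placeholder.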

## Final UMAP

In [None]:
# --- UMAP plot colored by category (train fit only) ---
#
# Standardizes and fits UMAP on the training split only, projects the test
# split with the fitted model, and draws both in one figure:
#   filled markers = train points, hollow markers = test points.
# A class uses the same colour for its train and test markers, so the
# "test: <class>" legend entries can be told apart (previously every test
# point was drawn with the same black outline).

import numpy as np
import matplotlib.pyplot as plt
import umap  # no-op when the module is already loaded

# 1) Standardize using training data only (to avoid leakage)
scaler_vis = StandardScaler().fit(X_train)
Xtr_z = scaler_vis.transform(X_train)
Xte_z = scaler_vis.transform(X_test)

# 2) Fit UMAP on TRAIN, then transform TEST
um_vis = umap.UMAP(n_components=2, random_state=RANDOM_STATE, n_neighbors=15, min_dist=0.1)
UM_tr = um_vis.fit_transform(Xtr_z)
UM_te = um_vis.transform(Xte_z)

# 3) One colour per class. Plain arrays are used for the labels so the boolean
#    masks below line up positionally with the rows of UM_tr / UM_te.
y_tr = np.asarray(y_train)
y_te = np.asarray(y_test)
classes = np.unique(y_tr)
test_classes = np.unique(y_te)

# Train classes come first (same order as before); classes that occur only in
# the test split are appended so they still get a colour of their own.
palette = plt.get_cmap("tab10")
ordered_classes = list(classes) + list(np.setdiff1d(test_classes, classes))
class_color = {c: palette(i % palette.N) for i, c in enumerate(ordered_classes)}

fig, ax = plt.subplots(figsize=(6, 5))

for c in classes:
    idx_tr = (y_tr == c)
    ax.scatter(UM_tr[idx_tr, 0], UM_tr[idx_tr, 1],
               s=18, alpha=0.85, color=class_color[c], label=f"train: {c}")

# Overlay the test set with hollow markers outlined in the class colour
for c in test_classes:
    idx_te = (y_te == c)
    ax.scatter(UM_te[idx_te, 0], UM_te[idx_te, 1],
               s=36, alpha=0.85, facecolors='none', edgecolors=class_color[c], linewidths=0.7,
               label=f"test: {c}")

ax.set_title("UMAP embedding colored by category\n(fit on train, transformed test)")
ax.set_xlabel("UMAP1")
ax.set_ylabel("UMAP2")
ax.legend(loc="best", fontsize=8, ncol=2)
fig.tight_layout()
plt.show()