In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [3]:
# Expression matrix; the first CSV column becomes the row index.
X = pd.read_csv("../../RNAseq_with_HGNC_symbols1.csv", index_col=0)

# Class labels as a plain array taken from the "Class" column.
# NOTE(review): y is paired with X purely by row position -- confirm both
# files list the samples in the same order.
labels_df = pd.read_csv("../../labels.csv", index_col=0)
y = labels_df.loc[:, "Class"].values

# Shuffled, stratified 5-fold splitter with a fixed seed ("same CV as before").
cv = StratifiedKFold(5, shuffle=True, random_state=42)

In [4]:
# Variance-filtered genes: one CSV holding a "fold_<n>" column per CV fold.
var_k = 200
var_df = pd.read_csv(f"../../Output/variance_filtering_results/variance_top_genes_k{var_k}.csv")

# Map 1-based fold number -> gene list for that fold (NaN cells dropped).
variance_genes = {
    fold: var_df[f"fold_{fold}"].dropna().tolist()
    for fold in range(1, 6)
}

In [5]:
# Mutual-information gene lists: one CSV per fold, genes read from the first column.
mi_genes = {}
for fold in range(1, 6):
    mi_path = f"../../Output/mutual_information_results/selected_features_fold_{fold}.csv"
    first_column = pd.read_csv(mi_path).iloc[:, 0]
    genes = first_column.dropna().tolist()
    mi_genes[fold] = genes

In [6]:
# LASSO (L1) gene lists: one CSV per fold, genes read from the first column.
l1_genes = {}
for fold in range(1, 6):
    l1_path = f"../../Output/l1_results/selected_genes_fold_{fold}.csv"
    first_column = pd.read_csv(l1_path).iloc[:, 0]
    genes = first_column.dropna().tolist()
    l1_genes[fold] = genes

In [7]:
# Registry of selection methods: method name -> {fold number -> gene list}.
# Insertion order determines the order in which methods are evaluated.
methods = {}
methods[f"variance_k{var_k}"] = variance_genes
methods["mutual_information"] = mi_genes
methods["lasso_l1"] = l1_genes

In [8]:
# Fit and score a random forest per selection method and per CV fold,
# restricting the features on each fold to that fold's gene list.
results = []

for method_name, fold_to_genes in methods.items():
    fold_accuracies = []

    # Fold numbers are 1-based to match the keys of the gene-list dicts.
    # NOTE(review): presumably the gene lists were derived on these same folds -- confirm.
    for fold_idx, (train_idx, test_idx) in enumerate(cv.split(X, y), start=1):
        # Keep only the selected genes that are actual columns of X.
        genes = [g for g in fold_to_genes[fold_idx] if g in X.columns]

        # Select the columns once, then slice rows for train and test.
        X_sel = X[genes]
        X_train_sel, X_test_sel = X_sel.iloc[train_idx], X_sel.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        rf = RandomForestClassifier(n_estimators=500, max_depth=None, n_jobs=-1, random_state=42)
        y_pred = rf.fit(X_train_sel, y_train).predict(X_test_sel)

        acc = accuracy_score(y_test, y_pred)
        fold_accuracies.append(acc)
        print(f"{method_name} | Fold {fold_idx}: {len(genes)} genes, acc = {acc:.3f}")

    # Per-method summary across folds (np.std default, i.e. population std).
    mean_acc, std_acc = np.mean(fold_accuracies), np.std(fold_accuracies)
    print(f"\n{method_name}: mean acc = {mean_acc:.3f} ± {std_acc:.3f}\n")

    results.append(dict(method=method_name, mean_accuracy=mean_acc, std_accuracy=std_acc))

# Summary table: one row per method (printed only; nothing is written to disk here).
results_df = pd.DataFrame(results)
print(results_df)

variance_k200 | Fold 1: 200 genes, acc = 1.000
variance_k200 | Fold 2: 200 genes, acc = 1.000
variance_k200 | Fold 3: 200 genes, acc = 0.994
variance_k200 | Fold 4: 200 genes, acc = 1.000
variance_k200 | Fold 5: 200 genes, acc = 0.988

variance_k200: mean acc = 0.996 ± 0.005

mutual_information | Fold 1: 200 genes, acc = 1.000
mutual_information | Fold 2: 200 genes, acc = 1.000
mutual_information | Fold 3: 200 genes, acc = 0.994
mutual_information | Fold 4: 200 genes, acc = 1.000
mutual_information | Fold 5: 200 genes, acc = 0.994

mutual_information: mean acc = 0.997 ± 0.003

lasso_l1 | Fold 1: 211 genes, acc = 1.000
lasso_l1 | Fold 2: 200 genes, acc = 1.000
lasso_l1 | Fold 3: 205 genes, acc = 0.994
lasso_l1 | Fold 4: 202 genes, acc = 1.000
lasso_l1 | Fold 5: 192 genes, acc = 0.994

lasso_l1: mean acc = 0.997 ± 0.003

               method  mean_accuracy  std_accuracy
0       variance_k200        0.99625      0.005000
1  mutual_information        0.99750      0.003062
2            las

In [9]:
# Negative control: permute the labels and repeat the per-method evaluation.
# The permutation is drawn from a seeded generator so that this control gives
# the same numbers on every Restart & Run All (it was previously unseeded).
y_shuffled = np.random.default_rng(42).permutation(y)

# The gene lists in `methods` are reused as loaded from disk; they are not
# re-derived from the shuffled labels, so this control exercises only the
# fit/predict/score loop below.
results = []

for method_name, fold_to_genes in methods.items():
    fold_accuracies = []

    # NOTE(review): folds are stratified on the shuffled labels here, so they are
    # likely not the same sample partitions as cv.split(X, y) -- confirm intended.
    for fold_idx, (train_idx, test_idx) in enumerate(cv.split(X, y_shuffled), start=1):
        genes = fold_to_genes[fold_idx]
        # keep only the selected genes that are actual columns of X
        genes = [g for g in genes if g in X.columns]

        X_train_sel = X.iloc[train_idx][genes]
        X_test_sel  = X.iloc[test_idx][genes]

        # both training and test labels come from the permuted vector
        y_train = y_shuffled[train_idx]
        y_test  = y_shuffled[test_idx]

        rf = RandomForestClassifier(
            n_estimators=500,
            max_depth=None,
            n_jobs=-1,
            random_state=42,
        )

        rf.fit(X_train_sel, y_train)
        y_pred = rf.predict(X_test_sel)
        acc = accuracy_score(y_test, y_pred)
        fold_accuracies.append(acc)

        print(f"{method_name} | Fold {fold_idx}: {len(genes)} genes, acc = {acc:.3f}")

    # per-method summary across folds (np.std default, i.e. population std)
    mean_acc = np.mean(fold_accuracies)
    std_acc = np.std(fold_accuracies)
    print(f"\n{method_name}: mean acc = {mean_acc:.3f} ± {std_acc:.3f}\n")

    results.append({
        "method": method_name,
        "mean_accuracy": mean_acc,
        "std_accuracy": std_acc,
    })

# summary table for the shuffled-label run (overwrites the earlier results_df)
results_df = pd.DataFrame(results)
print(results_df)

variance_k200 | Fold 1: 200 genes, acc = 0.348
variance_k200 | Fold 2: 200 genes, acc = 0.325
variance_k200 | Fold 3: 200 genes, acc = 0.338
variance_k200 | Fold 4: 200 genes, acc = 0.356
variance_k200 | Fold 5: 200 genes, acc = 0.369

variance_k200: mean acc = 0.347 ± 0.015

mutual_information | Fold 1: 200 genes, acc = 0.366
mutual_information | Fold 2: 200 genes, acc = 0.356
mutual_information | Fold 3: 200 genes, acc = 0.375
mutual_information | Fold 4: 200 genes, acc = 0.325
mutual_information | Fold 5: 200 genes, acc = 0.319

mutual_information: mean acc = 0.348 ± 0.022

lasso_l1 | Fold 1: 211 genes, acc = 0.348
lasso_l1 | Fold 2: 200 genes, acc = 0.325
lasso_l1 | Fold 3: 205 genes, acc = 0.350
lasso_l1 | Fold 4: 202 genes, acc = 0.344
lasso_l1 | Fold 5: 192 genes, acc = 0.388

lasso_l1: mean acc = 0.351 ± 0.020

               method  mean_accuracy  std_accuracy
0       variance_k200       0.347065      0.015057
1  mutual_information       0.348292      0.022459
2            las

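The above results (running the model on randomized labels) show accuracy dropping from ~1.00 to ~0.35 for every method, which indicates that there is NO major label leakage in the evaluation step of our pipeline. Note that the gene lists were loaded from the earlier feature-selection outputs and were not re-derived from the shuffled labels, so this check does not cover the feature-selection step itself.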

In [10]:
# Reference run: random forest on every column of X (no gene selection),
# using the same splitter and forest settings as the per-method runs.
baseline_accs = []

for fold_idx, (train_idx, test_idx) in enumerate(cv.split(X, y), start=1):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    rf = RandomForestClassifier(n_estimators=500, max_depth=None, n_jobs=-1, random_state=42)
    y_pred = rf.fit(X_train, y_train).predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    baseline_accs.append(acc)
    print(f"Baseline | Fold {fold_idx}: acc = {acc:.3f}")

# Summary across folds (np.std default, i.e. population std).
baseline_mean = np.mean(baseline_accs)
baseline_std = np.std(baseline_accs)
print(f"\nBaseline RF mean acc = {baseline_mean:.3f} ± {baseline_std:.3f}")

Baseline | Fold 1: acc = 1.000
Baseline | Fold 2: acc = 1.000
Baseline | Fold 3: acc = 0.994
Baseline | Fold 4: acc = 1.000
Baseline | Fold 5: acc = 0.988

Baseline RF mean acc = 0.996 ± 0.005
