In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [4]:
# Load the expression matrix and the label table; both CSVs use their first
# column as the row index.
X = pd.read_csv("../../RNAseq_with_HGNC_symbols1.csv", index_col=0)
labels_df = pd.read_csv("../../labels.csv", index_col=0)

# y is paired with the rows of X purely by position in the cells below, so
# fail here with a clear message instead of later inside cv.split.
if len(labels_df) != len(X):
    raise ValueError(
        f"Expected one label per sample: X has {len(X)} rows, "
        f"labels.csv has {len(labels_df)}"
    )
# NOTE(review): row order is assumed to be identical in both files — confirm.
y = labels_df["Class"].values

# same CV as before: 5 stratified folds, shuffled with a fixed seed so that
# repeated cv.split calls on the same labels give the same partition
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [6]:
# Variance-filtered gene lists: a single CSV with one column per fold
# ("fold_1" ... "fold_5").
var_k = 2000
var_path = f"../../Output/variance_filtering_results/variance_top_genes_k{var_k}.csv"
var_df = pd.read_csv(var_path)

# fold number (1-based) -> list of genes; missing entries in a column are dropped
variance_genes = {
    fold_no: var_df[f"fold_{fold_no}"].dropna().tolist()
    for fold_no in range(1, 6)
}

In [7]:
# Mutual-information gene lists: one CSV per fold, genes read from the first column.
# fold number (1-based) -> list of genes
mi_genes = {}
for fold in range(1, 6):
    mi_table = pd.read_csv(
        f"../../Output/mutual_information_results/selected_features_fold_{fold}.csv"
    )
    first_column = mi_table.iloc[:, 0]
    genes = first_column.dropna().tolist()
    mi_genes[fold] = genes

In [8]:
# LASSO (L1) gene lists: one CSV per fold, genes read from the first column.
# fold number (1-based) -> list of genes
l1_genes = {}
for fold in range(1, 6):
    l1_table = pd.read_csv(f"../../Output/l1_results/selected_genes_fold_{fold}.csv")
    # positional column 0 holds the gene names; drop missing entries before listing
    genes = l1_table.iloc[:, 0].dropna().tolist()
    l1_genes[fold] = genes

In [9]:
# Registry of feature-selection methods:
# method name -> {fold number -> genes selected for that fold}.
# Insertion order is the order in which the methods are evaluated and reported.
methods = {}
methods[f"variance_k{var_k}"] = variance_genes
methods["mutual_information"] = mi_genes
methods["lasso_l1"] = l1_genes

In [11]:
# Random forest for each method and fold: one model per (method, fold), trained
# and scored using only the genes pre-selected for that fold.
results = []

# cv has a fixed random_state, so the partition is the same for every method;
# compute it once instead of re-splitting inside the method loop.
folds = list(cv.split(X, y))

for method_name, fold_to_genes in methods.items():
    fold_accuracies = []

    for fold_idx, (train_idx, test_idx) in enumerate(folds, start=1):
        selected = fold_to_genes[fold_idx]
        # Keep only genes that are columns of X, and report any that were
        # dropped so the gene count printed below cannot hide a mismatch.
        genes = [g for g in selected if g in X.columns]
        n_missing = len(selected) - len(genes)
        if n_missing:
            print(
                f"{method_name} | Fold {fold_idx}: "
                f"{n_missing} selected genes not found in X, skipped"
            )
        if not genes:
            raise ValueError(
                f"{method_name} | Fold {fold_idx}: no selected genes are present in X"
            )

        X_train_sel = X.iloc[train_idx][genes]
        X_test_sel = X.iloc[test_idx][genes]
        y_train = y[train_idx]
        y_test = y[test_idx]

        # Same hyper-parameters and seed for every method/fold, so differences
        # in accuracy come from the feature set and the fold only.
        rf = RandomForestClassifier(
            n_estimators=500,
            max_depth=None,
            n_jobs=-1,
            random_state=42,
        )

        rf.fit(X_train_sel, y_train)
        y_pred = rf.predict(X_test_sel)
        acc = accuracy_score(y_test, y_pred)
        fold_accuracies.append(acc)

        print(f"{method_name} | Fold {fold_idx}: {len(genes)} genes, acc = {acc:.3f}")

    mean_acc = np.mean(fold_accuracies)
    # np.std default (ddof=0): population standard deviation across the folds
    std_acc = np.std(fold_accuracies)
    print(f"\n{method_name}: mean acc = {mean_acc:.3f} ± {std_acc:.3f}\n")

    results.append({
        "method": method_name,
        "mean_accuracy": mean_acc,
        "std_accuracy": std_acc,
    })

# save summary table (one row per method) to the current working directory
results_df = pd.DataFrame(results)
results_df.to_csv("rf_performance_summary.csv", index=False)
print(results_df)

variance_k2000 | Fold 1: 2000 genes, acc = 1.000
variance_k2000 | Fold 2: 2000 genes, acc = 1.000
variance_k2000 | Fold 3: 2000 genes, acc = 0.994
variance_k2000 | Fold 4: 2000 genes, acc = 1.000
variance_k2000 | Fold 5: 2000 genes, acc = 0.981

variance_k2000: mean acc = 0.995 ± 0.007

mutual_information | Fold 1: 200 genes, acc = 1.000
mutual_information | Fold 2: 200 genes, acc = 1.000
mutual_information | Fold 3: 200 genes, acc = 0.994
mutual_information | Fold 4: 200 genes, acc = 1.000
mutual_information | Fold 5: 200 genes, acc = 0.994

mutual_information: mean acc = 0.997 ± 0.003

lasso_l1 | Fold 1: 211 genes, acc = 1.000
lasso_l1 | Fold 2: 200 genes, acc = 1.000
lasso_l1 | Fold 3: 205 genes, acc = 0.994
lasso_l1 | Fold 4: 202 genes, acc = 1.000
lasso_l1 | Fold 5: 192 genes, acc = 0.994

lasso_l1: mean acc = 0.997 ± 0.003

               method  mean_accuracy  std_accuracy
0      variance_k2000         0.9950      0.007289
1  mutual_information         0.9975      0.003062
2    

In [12]:
# Negative control: shuffle labels and repeat the evaluation.
# A seeded generator makes the permutation, and therefore the accuracies
# printed below, reproducible across kernel restarts.
y_shuffled = np.random.default_rng(42).permutation(y)

# NOTE: `results` and `results_df` are rebound here and no longer hold the
# real-label results from the previous cell.
results = []

# Split on the ORIGINAL labels. StratifiedKFold derives its partition from the
# labels it is given, so splitting on y_shuffled would produce different folds,
# and fold_to_genes[fold_idx] would no longer refer to the fold it is used with.
# Only the labels fed to the model are shuffled.
folds = list(cv.split(X, y))

for method_name, fold_to_genes in methods.items():
    fold_accuracies = []

    for fold_idx, (train_idx, test_idx) in enumerate(folds, start=1):
        genes = fold_to_genes[fold_idx]
        # ensure genes are in the dataset
        genes = [g for g in genes if g in X.columns]

        X_train_sel = X.iloc[train_idx][genes]
        X_test_sel = X.iloc[test_idx][genes]

        y_train = y_shuffled[train_idx]
        y_test = y_shuffled[test_idx]

        # identical model configuration to the real-label run
        rf = RandomForestClassifier(
            n_estimators=500,
            max_depth=None,
            n_jobs=-1,
            random_state=42,
        )

        rf.fit(X_train_sel, y_train)
        y_pred = rf.predict(X_test_sel)
        acc = accuracy_score(y_test, y_pred)
        fold_accuracies.append(acc)

        print(f"{method_name} | Fold {fold_idx}: {len(genes)} genes, acc = {acc:.3f}")

    mean_acc = np.mean(fold_accuracies)
    # np.std default (ddof=0): population standard deviation across the folds
    std_acc = np.std(fold_accuracies)
    print(f"\n{method_name}: mean acc = {mean_acc:.3f} ± {std_acc:.3f}\n")

    results.append({
        "method": method_name,
        "mean_accuracy": mean_acc,
        "std_accuracy": std_acc,
    })

results_df = pd.DataFrame(results)
print(results_df)

variance_k2000 | Fold 1: 2000 genes, acc = 0.373
variance_k2000 | Fold 2: 2000 genes, acc = 0.375
variance_k2000 | Fold 3: 2000 genes, acc = 0.356
variance_k2000 | Fold 4: 2000 genes, acc = 0.362
variance_k2000 | Fold 5: 2000 genes, acc = 0.356

variance_k2000: mean acc = 0.365 ± 0.008

mutual_information | Fold 1: 200 genes, acc = 0.354
mutual_information | Fold 2: 200 genes, acc = 0.381
mutual_information | Fold 3: 200 genes, acc = 0.350
mutual_information | Fold 4: 200 genes, acc = 0.338
mutual_information | Fold 5: 200 genes, acc = 0.338

mutual_information: mean acc = 0.352 ± 0.016

lasso_l1 | Fold 1: 211 genes, acc = 0.360
lasso_l1 | Fold 2: 200 genes, acc = 0.369
lasso_l1 | Fold 3: 205 genes, acc = 0.369
lasso_l1 | Fold 4: 202 genes, acc = 0.369
lasso_l1 | Fold 5: 192 genes, acc = 0.362

lasso_l1: mean acc = 0.366 ± 0.004

               method  mean_accuracy  std_accuracy
0      variance_k2000       0.364534      0.007964
1  mutual_information       0.352057      0.016026
2    

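With randomized labels, accuracy drops from ≈0.99 to ≈0.35–0.37 for all three feature sets in the run shown above. This indicates that the random forest training/evaluation step itself is not leaking test labels (compare these values against the majority-class frequency to confirm they are at chance level). Note that the gene lists are loaded from pre-computed files and are not re-selected on the shuffled labels, so this check does not by itself rule out leakage inside the feature-selection steps; that would require re-running feature selection on the shuffled labels.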