# Disease Prediction from Gene Expression (RNA-seq)

This notebook demonstrates a simplified workflow for predicting cancer types from RNA-seq gene expression data.

We train and compare three classifiers on the same train/test split:
- **Random Forest**
- **Support Vector Machine (SVM)**
- **Feed-forward Neural Network (MLP)**

Expected input dataset format (a preprocessed CSV file):
- Rows = samples
- Columns = gene expression features (~19,000 genes)
- One column `label` = cancer type


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the preprocessed expression table from disk.
# Layout expected by the rest of the notebook: one row per sample, one column
# per gene, plus a 'label' column holding the class of each sample.
data = pd.read_csv(filepath_or_buffer="data/processed/tcga_processed.csv")

# Quick sanity check: (n_samples, n_columns) followed by a preview of the
# first rows (rendered as the cell output).
print(data.shape)
data.head()

In [None]:
# Separate the expression features from the target column.
X = data.drop(columns=["label"])
y = data["label"]

# Split BEFORE scaling so that held-out samples never influence the scaler.
# Stratifying on y keeps the class proportions the same in both partitions;
# the fixed seed makes the partition reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Learn the per-feature mean/variance from the training partition only, then
# apply that same transform to the test partition. Fitting on the full dataset
# would leak test-set statistics into training and inflate the reported scores.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full matrix scaled with the training-set statistics, kept so that any cell
# relying on `X_scaled` still finds it. It contains the test rows, so do not
# use it to fit models that are then evaluated on X_test.
X_scaled = scaler.transform(X)

In [None]:
# Random Forest: 200 trees, seeded so repeated runs give the same forest.
# `fit` returns the estimator itself, so construction and training are chained.
rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
).fit(X_train, y_train)

# Predict the held-out samples and report per-class precision/recall/F1.
y_pred_rf = rf.predict(X_test)
print("Random Forest Results:", classification_report(y_test, y_pred_rf), sep="\n")

In [None]:
# RBF-kernel support vector classifier.
# NOTE(review): probability=True enables `predict_proba`, which per the
# scikit-learn docs makes `fit` slower (internal cross-validation). Nothing in
# the cells shown here uses probabilities -- confirm it is needed before keeping it.
svm_params = {"kernel": "rbf", "probability": True, "random_state": 42}
svm = SVC(**svm_params)
svm.fit(X_train, y_train)

# Predict the held-out samples, then print the header and per-class report.
y_pred_svm = svm.predict(X_test)
svm_report = classification_report(y_test, y_pred_svm)
print("SVM Results:", svm_report, sep="\n")

In [None]:
# Feed-forward network with two hidden layers (128 then 64 units), seeded for
# reproducibility.
# NOTE(review): max_iter=50 caps the number of training iterations; if the
# optimizer has not converged by then scikit-learn emits a ConvergenceWarning.
mlp = MLPClassifier(
    hidden_layer_sizes=(128, 64),
    max_iter=50,
    random_state=42,
)

# Train, then predict the held-out samples (`fit` returns the estimator).
y_pred_mlp = mlp.fit(X_train, y_train).predict(X_test)

print("MLP Results:")
print(classification_report(y_test, y_pred_mlp))

In [None]:
# Side-by-side confusion matrices for the three classifiers.
#
# One shared, sorted label set is used for every panel so that row/column k
# refers to the same class in all three heatmaps (otherwise each matrix would
# be built from only the labels present in y_test and that model's predictions).
class_labels = sorted(set(y_test).union(y_pred_rf, y_pred_svm, y_pred_mlp))

fig, ax = plt.subplots(1, 3, figsize=(18, 5))

for i, (model_name, preds) in enumerate([("RF", y_pred_rf), ("SVM", y_pred_svm), ("MLP", y_pred_mlp)]):
    # Rows are true classes, columns are predicted classes.
    cm = confusion_matrix(y_test, preds, labels=class_labels)
    sns.heatmap(
        cm,
        annot=False,  # cell counts omitted, as in the original figure
        cmap="Blues",
        xticklabels=class_labels,
        yticklabels=class_labels,
        ax=ax[i],
    )
    ax[i].set_title(f"{model_name} Confusion Matrix")
    ax[i].set_xlabel("Predicted label")
    ax[i].set_ylabel("True label")

# Keep tick labels and axis titles from overlapping neighbouring panels.
fig.tight_layout()
plt.show()