In [1]:
# Core Data Science
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Scikit-learn
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, f1_score, precision_recall_fscore_support,
    confusion_matrix, classification_report, roc_auc_score,
    cohen_kappa_score, matthews_corrcoef
)

# Imbalanced learning
from imblearn.over_sampling import SMOTE

# XGBoost
import xgboost as xgb

# Reproducibility: one seed constant, reused by the split and resampling steps,
# and also applied to NumPy's legacy global random generator.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Plot defaults: seaborn white-grid theme on a wide 12x6 inch canvas.
plt.rcParams['figure.figsize'] = (12, 6)
sns.set_style('whitegrid')

In [3]:
# Read the preprocessed dataset and report its structure and composition.

# Load data
df = pd.read_csv('Final_Dataset_with_dataset.csv')

# NOTE(review): the "Columns (features)" figure is the total column count, which
# still includes the 'Diagnosis' and 'Dataset' label columns at this point.
n_rows, n_cols = df.shape
print(f"  Shape: {df.shape}")
print(f"  Rows (samples): {n_rows}")
print(f"  Columns (features): {n_cols}")

# The trailing columns are expected to be the clinical covariates and labels.
print("LAST 5 COLUMNS:")
print(list(df.columns[-5:]))

# Dataset composition: counts per source, counts per diagnosis, then jointly.
print("DATASET COMPOSITION:")
for heading, column in (("\nBy source:", 'Dataset'), ("\nBy diagnosis:", 'Diagnosis')):
    print(heading)
    print(df[column].value_counts())

# NOTE(review): in the recorded run ADNI contributes no AD samples, so the data
# source is confounded with the diagnosis label. A classifier may partly learn
# the source/batch rather than the disease; confirm batch effects are handled.
print("\nCross-tabulation (Dataset × Diagnosis):")
source_by_diagnosis = pd.crosstab(df['Dataset'], df['Diagnosis'])
print(source_by_diagnosis)

# Total number of missing cells across the whole frame.
missing = df.isnull().sum().sum()
print(f"MISSING VALUES: {missing}")
print("Warning: Missing values detected" if missing > 0 else "No missing values")

  Shape: (1042, 1004)
  Rows (samples): 1042
  Columns (features): 1004
LAST 5 COLUMNS:
['IL4I1', 'Age_Zscore', 'Sex_Male', 'Diagnosis', 'Dataset']
DATASET COMPOSITION:

By source:
Dataset
ADNI         700
GSE110226    329
GSE63060      13
Name: count, dtype: int64

By diagnosis:
Diagnosis
MCI        519
Control    371
AD         152
Name: count, dtype: int64

Cross-tabulation (Dataset × Diagnosis):
Diagnosis   AD  Control  MCI
Dataset                     
ADNI         0      261  439
GSE110226  145      104   80
GSE63060     7        6    0
MISSING VALUES: 0
No missing values


In [10]:
# Prepare data for modeling: build X / y, encode labels, and make a stratified
# 80/20 train/test split (resampling is applied later, to the training part only).


def print_class_distribution(encoded_labels, encoder):
    """Print per-class counts and percentages for an encoded label vector.

    Parameters
    ----------
    encoded_labels : array-like of int
        Integer labels as produced by ``encoder``.
    encoder : LabelEncoder
        Fitted encoder, used to map each integer label back to its class name.

    Returns
    -------
    pandas.Series
        Sample count per encoded label, indexed and sorted by label.
    """
    distribution = pd.Series(encoded_labels).value_counts().sort_index()
    total = len(encoded_labels)
    # Iterate (label, count) pairs so the class name comes from the real encoded
    # label; a positional counter would mislabel rows if a class were absent.
    for label, count in distribution.items():
        class_name = encoder.inverse_transform([label])[0]
        pct = 100 * count / total
        print(f"    {class_name:10s}: {count:3d} ({pct:5.1f}%)")
    return distribution


# Separate features and target
feature_cols = [col for col in df.columns if col not in ['Diagnosis', 'Dataset']]
X = df[feature_cols].values
y = df['Diagnosis'].values

# Derive the feature breakdown from the columns instead of hard-coding it.
# Assumes every non-clinical feature column is a gene -- TODO confirm.
clinical_cols = [col for col in ('Age_Zscore', 'Sex_Male') if col in feature_cols]
n_gene_cols = len(feature_cols) - len(clinical_cols)
feature_summary = ' + '.join([f"{n_gene_cols} genes", *clinical_cols])

print(f"\nFeature matrix: {X.shape}")
print(f"  Features: {X.shape[1]} ({feature_summary})")

print(f"\nTarget vector: {y.shape}")
print(f"  Classes: {np.unique(y)}")

# Encode labels; LabelEncoder assigns codes in sorted class order
# (AD=0, Control=1, MCI=2 for this dataset).
le = LabelEncoder()
y_encoded = le.fit_transform(y)

print("LABEL ENCODING:")
for i, class_name in enumerate(le.classes_):
    # Position in classes_ is exactly the encoded value, so `i` is the label here.
    count = np.sum(y_encoded == i)
    print(f"  {class_name:10s} : {i}  ({count} samples)")

# Stratified train/test split BEFORE SMOTE
# NOTE(review): stratification is on diagnosis only. The data source ('Dataset')
# is not considered, although the recorded cross-tab shows ADNI has no AD
# samples -- consider whether source should also inform the split/evaluation.
print("STRATIFIED TRAIN/TEST SPLIT (80/20)")

X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded,
    test_size=0.20,
    stratify=y_encoded,  # Preserves class proportions in both splits (does not balance them)
    random_state=RANDOM_STATE
)
# Every sample must land in exactly one of the two splits.
assert X_train.shape[0] + X_test.shape[0] == X.shape[0]

print(f"\nTraining set: {X_train.shape[0]} samples")
train_dist = print_class_distribution(y_train, le)

print(f"\nTest set: {X_test.shape[0]} samples")
test_dist = print_class_distribution(y_test, le)

# Calculate imbalance ratio (largest class size over smallest, training split)
print("CLASS IMBALANCE ANALYSIS:")
imbalance_ratio = train_dist.max() / train_dist.min()
print(f"  Majority class: {train_dist.max()} samples")
print(f"  Minority class: {train_dist.min()} samples")
print(f"  Imbalance ratio: {imbalance_ratio:.2f}:1")


Feature matrix: (1042, 1002)
  Features: 1002 (1000 genes + Age_Zscore + Sex_Male)

Target vector: (1042,)
  Classes: ['AD' 'Control' 'MCI']
LABEL ENCODING:
  AD         : 0  (152 samples)
  Control    : 1  (371 samples)
  MCI        : 2  (519 samples)
STRATIFIED TRAIN/TEST SPLIT (80/20)

Training set: 833 samples
    AD        : 121 ( 14.5%)
    Control   : 297 ( 35.7%)
    MCI       : 415 ( 49.8%)

Test set: 209 samples
    AD        :  31 ( 14.8%)
    Control   :  74 ( 35.4%)
    MCI       : 104 ( 49.8%)
CLASS IMBALANCE ANALYSIS:
  Majority class: 415 samples
  Minority class: 121 samples
  Imbalance ratio: 3.43:1


In [11]:
# Apply SMOTE to training data only; the test split is left untouched so that
# evaluation is done on real samples.
# NOTE(review): if X_train_resampled is later fed to cross-validation, synthetic
# points derived from a validation fold's neighbours can leak into training
# folds. Presumably CV follows (StratifiedKFold / cross_validate are imported);
# if so, resample inside each fold instead (e.g. an imblearn Pipeline) -- confirm.

# Store original training set info
print("\nBEFORE SMOTE:")
print(f"  Training set: {X_train.shape[0]} samples")
original_train_dist = pd.Series(y_train).value_counts().sort_index()
# Iterate (label, count) pairs so names are resolved from the encoded label
# itself rather than from the row position.
for label, count in original_train_dist.items():
    class_name = le.inverse_transform([label])[0]
    print(f"    {class_name:10s}: {count:3d} samples")

# Apply SMOTE (default settings, seeded for repeatability)
smote = SMOTE(random_state=RANDOM_STATE)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("\nAFTER SMOTE:")
print(f"  Training set: {X_train_resampled.shape[0]} samples")
resampled_train_dist = pd.Series(y_train_resampled).value_counts().sort_index()
for label, count in resampled_train_dist.items():
    class_name = le.inverse_transform([label])[0]
    # Look up the pre-resampling count by label, not by position, so the two
    # distributions cannot be silently misaligned.
    synthetic = count - original_train_dist.get(label, 0)
    print(f"    {class_name:10s}: {count:3d} samples (+{synthetic} synthetic)")

print(f"\nTest set: {X_test.shape[0]} samples (UNCHANGED)")
print("  No synthetic data in test set (proper evaluation!)")

print("SUMMARY:")
print(f"  Original training samples: {X_train.shape[0]}")
print(f"  After SMOTE: {X_train_resampled.shape[0]}")
print(f"  Synthetic samples added: {X_train_resampled.shape[0] - X_train.shape[0]}")
print(f"  Test samples: {X_test.shape[0]} (unchanged)")


BEFORE SMOTE:
  Training set: 833 samples
    AD        : 121 samples
    Control   : 297 samples
    MCI       : 415 samples

AFTER SMOTE:
  Training set: 1245 samples
    AD        : 415 samples (+294 synthetic)
    Control   : 415 samples (+118 synthetic)
    MCI       : 415 samples (+0 synthetic)

Test set: 209 samples (UNCHANGED)
  No synthetic data in test set (proper evaluation!)
SUMMARY:
  Original training samples: 833
  After SMOTE: 1245
  Synthetic samples added: 412
  Test samples: 209 (unchanged)
