In [17]:
# Core Data Science
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Scikit-learn
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, f1_score, precision_recall_fscore_support,
    confusion_matrix, classification_report, roc_auc_score,
    cohen_kappa_score, matthews_corrcoef
)

# Imbalanced learning
from imblearn.over_sampling import SMOTE

# XGBoost
import xgboost as xgb

# Reproducibility: one shared seed constant, also applied to NumPy's global RNG
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Plot appearance: seaborn grid theme and a default 12x6 figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

In [20]:
# Load the preprocessed dataset and print a quick structural summary

# Read the CSV (the path is relative to the notebook's working directory)
df = pd.read_csv('Final_Dataset_with_dataset.csv')

# Overall dimensions. Note: the column count covers every column in the file,
# including the 'Diagnosis' and 'Dataset' columns used further down.
n_rows, n_cols = df.shape
print(f"  Shape: {df.shape}")
print(f"  Rows (samples): {n_rows}")
print(f"  Columns (features): {n_cols}")

# Show the trailing columns (expected to be the clinical covariates and labels)
tail_columns = df.columns[-5:]
print("LAST 5 COLUMNS:")
print(tail_columns.tolist())

# Sample counts per source dataset and per diagnosis class
print("DATASET COMPOSITION:")
for heading, column in (("\nBy source:", 'Dataset'), ("\nBy diagnosis:", 'Diagnosis')):
    print(heading)
    print(df[column].value_counts())

# Joint distribution of source dataset versus diagnosis
source_by_diagnosis = pd.crosstab(index=df['Dataset'], columns=df['Diagnosis'])
print("\nCross-tabulation (Dataset × Diagnosis):")
print(source_by_diagnosis)

# Total number of missing cells across the whole frame
missing = df.isna().sum().sum()
print(f"MISSING VALUES: {missing}")
print("Warning: Missing values detected" if missing > 0 else "No missing values")

  Shape: (1042, 1004)
  Rows (samples): 1042
  Columns (features): 1004
LAST 5 COLUMNS:
['IL4I1', 'Age_Zscore', 'Sex_Male', 'Diagnosis', 'Dataset']
DATASET COMPOSITION:

By source:
Dataset
ADNI         700
GSE110226    329
GSE63060      13
Name: count, dtype: int64

By diagnosis:
Diagnosis
MCI        519
Control    371
AD         152
Name: count, dtype: int64

Cross-tabulation (Dataset × Diagnosis):
Diagnosis   AD  Control  MCI
Dataset                     
ADNI         0      261  439
GSE110226  145      104   80
GSE63060     7        6    0
MISSING VALUES: 0
No missing values
