In [None]:
# Import Libraries

In [None]:
import os
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid')

# Preprocessing & modeling
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, brier_score_loss
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, IsolationForest
import xgboost as xgb
import lightgbm as lgb
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import joblib

# Unsupervised & explainability
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
import umap
import shap

# Deep learning autoencoder
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Project layout: input CSVs are read from ./data, fitted artifacts are
# written to ./models/save. Both folders are created on first run.
PROJECT_DIR = Path('.')
DATA_DIR = PROJECT_DIR.joinpath('data')
MODEL_DIR = PROJECT_DIR.joinpath('models', 'save')

DATA_DIR.mkdir(exist_ok=True, parents=True)
MODEL_DIR.mkdir(exist_ok=True, parents=True)

print('Model dir:', MODEL_DIR.resolve())

In [None]:
# Load dataset

In [None]:
# Accepted file names for the Framingham CSV, tried in this order.
possible_files = ['framingham.csv', 'Framingham.csv', 'framingham_heart.csv', 'framingham_heart_study.csv']

# Pick the first candidate that exists inside DATA_DIR (None when none do).
csv_path = next(
    (DATA_DIR / name for name in possible_files if (DATA_DIR / name).exists()),
    None,
)
if csv_path is None:
    raise FileNotFoundError(f"No framingham csv found in {DATA_DIR}. Place the dataset file there with one of names: {possible_files}")

df = pd.read_csv(csv_path)
print(f'Loaded {csv_path.name} — shape:', df.shape)

# quick overview
print(df.columns.tolist())
df.head()

In [None]:
# Basic sanity checks on the raw frame: size, missingness, class balance.
# The section titles start with an explicit "\n" escape so each one is
# preceded by a blank line; a literal line break inside the quotes would be
# an unterminated string.
print('Shape:', df.shape)
print('\nMissing values per column:')
print(df.isnull().sum())
print('\nTarget distribution:')
print(df['TenYearCHD'].value_counts())

In [None]:
# Feature engineering function

In [None]:
def engineer_features(df_in):
    """Return a copy of ``df_in`` with derived risk-factor columns added.

    Each derived column is created only when its source column(s) are present:

    * ``pulse_pressure``: ``sysBP - diaBP``.
    * ``bmi_cat``: ``BMI`` bucketed into underweight / normal / overweight /
      obese with left-closed bins ``[0, 18.5)``, ``[18.5, 25)``, ``[25, 30)``,
      ``[30, 200)``. Values outside the bins become NaN.
    * ``age_decade``: ``age`` floored to a multiple of 10.
    * ``pack_proxy``: ``cigsPerDay * smokingYears``; when there is no
      ``smokingYears`` column the multiplier is 1, so it equals ``cigsPerDay``.

    ``bmi_cat`` and ``age_decade`` are cast to pandas ``category`` dtype.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_in`` with the derived columns appended.
    """
    df = df_in.copy()
    # pulse pressure
    if 'sysBP' in df.columns and 'diaBP' in df.columns:
        df['pulse_pressure'] = df['sysBP'] - df['diaBP']
    # BMI category
    if 'BMI' in df.columns:
        # right=False makes each bin include its lower edge, so a BMI of
        # exactly 18.5 / 25 / 30 lands in normal / overweight / obese instead
        # of the category below it (pd.cut's default is right-closed bins).
        df['bmi_cat'] = pd.cut(
            df['BMI'],
            bins=[0, 18.5, 25, 30, 200],
            labels=['underweight', 'normal', 'overweight', 'obese'],
            right=False,
        )
    # age decade
    if 'age' in df.columns:
        df['age_decade'] = (df['age'] // 10) * 10
    # pack proxy
    if 'cigsPerDay' in df.columns:
        df['pack_proxy'] = df['cigsPerDay'] * df.get('smokingYears', 1)
    # Cast categories
    for c in ['bmi_cat', 'age_decade']:
        if c in df.columns:
            df[c] = df[c].astype('category')
    return df

# Run the feature-engineering step on the loaded frame and report the new size.
df = df.pipe(engineer_features)
print('After engineering:', df.shape)

In [None]:
# Define feature lists and split

In [None]:
TARGET = 'TenYearCHD'

# Candidate columns, filtered down to those actually present in df so the
# notebook still runs when a column is missing from the CSV.
NUMERIC_FEATURES = [
    col
    for col in ['age', 'cigsPerDay', 'totChol', 'sysBP', 'diaBP', 'BMI',
                'heartRate', 'glucose', 'pulse_pressure', 'pack_proxy']
    if col in df.columns
]
CATEGORICAL_FEATURES = [
    col
    for col in ['male', 'education', 'currentSmoker', 'BPMeds', 'prevalentStroke',
                'prevalentHyp', 'diabetes', 'bmi_cat', 'age_decade']
    if col in df.columns
]
FEATURES = NUMERIC_FEATURES + CATEGORICAL_FEATURES
print('Numeric:', NUMERIC_FEATURES)
print('Categorical:', CATEGORICAL_FEATURES)

# Rows without a label cannot be used for training or evaluation.
df = df.dropna(subset=[TARGET])
X, y = df[FEATURES], df[TARGET]

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print('Train:', X_train.shape, 'Test:', X_test.shape)

In [None]:
# Preprocessing pipelines

In [None]:
# Numeric columns: fill missing values with IterativeImputer (using a small
# random forest as the per-feature regressor), then standardize.
# RandomForestRegressor comes from the notebook's top import cell.
numeric_transformer = Pipeline(steps=[
    ('imputer', IterativeImputer(random_state=0, estimator=RandomForestRegressor(n_estimators=10, random_state=0))),
    ('scaler', StandardScaler())
])

# Categorical columns: fill missing values with the most frequent level, then
# one-hot encode; levels not seen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, NUMERIC_FEATURES),
    ('cat', categorical_transformer, CATEGORICAL_FEATURES)
])

# Fit on the training split only, so imputation/scaling/encoding parameters
# are learned without looking at the held-out rows.
preprocessor.fit(X_train)
# Transform every row (train + test) for the downstream unsupervised steps.
X_full_trans = preprocessor.transform(X)
print('Preprocessed feature matrix shape (sparse/array):', getattr(X_full_trans, 'shape', None))

# Save preprocessor
joblib.dump(preprocessor, MODEL_DIR / 'preprocessor.joblib')

In [None]:
# Build column names for transformed array

In [None]:
# Recover the one-hot output names from the fitted encoder inside the 'cat'
# pipeline of the ColumnTransformer.
onehot_cols = []
if 'cat' in preprocessor.named_transformers_:
    cat_transformer = preprocessor.named_transformers_['cat']
    if hasattr(cat_transformer['onehot'], 'get_feature_names_out'):
        onehot_cols = list(cat_transformer['onehot'].get_feature_names_out(CATEGORICAL_FEATURES))

# Column order mirrors the ColumnTransformer: numeric block first, then one-hot.
num_cols = list(NUMERIC_FEATURES)
all_cols = num_cols + onehot_cols

# Densify only when the transformer actually returned a sparse matrix. An
# explicit capability check is used instead of a blanket try/except so that a
# real failure inside toarray() (e.g. running out of memory) is not hidden.
if hasattr(X_full_trans, 'toarray'):
    X_full_arr = X_full_trans.toarray()
else:
    X_full_arr = np.asarray(X_full_trans)

# Fail early with a readable message if the names do not line up with the data.
if X_full_arr.shape[1] != len(all_cols):
    raise ValueError(
        f'Transformed matrix has {X_full_arr.shape[1]} columns but {len(all_cols)} column names were built'
    )

# X is the frame that was passed to preprocessor.transform, so its index is
# the one that matches the rows of X_full_arr.
X_full_df = pd.DataFrame(X_full_arr, columns=all_cols, index=X.index)
print('X_full_df shape:', X_full_df.shape)

In [None]:
# PCA for dimensionality reduction & explained variance plot

In [None]:
# Keep at most 30 components; PCA cannot return more components than there
# are rows or columns, so cap by both dimensions. random_state pins the result
# for the randomized/arpack SVD solvers, where PCA's output depends on a seed.
pca = PCA(n_components=min(30, *X_full_df.shape), random_state=42)
X_pca = pca.fit_transform(X_full_df)

# Cumulative explained variance in percent. The x values start at 1 so the
# point at x = k reads as "variance explained by the first k components"
# (plotting against the default 0-based index would shift the curve by one).
cum_var_pct = np.cumsum(pca.explained_variance_ratio_) * 100
component_counts = np.arange(1, len(cum_var_pct) + 1)

plt.figure(figsize=(8,4))
plt.plot(component_counts, cum_var_pct)
plt.xlabel('n components')
plt.ylabel('Cumulative explained variance (%)')
plt.title('PCA cumulative explained variance')
plt.grid(True)
plt.show()

# Save PCA
joblib.dump(pca, MODEL_DIR / 'pca.joblib')

In [None]:
# UMAP embedding for visualization

In [None]:
# Fit a seeded UMAP on the preprocessed feature matrix; X_umap holds the
# embedding, whose first two columns are plotted below.
reducer = umap.UMAP(random_state=42, n_neighbors=30, min_dist=0.1)
X_umap = reducer.fit_transform(X_full_df)

# Scatter of the embedding, colored by the outcome label.
umap_fig, umap_ax = plt.subplots(figsize=(8,6))
scatter = umap_ax.scatter(
    X_umap[:, 0],
    X_umap[:, 1],
    c=df[TARGET],
    cmap='coolwarm',
    alpha=0.7,
)
umap_ax.set_title('UMAP embedding colored by TenYearCHD')
umap_fig.colorbar(scatter, ax=umap_ax, label='TenYearCHD')
plt.show()

# Persist the fitted reducer next to the other artifacts.
joblib.dump(reducer, MODEL_DIR / 'umap.joblib')

In [None]:
# Clustering (KMeans + Agglomerative) on PCA-reduced features

In [None]:
# Cluster on the leading PCA components: a fixed cap of 10 (fewer if PCA
# produced fewer). Note this is NOT a variance-based cut-off; check the
# cumulative explained-variance plot to see how much variance 10 components keep.
n_comp = min(10, X_pca.shape[1])
X_pca_reduced = X_pca[:, :n_comp]

# n_init is set explicitly because the library default for it differs between
# scikit-learn releases; pinning it keeps cluster assignments reproducible
# across environments.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
km_labels = kmeans.fit_predict(X_pca_reduced)

agg = AgglomerativeClustering(n_clusters=4)
agg_labels = agg.fit_predict(X_pca_reduced)

# Add to dataframe (rows of X_pca are in the same order as df)
df['kmeans_cluster'] = km_labels
df['agg_cluster'] = agg_labels

# Visualize clusters on UMAP
plt.figure(figsize=(12,5))
plt.subplot(1,2,1)
plt.scatter(X_umap[:,0], X_umap[:,1], c=km_labels, cmap='tab10', alpha=0.8)
plt.title('UMAP colored by KMeans cluster')
plt.subplot(1,2,2)
plt.scatter(X_umap[:,0], X_umap[:,1], c=agg_labels, cmap='tab10', alpha=0.8)
plt.title('UMAP colored by Agglomerative cluster')
plt.show()

# Cluster risk profiling: share of positive TenYearCHD labels and size per cluster
cluster_summary = df.groupby('kmeans_cluster')[TARGET].agg(['mean','count']).rename(columns={'mean':'risk_rate'})
print('Cluster summary (KMeans):')
print(cluster_summary)

joblib.dump(kmeans, MODEL_DIR / 'kmeans.joblib')

In [None]:
# Anomaly detection with IsolationForest

In [None]:
# Fit an isolation forest on the preprocessed features.
iso = IsolationForest(random_state=42, n_estimators=200, contamination=0.02)
iso.fit(X_full_df)

# decision_function is negated so that larger values mean "more anomalous".
iso_scores = np.negative(iso.decision_function(X_full_df))
df['iso_score'] = iso_scores

# Distribution of the anomaly scores.
plt.figure(figsize=(8,4))
plt.hist(iso_scores, bins=50)
plt.title('IsolationForest anomaly score distribution')
plt.xlabel('anomaly score (higher=more anomalous)')
plt.show()

# Keep the 20 highest-scoring rows; only the first 10 are printed.
top_anom = df.nlargest(20, 'iso_score')
print('Top anomalous records (index, iso_score):')
print(top_anom.loc[:, ['iso_score', TARGET]].iloc[:10])

joblib.dump(iso, MODEL_DIR / 'isolationforest.joblib')