In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import RobustScaler

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import mutual_info_classif

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from torch.nn import TransformerEncoder, TransformerEncoderLayer

In [None]:
# Load data
df = pd.read_csv("cardio_train.csv", delimiter=";")

# Convert target to binary (0: No HD, 1: HD).
# A missing target compares False against 0 and therefore becomes 0.
df['cardio'] = (df['cardio'] > 0).astype(int)

# Separate features and target
# NOTE(review): every non-target column is used as a feature; if the CSV
# carries a row-identifier column it should be dropped here — confirm.
X = df.drop('cardio', axis=1)
y = df['cardio']

# Split data FIRST. Every fitted step below (imputer, SMOTE, scaler, feature
# ranking) must only ever see training rows; fitting them on the full dataset
# leaks test information and puts synthetic samples into the test set.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Impute missing feature values (neighbours are searched among training rows
# only; the target column is no longer part of the imputation input)
imputer = KNNImputer(n_neighbors=5)
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train_raw), columns=X.columns)
X_test_imputed = pd.DataFrame(imputer.transform(X_test_raw), columns=X.columns)

# Balance classes with SMOTE — training rows only, so the test set keeps
# real samples
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train_imputed, y_train_raw)

# Normalize data (scaler statistics come from the training data, then the
# same transform is applied to the test rows)
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X_res)
X_test_scaled = scaler.transform(X_test_imputed)

# Calculate feature importance via Extra Trees (training data only)
etc = ExtraTreesClassifier(n_estimators=200, random_state=42)
etc.fit(X_scaled, y_res)
etc_importances = etc.feature_importances_

# Calculate Mutual Information scores (seeded so the ranking is reproducible)
mi_scores = mutual_info_classif(X_scaled, y_res, random_state=42)

# Select top 10 features using combined scores
# (argsort is ascending, so the last 10 indices are the highest-scoring ones)
combined_scores = (etc_importances + mi_scores) / 2
selected_indices = np.argsort(combined_scores)[-10:]
X_selected = X_scaled[:, selected_indices]

# Final train / test matrices restricted to the selected feature columns.
# X_train / X_test are NumPy arrays; y_train / y_test are pandas Series.
X_train = X_selected
X_test = X_test_scaled[:, selected_indices]
y_train = y_res

# Transformer Model
class TransformerModel(nn.Module):
    """Binary classifier that runs one embedded feature vector through a Transformer encoder.

    Each sample is projected to ``d_model`` dimensions and handed to the
    encoder as a sequence of length 1; the encoder output for that single
    token is mapped to one sigmoid probability.

    Args:
        input_dim: Number of input features per sample.
        d_model: Width of the embedding and of the encoder layers.
        nhead: Number of attention heads in each encoder layer.
        num_layers: Number of stacked encoder layers.
    """

    def __init__(self, input_dim, d_model=64, nhead=4, num_layers=3):
        super().__init__()
        self.embedding = nn.Linear(input_dim, d_model)
        self.encoder = TransformerEncoder(
            TransformerEncoderLayer(d_model, nhead, batch_first=True),
            num_layers,
        )
        self.classifier = nn.Linear(d_model, 1)

    def forward(self, x):
        """Return probabilities of shape (batch_size, 1) for input of shape (batch_size, input_dim)."""
        # (batch_size, input_dim) -> (batch_size, d_model) -> (batch_size, 1, d_model)
        tokens = self.embedding(x).unsqueeze(1)
        # The encoder keeps the shape: (batch_size, 1, d_model)
        encoded = self.encoder(tokens)
        # Take the only token in the sequence: (batch_size, d_model)
        pooled = encoded[:, 0, :]
        logits = self.classifier(pooled)
        return torch.sigmoid(logits)


# Seed torch so weight initialisation, batch shuffling and dropout are reproducible
torch.manual_seed(42)

# Convert data to PyTorch tensors
train_data = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train.values, dtype=torch.float32))
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)

# Initialize and train Transformer
model = TransformerModel(input_dim=X_train.shape[1])
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

N_EPOCHS = 10  # number of full passes over the training data; tune as needed

model.train()
for epoch in range(N_EPOCHS):
    epoch_loss = 0.0
    for batch_features, batch_labels in train_loader:
        optimizer.zero_grad()
        # The model returns (batch, 1); BCELoss needs the same shape as the
        # (batch,) label tensor, so drop the trailing dimension.
        batch_probs = model(batch_features).squeeze(1)
        loss = criterion(batch_probs, batch_labels)
        loss.backward()
        optimizer.step()
        # Weight by batch size so the epoch figure is a per-sample mean
        epoch_loss += loss.item() * batch_features.size(0)
    print(f"Epoch {epoch + 1}/{N_EPOCHS} - train loss: {epoch_loss / len(train_data):.4f}")

# Generate Transformer predictions
# eval() switches dropout off; without it the predictions are randomly perturbed
model.eval()
with torch.no_grad():
    test_tensor = torch.tensor(X_test, dtype=torch.float32)
    transformer_probs = model(test_tensor).numpy().flatten()

# XGBoost Model
# Hyper-parameters are collected in one mapping so they can be read (and
# tuned) separately from the fit / predict calls.
xgb_params = {
    "n_estimators": 200,
    "max_depth": 5,
    "learning_rate": 0.1,
    "subsample": 0.8,
    "use_label_encoder": False,
    "eval_metric": 'logloss',
}
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train)

# predict_proba returns one column per class; keep the second column
# (index 1) as the per-row score.
xgb_class_probs = xgb_model.predict_proba(X_test)
xgb_probs = xgb_class_probs[:, 1]

# Stacking with Logistic Regression
# The meta-model must learn from TRAINING labels. Fitting it on y_test and
# then predicting the same rows leaks the test labels into the model and
# makes every reported metric optimistic.
model.eval()  # dropout off while scoring
with torch.no_grad():
    train_tensor = torch.tensor(X_train, dtype=torch.float32)
    # Score the training rows in chunks to keep peak memory bounded
    transformer_train_probs = torch.cat(
        [model(chunk) for chunk in train_tensor.split(4096)]
    ).numpy().flatten()
xgb_train_probs = xgb_model.predict_proba(X_train)[:, 1]

# Meta-features: one column per base model
# NOTE(review): these training-row predictions are in-sample for the base
# models; out-of-fold predictions would give the meta-model a less biased view.
stacked_train_probs = np.column_stack([transformer_train_probs, xgb_train_probs])
stacked_probs = np.column_stack([transformer_probs, xgb_probs])

meta_model = LogisticRegression()
meta_model.fit(stacked_train_probs, y_train)
final_preds = meta_model.predict(stacked_probs)


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Threshold-based metrics are computed from the hard class predictions
accuracy = accuracy_score(y_test, final_preds)
precision = precision_score(y_test, final_preds)
recall = recall_score(y_test, final_preds)
f1 = f1_score(y_test, final_preds)

# AUC-ROC measures ranking quality, so it needs a continuous score. Feeding
# it 0/1 labels collapses the ROC curve to a single operating point.
final_probs = meta_model.predict_proba(stacked_probs)[:, 1]
auc = roc_auc_score(y_test, final_probs)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"AUC-ROC: {auc:.4f}")


# New Section