In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Load the data
inputs = pd.read_csv('inputs.csv')
labels = pd.read_csv('labels.csv')

# Merge on PatientID
data = pd.merge(inputs, labels, on='PatientID')

# Check for missing values and handle them if necessary
data = data.dropna().reset_index(drop=True)

# Split into train and evaluation sets
train_data, eval_data = train_test_split(data, test_size=0.2, random_state=42, stratify=data['HadHeartAttack'])

In [4]:
def create_prompt(row):
    # Example: Convert features into a text description
    prompt = f"The patient is a {row['Sex'].lower()} aged {row['AgeCategory']}. "
    prompt += f"They have a BMI of {row['BMI']:.1f}. "
    prompt += f"Their general health is reported as {row['GeneralHealth'].lower()}. "

    # Include key medical history features
    conditions = []
    if row['HadAngina'] == 1:
        conditions.append('angina')
    if row['HadStroke'] == 1:
        conditions.append('stroke')
    if row['HadAsthma'] == 1:
        conditions.append('asthma')
    if row['HadSkinCancer'] == 1:
        conditions.append('skin cancer')
    if row['HadCOPD'] == 1:
        conditions.append('COPD')
    if row['HadDepressiveDisorder'] == 1:
        conditions.append('depressive disorder')
    if row['HadKidneyDisease'] == 1:
        conditions.append('kidney disease')
    if row['HadArthritis'] == 1:
        conditions.append('arthritis')
    if row['HadDiabetes'] == 'Yes':
        conditions.append('diabetes')
    if row['CovidPos'] == 1:
        conditions.append('COVID-19')

    if conditions:
        prompt += "They have a history of " + ", ".join(conditions) + ". "
    else:
        prompt += "They have no significant medical history. "

    prompt += "Based on this information, is the patient at risk of a heart attack?"

    return prompt

In [5]:
from torch.utils.data import Dataset

class HeartAttackDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=128):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Preprocess data
        self.texts = []
        self.labels = []
        for _, row in self.dataframe.iterrows():
            prompt = create_prompt(row)
            label = row['HadHeartAttack']
            self.texts.append(prompt)
            self.labels.append(label)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        prompt = self.texts[idx]
        label = self.labels[idx]

        # Tokenize the prompt
        inputs = self.tokenizer(
            prompt,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        item = {key: val.squeeze(0) for key, val in inputs.items()}
        item['labels'] = label

        return item

In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = 'distilbert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [7]:
train_dataset = HeartAttackDataset(train_data, tokenizer)
eval_dataset = HeartAttackDataset(eval_data, tokenizer)

In [8]:
import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    probs = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)
    roc_auc = roc_auc_score(labels, probs[:, 1])
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')

    return {
        'roc_auc': roc_auc,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

In [10]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    eval_strategy='epoch',
    save_strategy='epoch',
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model='roc_auc',
    greater_is_better=True,
    fp16=True,  # Enable if you have a GPU that supports it
)

In [14]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [15]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mhsp287[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Roc Auc,Accuracy,Precision,Recall,F1
1,0.2254,0.21859,0.839138,0.944415,0.0,0.0,0.0
2,0.1504,0.168451,0.738353,0.944415,0.0,0.0,0.0
3,0.1717,0.15523,0.872269,0.944415,0.0,0.0,0.0
4,0.1552,0.159107,0.850067,0.947229,0.563015,0.226112,0.322646
5,0.1665,0.154128,0.876503,0.946782,0.591463,0.137654,0.223331


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=47540, training_loss=0.1642354727493498, metrics={'train_runtime': 1926.2579, 'train_samples_per_second': 394.872, 'train_steps_per_second': 24.68, 'total_flos': 2.518950377568e+16, 'train_loss': 0.1642354727493498, 'epoch': 5.0})