In [1]:
!pip install datasets
!pip install peft transformers trl
!pip install bitsandbytes


# Step 2 load packages 
import pandas as pd, torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from datasets import DatasetDict, load_dataset
from peft import get_peft_model, LoraConfig
from trl import SFTTrainer


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-18.1.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Downloading pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.7/57.7 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_1

In [2]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    bnb_4bit_use_double_quant=True, 
    bnb_4bit_quant_type="nf4", 
    bnb_4bit_compute_dtype=torch.bfloat16
)

from huggingface_hub import login
login(token='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')

model_name = "meta-llama/Llama-3.1-8B-Instruct"


In [3]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
) 
tokenizer = AutoTokenizer.from_pretrained(model_name)


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [4]:
dataset = load_dataset('csv', data_files='differential_privacy.csv')

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
train_test = dataset['train'].train_test_split(test_size=0.2, seed=42)
train_dataset, test_dataset = train_test['train'], train_test['test']



In [6]:
EOS_TOKEN = tokenizer.eos_token_id
tokenizer.add_special_tokens({"pad_token": "<|reserved_special_token_0|>"})
model.config.pad_token_id = tokenizer.pad_token_id # updating model config
tokenizer.padding_side = 'right' # padding to right (otherwise SFTTrainer shows warning)


In [7]:
lora_config = LoraConfig(
    r = 16,               # Low-rank dimension
    lora_alpha = 32,     # Alpha Scaling factor
    lora_dropout = 0.1,  # Dropout for stability
    target_modules = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],  # Apply LoRA to attention layers
    task_type = "CAUSAL_LM",
    bias = "none"
)
model = get_peft_model(model, lora_config)


In [8]:
def create_input_text(example):
  input_text = (
f"### Instruction: "
f"Based on the provided patient information, generate a precise and concise medical treatment recommendation."

f"### Context: "
f" Age: {example['age']},"
f" Gender: {example['gender']},"
f" Symptoms: {example['symptoms']},"
f" Diagnoses: {example['diagnoses']}.\n"

f"### Response:"
f" {example['procedures']}."

)
  return input_text


	
# Step 8: Remove the 'procedures' column from input and set 'procedure' as target
# Mapping the labels
train_dataset = train_dataset.map(lambda x: {'input_text': create_input_text(x) })
test_dataset = test_dataset.map(lambda x: {'input_text': create_input_text(x)})


Map:   0%|          | 0/5324 [00:00<?, ? examples/s]

Map:   0%|          | 0/1331 [00:00<?, ? examples/s]

In [9]:
def tokenize_function(examples):
    # Find the length of the longest input text
    longest_input_length = max(len(tokenizer(text)['input_ids']) for text in examples['input_text'])

    # Set the dynamic max length (accounting for EOS token)
    max_length = longest_input_length + 1

    # Tokenize with truncation to max_length
    tokenized_inputs = tokenizer(
        examples['input_text'],
        truncation=True,
        max_length=max_length
    )

    # Append EOS_TOKEN and pad manually
    for input_ids in tokenized_inputs['input_ids']:
        if len(input_ids) < max_length:
            input_ids.append(tokenizer.eos_token_id)  # Add EOS_TOKEN
            input_ids.extend([tokenizer.pad_token_id] * (max_length - len(input_ids)))

    # Adjust attention masks for padding
    for attention_mask in tokenized_inputs['attention_mask']:
        if len(attention_mask) < max_length:
            attention_mask.append(1)  # EOS_TOKEN is part of the sequence
            attention_mask.extend([0] * (max_length - len(attention_mask)))

    return tokenized_inputs



# Step 10: map the tokenized
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/5324 [00:00<?, ? examples/s]

Map:   0%|          | 0/1331 [00:00<?, ? examples/s]

In [10]:
!pip install wandb
import wandb

# Replace with your actual API key
wandb.login(key="05e1d78b73f3d420dc0562e99cd3ba0dd50e8d22")




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting wandb
  Downloading wandb-0.19.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting click!=8.0.0,>=7.1 (from wandb)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting protobuf!=4.21.0,!=5.28.0,<6,>=3.19.0 (from wandb)
  Downloading protobuf-5.29.2-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Collecting pydantic<3,>=2.6 (from wandb)
  Downloading pydantic-2.10.4-py3-none-any.whl.metadata (29 kB)
Collecting sentry-sdk>=2.0.0 (from wandb)
  Downloading sentry_sdk-2.19.2-py2.py3-none-any.whl.metadata (9.9 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.4-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mahsunny15[0m ([33mahsunny155[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [11]:
training_args = TrainingArguments(
    output_dir = './results',            
    eval_strategy = "epoch",
    save_strategy="epoch",
    learning_rate = 2e-4,
    per_device_train_batch_size = 2,
    per_device_eval_batch_size = 2,
    num_train_epochs = 2,
    gradient_accumulation_steps=2,
    weight_decay = 0.01, 
    logging_dir = './logs',
    logging_steps = 100,
    save_total_limit = 3,
    prediction_loss_only = False,   
    warmup_ratio=0.05,
    report_to = "wandb",       
    run_name="Llama-3.1-8b-Differential-Privacy",
    do_train = True,        # enable training
    fp16  = True,         #use mixed precision
    push_to_hub = True
)

# Step 12: Initialize the Trainer
trainer = SFTTrainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_train,
    eval_dataset = tokenized_test,
    tokenizer = tokenizer,
    
)

# Step 13: Train the model with PEFT using LoRA
trainer.train()


  trainer = SFTTrainer(


Epoch,Training Loss,Validation Loss
1,0.3793,0.391028
2,0.2976,0.355344


No files have been modified since last commit. Skipping to prevent empty commit.


TrainOutput(global_step=2662, training_loss=0.4177844515427175, metrics={'train_runtime': 1489.9346, 'train_samples_per_second': 7.147, 'train_steps_per_second': 1.787, 'total_flos': 5.29722191572992e+16, 'train_loss': 0.4177844515427175, 'epoch': 2.0})

In [12]:
# Evaluate the model on the test dataset
eval_results = trainer.evaluate(eval_dataset=tokenized_test)

# Extract the loss and compute perplexity
eval_loss = eval_results["eval_loss"]
perplexity = torch.exp(torch.tensor(eval_loss))
print(f"Perplexity: {perplexity.item():.2f}")


Perplexity: 1.43


In [13]:
import random

# Generate 100 unique random indices
random_indices = random.sample(range(len(test_dataset)), 100)

sampled_dataset = test_dataset.select(random_indices)



In [17]:
from tqdm import tqdm
import torch

def generate_predictions(model, tokenizer, dataset, output_csv):
    """
    Generate predictions for the given dataset and save them to a CSV file.

    Args:
        model: The fine-tuned model.
        tokenizer: The tokenizer used with the model.
        dataset: The dataset containing 'input_text'.
        output_csv: File path to save the predictions in CSV format.
    """
    model.eval()  # Set the model to evaluation mode
    results = []  # List to store results

    for example in tqdm(dataset, desc="Generating predictions"):
        input_text = example['input_text']  # Extract the input text
        
        # Tokenize input text
        tokenized_input = tokenizer(
            input_text,
            return_tensors="pt",
            truncation=True,
            max_length=256,  # Adjust as per your max input length
            padding="max_length"
        )
        
        # Move inputs to the model's device
        input_ids = tokenized_input["input_ids"].to(model.device)
        attention_mask = tokenized_input["attention_mask"].to(model.device)
        
        # Generate predictions
        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                max_new_tokens=50,  # Adjust max tokens for generated output
                temperature=0.4,  # Control coherence
                top_p=0.8,  # Control diversity
                repetition_penalty=1.5,  # Penalize repetition
                early_stopping=True  # Stop early at EOS token
            )
        
        # Decode the generated text
        predicted_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predicted_text = predicted_output[len(input_text):].strip()  # Removing the input text portion

        
        # Append input and predicted output to results
        results.append({"input_text": input_text, "predicted_output": predicted_text	})
    
    # Save results to a DataFrame and then to a CSV file
    results_df = pd.DataFrame(results)
    results_df.to_csv(output_csv, index=False)
    print(f"Predictions saved to {output_csv}")

# Call the function
output_csv_path = "sampled_dataset_predictions.csv"
generate_predictions(model, tokenizer, sampled_dataset, output_csv_path)


Generating predictions: 100%|██████████| 100/100 [00:30<00:00,  3.23it/s]

Predictions saved to sampled_dataset_predictions.csv





In [16]:
!pip install evaluate
!pip install rouge_score
from evaluate import load
rouge_metric = load("rouge")

def calculate_rouge(model, tokenizer, dataset):
    """
    Evaluate the model using the ROUGE metric.
    
    Args:
        model: The fine-tuned model.
        tokenizer: The tokenizer used with the model.
        dataset: The tokenized dataset with 'input_text'.
    
    Returns:
        ROUGE scores (ROUGE-1, ROUGE-2, ROUGE-L).
    """
    model.eval()  # Set the model to evaluation mode
    predictions = []
    references = []  # We will extract references from 'input_text'

    for example in dataset:
        # Tokenize input text
        input_text = example['input_text']
        tokenized_input = tokenizer(
            input_text,
            return_tensors="pt",
            truncation=True,
            max_length=256,  # Keep this for input truncation
            padding="max_length"
        )
        
        # Move inputs to the model's device
        input_ids = tokenized_input["input_ids"].to(model.device)
        attention_mask = tokenized_input["attention_mask"].to(model.device)

        # Generate predictions
        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,  # Use input_ids here
                attention_mask=attention_mask,  # Use attention_mask here
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                max_new_tokens=50, # add max_new_tokens to control the generated length
                temperature = 0.6,                        # Lower temperature for coherence
                top_p = 0.8,                            # Control diversity
                early_stopping=True,                    # Continue until EOS token or max length
                repetition_penalty = 1.5 
            )
        
        # Decode the generated and reference sequences
        predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Extract the reference text from 'input_text'
        # Assuming the reference is after "### Response:"
        reference_text = input_text.split("### Response:")[-1].strip()

        # Store predictions and references for evaluation
        predictions.append(predicted_text)
        references.append(reference_text)

    # Compute ROUGE
    rouge_scores = rouge_metric.compute(predictions=predictions, references=references)
    return rouge_scores

# Call the function on the test dataset
rouge_scores = calculate_rouge(model, tokenizer, tokenized_test)
print(f"ROUGE-1: {rouge_scores['rouge1']:.4f}")
print(f"ROUGE-2: {rouge_scores['rouge2']:.4f}")
print(f"ROUGE-L: {rouge_scores['rougeL']:.4f}")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting absl-py (from rouge_score)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting nltk (from rouge_score)
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting joblib (from nltk->rouge_score)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Downloading absl_py-2.1.0-py3-none-any.whl (133 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.7/133.7 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m74.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading joblib-1.4.2-py3-none-any.whl (301 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.8/301.8 kB[0m [31m125.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_sc

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

ROUGE-1: 0.2966
ROUGE-2: 0.2678
ROUGE-L: 0.2966


In [18]:
def predict(input_text):
    # Tokenize the input text
    device = "cuda:0"
    
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=256).to(device) #moving the model to the same device

# Generate predictions
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,    # Signal end-of-sequence
        max_new_tokens=50,
        temperature = 0.6,                        # Lower temperature for coherence
        top_p = 0.8,  
        early_stopping=True,                    # Continue until EOS token or max length
        repetition_penalty = 1.5                # Penalize repetitions (higher = stricter penalty) tokens
    )
    # Decode the generated text
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract the procedure prediction
    procedure = prediction[len(input_text):].strip()  # Removing the input text portion
    return procedure






In [31]:
example = {
    'age': 55, 
    'gender': 'Female', 
    'symptoms': 'chest pain, shortness of breath', 
    'diagnoses': 'coronary artery disease',
    'procedures': ' '
}

input_text= create_input_text(example)
# Generate the procedure prediction
predicted_procedure = predict(input_text)
print("Predicted Procedure:", predicted_procedure)



Predicted Procedure: Other cardiac stress test using treadmill or walking machine as exercise device [CST].
