# Installing necessary libraries

In [1]:
# !pip install --upgrade torch torchvision

In [2]:
# !pip install "transformers" "datasets[s3]==2.13.0" "pandas>=2.0.0" "sagemaker>=2.190.0" "gradio==3.50.2"  --upgrade --quiet

In [1]:
!pip install accelerate peft bitsandbytes



In [2]:
# !pip install --upgrade datasets

# Logging into Hugging Face

In [3]:
!huggingface-cli login --token "xxx"

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `fine-tuning` has been saved to /home/sagemaker-user/.cache/huggingface/stored_tokens
Your token has been saved to /home/sagemaker-user/.cache/huggingface/token
Login successful.
The current active token is: `fine-tuning`


# Importing required libraries

In [4]:
import sagemaker
import boto3

import pandas as pd
from io import StringIO
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, pipeline
from peft import LoraModel, get_peft_model, LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import Trainer, TrainingArguments, BitsAndBytesConfig

pd.set_option('max_colwidth', 1000)



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


2025-03-27 21:05:06.952262: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-27 21:05:07.215790: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-27 21:05:07.275781: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-27 21:05:07.293381: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-27 21:05:07.568516: I tensorflow/core/platform/cpu_feature_guar

# Initializing Sagemaker Session and IAM Role

In [5]:
sess = sagemaker.Session()
sagemaker_session_bucket = None
if sagemaker_session_bucket is None and sess is not None:
    sagemaker_session_bucket = sess.default_bucket()

try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='AmazonSageMaker-ExecutionRole-20250124T132142')['Role']['Arn']

sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}") 

sagemaker role arn: arn:aws:iam::637423395717:role/service-role/AmazonSageMaker-ExecutionRole-20250324T120618
sagemaker bucket: sagemaker-us-east-1-637423395717
sagemaker session region: us-east-1


# Getting Data from S3 Bucket

In [6]:
s3 = boto3.client('s3')

bucket_name = 'sagemaker-bucket-fine-tuning'
train_file_key = 'data_for_gdf_mapping.csv'  
gdf_file_key = 'gdf_master_file.csv'

response = s3.get_object(Bucket=bucket_name, Key=train_file_key)
response_1 = s3.get_object(Bucket=bucket_name, Key=gdf_file_key)

csv_content = response['Body'].read().decode('utf-8')
csv_content_1 = response_1['Body'].read().decode('ISO-8859-1')
train_data = pd.read_csv(StringIO(csv_content))
gdf_master_data = pd.read_csv(StringIO(csv_content_1))           

# Formatting Dataset into Instruction, Context and Response

In [42]:
# Clean column names by stripping whitespace and ensuring they're in lowercase
train_data.columns = train_data.columns.str.lower().str.strip()        
gdf_master_data.columns = gdf_master_data.columns.str.lower().str.strip()

# Function to create instruction, context, and response in the required format
def create_instructions(data_row, gdf_data):
    raw_field = data_row['raw_field']
    raw_desc = data_row['raw_desc']
    gdf_field = data_row['gdf_field']

    # Ensure consistent casing and strip extra spaces
    raw_field = str(raw_field).lower().strip()
    gdf_field = str(gdf_field).lower().strip()

    # Get GDF description from the gdf_master_data based on the gdf_field
    gdf_desc = gdf_data[gdf_data['gdf_field'].str.lower().str.strip() == gdf_field]['gdf_desc'].values

    if len(gdf_desc) == 0:
        gdf_desc = "Description not found."
    else:
        gdf_desc = gdf_desc[0]  # In case there are multiple matches, take the first one.

    # Construct the instruction
    instruction = f"You are an expert in mapping medical data fields. Your task is to map a raw field from a dataset to a GDF field, which is a standardized field used by our organization. You can use the raw field description and the GDF field description for context. Ensure you map the raw field to the appropriate GDF field accurately. Raw field: {raw_field}. Raw field description: {raw_desc}. GDF field: {gdf_field}. GDF field description: {gdf_desc}."

    # Context: Provide both the raw description and GDF description for additional context
    context = f"Raw field: {raw_field}\nRaw field description: {raw_desc}\nGDF field: {gdf_field}\nGDF field description: {gdf_desc}"

    # Response based on the structure you've shown
    response = f"Can you map the raw field '{raw_field}' to the correct GDF field?."

    return {
        'instruction': instruction,
        'context': context,
        'response': response
    }

# Apply the function row by row to create the instructions, context, and response
formatted_data = train_data.apply(lambda row: create_instructions(row, gdf_master_data), axis=1)

# Convert the result into a list of formatted data
formatted_data_list = formatted_data.tolist()

# Create a DataFrame from the formatted data
formatted_df = pd.DataFrame(formatted_data_list)

# Function to format for inference, providing the system prompt, user prompt, and response prompt
def format_for_inference(row):
    system_prompt = f"<|start_header_id|>system<|end_header_id|>\n{row['instruction']}"
    user_prompt = f"<|start_header_id|>user<|end_header_id|>\n{row['context']}"
    response_prompt = f"<|start_header_id|>assistant<|end_header_id|>\n{row['response']}"

    return {
        'system_prompt': system_prompt,
        'user_prompt': user_prompt,
        'response_prompt': response_prompt
    }

# Apply the format_for_inference function to each row to create the final formatted output
formatted_inference_data = formatted_df.apply(lambda row: format_for_inference(row), axis=1)

# Convert the result into a list of formatted inference data
formatted_inference_list = formatted_inference_data.tolist()

# Create a DataFrame from the formatted inference data
formatted_inference_df = pd.DataFrame(formatted_inference_list)

# Save or print the first few rows of the formatted inference data
# formatted_inference_df.to_csv('formatted_inference_data.csv', index=False)
print(formatted_inference_df.head())

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           system_prompt  \
0                                                                         <|start_header_id|>system<|end_header_id|>\nYou are an expert in mapping medical data fields. Your task is to map a raw field from a dataset to a GDF field, which is a standardized field used by our organization. You can use the raw field description and the GDF field descr

# Splitting into Train, Validation & Test Datasets

In [44]:
train_data, test_data = train_test_split(formatted_inference_df, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.1, random_state=42)

train_data.to_csv('train_data.csv', index=False)
val_data.to_csv('val_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)

In [39]:
# train_data.columns = train_data.columns.str.strip()
# gdf_master_data.columns = gdf_master_data.columns.str.strip()

# def create_instructions(data_row, gdf_data):
#     raw_field = data_row['raw_field']
#     raw_desc = data_row['raw_desc']
#     gdf_field = data_row['gdf_field']

#     gdf_field = str(gdf_field).strip()

#     gdf_desc = gdf_data[gdf_data['gdf_field'].str.strip() == gdf_field]['gdf_desc'].values

#     if len(gdf_desc) == 0:
#         gdf_desc = "Description not found."

#     # Instruction: "Map the raw field (example 'dt') to a standardized GDF field"
#     instruction = f"Map the raw field '{raw_field}' to a standardized GDF field."

#     # Context: Include raw description and GDF description for context
#     context = f"Raw field description: {raw_desc}\nGDF field: {gdf_field}\nGDF field description: {gdf_desc[0]}"

#     response = f"The raw field '{raw_field}' should be mapped to the GDF field '{gdf_field}'."

#     return {
#         'instruction': instruction,
#         'context': context,
#         'response': response
#     }

# formatted_data = train_data.apply(lambda row: create_instructions(train_data, gdf_master_data), axis=1)

# formatted_data_list = formatted_data.tolist()

# formatted_df = pd.DataFrame(formatted_data_list)
# print(formatted_df.head())

# train_data, test_data = train_test_split(formatted_df, test_size=0.2, random_state=42)
# train_data, val_data = train_test_split(train_data, test_size=0.1, random_state=42)

# train_data.to_csv('train_data.csv', index=False)
# val_data.to_csv('val_data.csv', index=False)
# test_data.to_csv('test_data.csv', index=False)

In [26]:
bnb_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_threshold=6.0)

# Tokenizing the Datasets for Training

In [27]:
train_dataset = load_dataset('csv', data_files='train_data.csv', split='train')
val_dataset = load_dataset('csv', data_files='val_data.csv', split='train')
test_dataset = load_dataset('csv', data_files='test_data.csv', split='train')

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-3.2-3B-Instruct')
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the datasets
def tokenize_function(examples):
    inputs = tokenizer(examples['instruction'], examples['context'], truncation=True, padding="max_length", max_length=512)
    inputs['labels'] = inputs['input_ids'].copy()
    return inputs

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# # Set format for PyTorch (or TensorFlow)
# train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'response'])
# val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'response'])
# test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'response'])

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Downloading and preparing dataset csv/default to /home/sagemaker-user/.cache/huggingface/datasets/csv/default-1fbd708ae141654d/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

# Training Llama 3.2 3B Instruct Model

In [15]:
# Load the Llama model for causal language modeling
model = AutoModelForCausalLM.from_pretrained('meta-llama/Llama-3.2-3B-Instruct', quantization_config=bnb_config)

# Define LoRA Configuration
lora_config = LoraConfig(
    r=8,  # LoRA rank
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",  
    task_type="CAUSAL_LM", 
)

lora_model = get_peft_model(model, lora_config)

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch", 
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

trainer.train()

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
1,0.0314,0.030816
2,0.021,0.021036
3,0.0208,0.020876


TrainOutput(global_step=2541, training_loss=0.14858478229802244, metrics={'train_runtime': 2449.6873, 'train_samples_per_second': 2.073, 'train_steps_per_second': 1.037, 'total_flos': 4.401582492981658e+16, 'train_loss': 0.14858478229802244, 'epoch': 3.0})

In [16]:
results = trainer.evaluate(test_dataset)

print(f"Test Results: {results}")

Test Results: {'eval_loss': 0.020876113325357437, 'eval_runtime': 89.4046, 'eval_samples_per_second': 5.268, 'eval_steps_per_second': 2.64, 'epoch': 3.0}


# Saving the Model Locally & in S3 Bucket

In [24]:
model_save_path = './fine_tuned_model' 

model.save_pretrained(model_save_path)

tokenizer.save_pretrained(model_save_path)

('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/tokenizer.json')

In [46]:
import os
import boto3
from pathlib import Path

s3_client = boto3.client('s3')

bucket_name = 'sagemaker-bucket-fine-tuning'
s3_folder = 'llama_3.2_3B_Instruct_finetuned' 

local_model_dir = './fine_tuned_model'

for root, dirs, files in os.walk(local_model_dir):
    for file in files:
        local_file_path = os.path.join(root, file)
        s3_key = os.path.join(s3_folder, Path(local_file_path).relative_to(local_model_dir))

        print(f"Uploading {local_file_path} to s3://{bucket_name}/{s3_key}")
        s3_client.upload_file(local_file_path, bucket_name, s3_key)

Uploading ./fine_tuned_model/config.json to s3://sagemaker-bucket-fine-tuning/llama_3.2_3B_Instruct_finetuned/config.json
Uploading ./fine_tuned_model/generation_config.json to s3://sagemaker-bucket-fine-tuning/llama_3.2_3B_Instruct_finetuned/generation_config.json
Uploading ./fine_tuned_model/model.safetensors to s3://sagemaker-bucket-fine-tuning/llama_3.2_3B_Instruct_finetuned/model.safetensors
Uploading ./fine_tuned_model/tokenizer_config.json to s3://sagemaker-bucket-fine-tuning/llama_3.2_3B_Instruct_finetuned/tokenizer_config.json
Uploading ./fine_tuned_model/special_tokens_map.json to s3://sagemaker-bucket-fine-tuning/llama_3.2_3B_Instruct_finetuned/special_tokens_map.json
Uploading ./fine_tuned_model/tokenizer.json to s3://sagemaker-bucket-fine-tuning/llama_3.2_3B_Instruct_finetuned/tokenizer.json


# Loading the Model from S3 Bucket

In [47]:
s3_client = boto3.client('s3')

bucket_name = 'sagemaker-bucket-fine-tuning'
s3_folder = 'llama_3.2_3B_Instruct_finetuned'  

local_model_dir = './test'

os.makedirs(local_model_dir, exist_ok=True)

files = [
    'generation_config.json',   
    'config.json',       
    'tokenizer.json',          
    'tokenizer_config.json', 
    'special_tokens_map.json',
    'model.safetensors',
]

# Download the files from S3 to the local directory
for file in files:
    local_file_path = os.path.join(local_model_dir, file)
    s3_key = os.path.join(s3_folder, file)

    print(f"Downloading {s3_key} from S3 to {local_file_path}")
    s3_client.download_file(bucket_name, s3_key, local_file_path)

Downloading llama_3.2_3B_Instruct_finetuned/generation_config.json from S3 to ./test/generation_config.json
Downloading llama_3.2_3B_Instruct_finetuned/config.json from S3 to ./test/config.json
Downloading llama_3.2_3B_Instruct_finetuned/tokenizer.json from S3 to ./test/tokenizer.json
Downloading llama_3.2_3B_Instruct_finetuned/tokenizer_config.json from S3 to ./test/tokenizer_config.json
Downloading llama_3.2_3B_Instruct_finetuned/special_tokens_map.json from S3 to ./test/special_tokens_map.json
Downloading llama_3.2_3B_Instruct_finetuned/model.safetensors from S3 to ./test/model.safetensors


# Making predictions on Test Data for Evaluation

In [41]:
from torch.utils.data import DataLoader
import torch

def predict_in_batches(test_dataset, batch_size=8):
    all_predictions = []
    all_true_labels = []
    
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

    for batch in test_dataloader:
        batch = {k: v.to(model.device) for k, v in batch.items()}
        
        model.eval()

        with torch.no_grad(): 
            outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])

            logits = outputs.logits
            predicted_labels = logits.argmax(dim=-1)
            
            true_labels = batch.get('labels', None)
            if true_labels is not None:
                true_labels = true_labels

            # Store predictions and true labels
            all_predictions.extend(predicted_labels.cpu().numpy())
            if true_labels is not None:
                all_true_labels.extend(true_labels.cpu().numpy())

    return all_predictions, all_true_labels

# Make predictions on the test set in smaller batches
predicted_labels, true_labels = predict_in_batches(test_dataset, batch_size=4)

# Print a few predictions vs. true labels
for i in range(10):  # Display the first 10 samples
    print(f"Prediction: {predicted_labels[i]}, True Label: {true_labels[i]}")

Prediction: [   791    279   7257   2115    364     15   1733   2759    609    198
     16    286  12751    355    479  39329   4229    198     17    996
  28174  39329   4229    198     18    996  28174  39329   4229    198
     19    996  28174  39329   4229    198    338   2564  16554  11727
     23   1881   2605  13275   1292    198  11727     24    996   2605
  28224    535   4257    198  12422     15    338   4751   4257    198
  12422     16    415   2605   4257   2769  29579    198  12422     17
    415  13072  30618  21640    198    678     25   7257   5121     11
  17736     25    220  12422     18     11  13534     25   1665      6
    311    264  51114    480   5375   2115     13 128000  20613   2115
   4096     25    220     15   6794  33278    198     16   6794  33278
    198     17    996   2944   2082    369    279  28174    198     18
    996   2944   2082    369    279  28174    198     19    996   2944
   2082    369    279  28174    198   3909   2564  27381  11727  

In [42]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Flatten the lists in case the predictions and true labels are 2D arrays
predicted_labels_flat = [label for sublist in predicted_labels for label in sublist]
true_labels_flat = [label for sublist in true_labels for label in sublist]

# Calculate Accuracy
accuracy = accuracy_score(true_labels_flat, predicted_labels_flat)
print(f"Accuracy: {accuracy:.4f}")

# Calculate Precision, Recall, and F1 score
precision, recall, f1, _ = precision_recall_fscore_support(true_labels_flat, predicted_labels_flat, average='macro')

# You can also calculate these metrics per class by setting `average=None`
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.3984
Precision: 0.0092
Recall: 0.0093
F1 Score: 0.0092


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
