# Installing necessary libraries

In [1]:
!pip install accelerate peft bitsandbytes



# Logging into Hugging Face

In [2]:
!huggingface-cli login --token "xxx"

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `fine-tuning` has been saved to /home/sagemaker-user/.cache/huggingface/stored_tokens
Your token has been saved to /home/sagemaker-user/.cache/huggingface/token
Login successful.
The current active token is: `fine-tuning`


# Importing required libraries

In [3]:
import sagemaker
import boto3

import pandas as pd
from io import StringIO
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, pipeline
from peft import LoraModel, get_peft_model, LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import Trainer, TrainingArguments, BitsAndBytesConfig

pd.set_option('max_colwidth', 1000)



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


2025-03-27 22:21:34.177275: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-27 22:21:34.190147: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-27 22:21:34.208749: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-27 22:21:34.214494: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-27 22:21:34.227445: I tensorflow/core/platform/cpu_feature_guar

# Initializing Sagemaker Session and IAM Role

In [4]:
sess = sagemaker.Session()
sagemaker_session_bucket = None
if sagemaker_session_bucket is None and sess is not None:
    sagemaker_session_bucket = sess.default_bucket()

try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='AmazonSageMaker-ExecutionRole-20250124T132142')['Role']['Arn']

sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}") 

sagemaker role arn: arn:aws:iam::637423395717:role/service-role/AmazonSageMaker-ExecutionRole-20250324T120618
sagemaker bucket: sagemaker-us-east-1-637423395717
sagemaker session region: us-east-1


# Getting Data from S3 Bucket

In [5]:
s3 = boto3.client('s3')

bucket_name = 'sagemaker-bucket-fine-tuning'
train_file_key = 'data_for_gdf_mapping.csv'  
gdf_file_key = 'gdf_master_file.csv'

response = s3.get_object(Bucket=bucket_name, Key=train_file_key)
response_1 = s3.get_object(Bucket=bucket_name, Key=gdf_file_key)

csv_content = response['Body'].read().decode('utf-8')
csv_content_1 = response_1['Body'].read().decode('ISO-8859-1')
train_df = pd.read_csv(StringIO(csv_content))
gdf_master_data = pd.read_csv(StringIO(csv_content_1))           

In [6]:
# del model
# del trainer
import torch
torch.cuda.empty_cache()

# Performing Inference on the Model to get Baseline Results

In [12]:
model_name = "meta-llama/Llama-3.2-1B-Instruct" 
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

system_prompt = f"<|start_header_id|>system<|end_header_id|>\nYou are an expert in mapping medical data fields. The task is to map a raw field to a standardized GDF field used by our organization. You can get more context about the fields from the raw field description and gdf field description. If the raw field descrption is NaN or is not given or is empty, then try to understand what the raw_field might contain from its name and then try mapping it to the appropriate GDF field. The list of gdf fields used within the organization, along with thier description is in {gdf_master_data}. Take your time, understand the raw_field to be mapped and then based on the infromation you have map the raw field to the appropriate gdf field correctly."
user_prompt = "<|start_header_id|>user<|end_header_id|>\n Raw field: clvc_amount_05. This field represents a Client defined field. Please map it to the appropriate GDF field."

input_text = system_prompt + "\n" + user_prompt

generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

baseline_result = generator(input_text, truncation=True, padding=True, max_new_tokens=1000, num_return_sequences=1)
# inputs = tokenizer(input_text, truncation=True, padding=True, max_length=1024, return_tensors="pt")
# output = model.generate(**inputs, max_length=1000, num_beams=5, early_stopping=True)

# decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
# print(f"Inference Result: {decoded_output}")

print("Baseline Inference Result:")
print(baseline_result[0]['generated_text'])

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Device set to use cuda:0


Baseline Inference Result:
<|start_header_id|>system<|end_header_id|>
You are an expert in mapping medical data fields. The task is to map a raw field to a standardized GDF field used by our organization. You can get more context about the fields from the raw field description and gdf field description. If the raw field descrption is NaN or is not given or is empty, then try to understand what the raw_field might contain from its name and then try mapping it to the appropriate GDF field. The list of gdf fields used within the organization, along with thier description is in                           file                         gdf_field  \
0                        Claim  CC_CLAIM_FINAL_VERSION_INDICATOR   
1                        Claim               CC_CUSTOM_AMOUNT_01   
2                        Claim               CC_CUSTOM_AMOUNT_02   
3                        Claim               CC_CUSTOM_AMOUNT_03   
4                        Claim               CC_CUSTOM_AMOUNT_04   
...        

# Training the Model on RCA-14 Data

In [None]:
from datasets import Dataset

model_name = "meta-llama/Llama-3.2-1B-Instruct" 
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

df = pd.DataFrame(train_df)
df['raw_desc'] = df['raw_desc'].fillna('')
df['text'] = df['raw_field'] + " " + df['raw_desc']

# Prepare the inputs (X) and outputs (y)
X = df['text'].tolist()
y = df['gdf_field'].tolist() 

# Map labels to integers (for classification)
label_map = {label: i for i, label in enumerate(set(y))}
df['label'] = df['gdf_field'].map(label_map)

train_texts, test_texts, train_labels, test_labels = train_test_split(X, df['label'], test_size=0.2)

train_dataset = Dataset.from_dict({'text': train_texts, 'label': train_labels.tolist()})
test_dataset = Dataset.from_dict({'text': test_texts, 'label': test_labels.tolist()})

def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]