### Summary

This notebook handles the fine-tuning of the GPT-3.5 and PaLM LLMs. I have created a mini dataset from the original dataset, in the format expected by the OpenAI and PaLM APIs, to fine-tune the models.

In [None]:
# %pip installs into the environment of the running kernel, whereas !pip3 uses
# whichever pip3 happens to be first on PATH.
# This notebook calls the pre-1.0 OpenAI SDK interface (openai.File,
# openai.FineTuningJob, openai.ChatCompletion), so the SDK is pinned below 1.0.
%pip install "openai>=0.28,<1"
%pip install numpy
%pip install tiktoken
%pip install Gradio

In [49]:
import openai
import csv
import json
import os
import numpy as np
from collections import defaultdict
import tiktoken
import gradio as gr
import configparser

In [48]:
# Read API keys from a config file.
# The location can be overridden with the DEID_CONFIG_PATH environment variable;
# when it is not set, the original hard-coded path is used.
config_path = os.environ.get('DEID_CONFIG_PATH', '/Users/yashwanthys/Desktop/config.ini')
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files it
# actually parsed, so an empty result means the config file was not found.
if not config.read(config_path):
    raise FileNotFoundError(f"Could not read config file at '{config_path}'")
openai.api_key = config['API_KEYS']['OPENAI_API_KEY']

In [39]:
# clean up the mini dataset and convert to jsonl format
def convert_csv_to_jsonl(csv_file_path, jsonl_file_path) -> None:
    """Convert a CSV file whose cells contain JSON text into a JSONL file.

    Every cell of every row is cleaned and parsed as JSON on its own. Each cell
    that parses becomes one line of the output file, in reading order. Cells
    that fail to parse are reported on stdout and skipped, so a malformed cell
    does not abort the conversion.

    Args:
        csv_file_path: Path of the CSV file to read (UTF-8, a leading BOM is
            tolerated).
        jsonl_file_path: Path of the JSONL file to write; an existing file is
            overwritten.

    Returns:
        None. The result is the file written at ``jsonl_file_path``.
    """
    cleaned_data = []

    # newline='' hands raw line endings to the csv module, as its documentation
    # requires, so newlines embedded in quoted fields are interpreted correctly.
    with open(csv_file_path, 'r', encoding='utf-8-sig', newline='') as file:
        for row in csv.reader(file):
            for cell in row:
                try:
                    # Replace square brackets and inner double quotes that are problematic
                    cell = cell.replace('["', '').replace('"]', '').replace('\\"', '"')

                    # Parse the cleaned cell as JSON and keep it only if that succeeds
                    cleaned_data.append(json.loads(cell))
                except json.JSONDecodeError as e:
                    print(f"JSON decode error for cell '{cell}': {e}")

    # Write cleaned data to a JSONL file, one JSON document per line
    with open(jsonl_file_path, 'w', encoding='utf-8') as jsonl_file:
        for item in cleaned_data:
            jsonl_file.write(json.dumps(item) + '\n')

    print(f"Conversion complete. JSONL file created at '{jsonl_file_path}'.")

### Fine-tuning of the GPT-3.5 model with a brief prompt

In [42]:
# Brief-prompt datasets: each cleaned-up CSV is paired with the JSONL file generated from it.
# (The "breifprompt" spelling is kept as-is because the upload cells use these exact names.)
briefprompt_training_file_path, breifprompt_jsonl_training_file_path = (
    '/Users/yashwanthys/PersonalProjects/ML_Proj/De-Identification/fine_tuning/TestFineTune.csv',
    '/Users/yashwanthys/PersonalProjects/ML_Proj/De-Identification/fine_tuning/TestFineTune.jsonl',
)
briefprompt_validation_file_path, breifprompt_jsonl_validation_file_path = (
    '/Users/yashwanthys/PersonalProjects/ML_Proj/De-Identification/fine_tuning/Validate.csv',
    '/Users/yashwanthys/PersonalProjects/ML_Proj/De-Identification/fine_tuning/Validate.jsonl',
)

# Convert the training set first, then the validation set
for csv_path, jsonl_path in (
    (briefprompt_training_file_path, breifprompt_jsonl_training_file_path),
    (briefprompt_validation_file_path, breifprompt_jsonl_validation_file_path),
):
    convert_csv_to_jsonl(csv_path, jsonl_path)

Conversion complete. JSONL file created at '/Users/yashwanthys/PersonalProjects/ML_Proj/De-Identification/fine_tuning/TestFineTune.jsonl'.
Conversion complete. JSONL file created at '/Users/yashwanthys/PersonalProjects/ML_Proj/De-Identification/fine_tuning/Validate.jsonl'.


In [44]:
# Upload the training data for fine-tuning

# Open the JSONL file in a with-block so the handle is closed once the upload call returns
with open(breifprompt_jsonl_training_file_path, "rb") as training_file:
    training_response = openai.File.create(file=training_file, purpose="fine-tune")
training_file_id = training_response["id"]

# Gives training file id
print("Training file id:", training_file_id)

Training file id: file-bNOA26nA1fBN8SD4XAYPr5Yf


In [43]:
# Upload the validation data for fine-tuning

# Open the JSONL file in a with-block so the handle is closed once the upload call returns
with open(breifprompt_jsonl_validation_file_path, "rb") as validation_file:
    validation_response = openai.File.create(file=validation_file, purpose="fine-tune")
# Take the id from the validation upload itself; reading it from training_response
# would make the fine-tuning job validate on the training file.
validation_file_id = validation_response["id"]

# Gives validation file id
print("Validation file id:", validation_file_id)

Validation file id: file-Nc5xQOVOMJeXgNE8OTTk9O2t


In [51]:
# Launch the fine-tuning job for the brief-prompt dataset
suffix_name = "briefprompt-deid2"

# Collect the job settings in one place, then pass them as keyword arguments
job_settings = {
    "training_file": training_file_id,
    "validation_file": validation_file_id,
    "model": "gpt-3.5-turbo",
    "suffix": suffix_name,
    "hyperparameters": {"n_epochs": 3},
}
response = openai.FineTuningJob.create(**job_settings)

# Keep the job id: the event-listing and retrieval cells look the job up by it
job_id = response["id"]

print(response)

{
  "object": "fine_tuning.job",
  "id": "ftjob-E3Ha3XDcNDBaHaEck785c2yB",
  "model": "gpt-3.5-turbo-0613",
  "created_at": 1699266149,
  "finished_at": null,
  "fine_tuned_model": null,
  "organization_id": "org-y9pJa2gk1YU06Wi6jGNSdSWm",
  "result_files": [],
  "status": "validating_files",
  "validation_file": "file-Nc5xQOVOMJeXgNE8OTTk9O2t",
  "training_file": "file-bNOA26nA1fBN8SD4XAYPr5Yf",
  "hyperparameters": {
    "n_epochs": 3
  },
  "trained_tokens": null,
  "error": null
}


In [53]:
# Print the fine-tuning job's event messages (limit=50 is passed to the listing call)
response = openai.FineTuningJob.list_events(id=job_id, limit=50)

events = response["data"]
events.reverse()  # flip the returned order in place before printing

for message in (event["message"] for event in events):
    print(message)

Step 43/90: training loss=0.01, validation loss=0.00
Step 44/90: training loss=0.00, validation loss=0.00
Step 45/90: training loss=0.00, validation loss=0.02
Step 46/90: training loss=0.00, validation loss=0.00
Step 47/90: training loss=0.00, validation loss=0.00
Step 48/90: training loss=0.01, validation loss=0.00
Step 49/90: training loss=0.01, validation loss=0.01
Step 50/90: training loss=0.01, validation loss=0.00
Step 51/90: training loss=0.00, validation loss=0.00
Step 52/90: training loss=0.00, validation loss=0.00
Step 53/90: training loss=0.00, validation loss=0.00
Step 54/90: training loss=0.00, validation loss=0.00
Step 55/90: training loss=0.01, validation loss=0.00
Step 56/90: training loss=0.00, validation loss=0.00
Step 57/90: training loss=0.02, validation loss=0.00
Step 58/90: training loss=0.00, validation loss=0.00
Step 59/90: training loss=0.00, validation loss=0.00
Step 60/90: training loss=0.00, validation loss=0.00
Step 61/90: training loss=0.00, validation los

In [54]:
# Look the job up again and show its full record
response = openai.FineTuningJob.retrieve(job_id)
print(response)

# The "fine_tuned_model" field carries the id used for the completion request
fine_tuned_model_id = response["fine_tuned_model"]
print("\nFine-tuned model id:", fine_tuned_model_id)

{
  "object": "fine_tuning.job",
  "id": "ftjob-E3Ha3XDcNDBaHaEck785c2yB",
  "model": "gpt-3.5-turbo-0613",
  "created_at": 1699266149,
  "finished_at": 1699266551,
  "fine_tuned_model": "ft:gpt-3.5-turbo-0613:personal:briefprompt-deid2:8Hr8yYi1",
  "organization_id": "org-y9pJa2gk1YU06Wi6jGNSdSWm",
  "result_files": [
    "file-pDHg2n7rMCjxHg9MprvJzjqh"
  ],
  "status": "succeeded",
  "validation_file": "file-Nc5xQOVOMJeXgNE8OTTk9O2t",
  "training_file": "file-bNOA26nA1fBN8SD4XAYPr5Yf",
  "hyperparameters": {
    "n_epochs": 3
  },
  "trained_tokens": 130947,
  "error": null
}

Fine-tuned model id: ft:gpt-3.5-turbo-0613:personal:briefprompt-deid2:8Hr8yYi1


In [63]:
# Sample input for testing the fine-tuned model: a system instruction plus one clinical note
system_message = "Task: Please anonymize the following clinical note. Replace all the Protected health information (PHI) text with the [censored]."

# The note is kept verbatim, including its trailing whitespace
user_message = '''Record date: 2069-04-07

Mr. Villegas is seen today.  I have not seen him since November. 
About three weeks ago he stopped his Prednisone on his own because
he was gaining weight.  He does feel that his shoulders are
definitely improved.  It is unclear what he is actually taking, but
I think based on the color of his pills and the timing of the
medication that he continues taking his Atenolol for hypertension
and 1 Hydroxychloroquine tablet.  He is concerned because of the
relatively recent onset of difficulties turning his head to the
right.  When he does this, he will note that he feels as though he
is going to pass out although this has not actually happened.  This
only occurs when he turns to the right and not to the left.  He has
no visual changes otherwise and denies any headache or other
cranial complaints.  
 
On examination today, BP 120/80.  He has no bruits over the
carotid.  He has no tenderness in this region either.  He has good
peripheral pulses at the arms.  His joint examination is much
improved with better ROM of the shoulders and no peripheral joint
synovitis.  
 
Clinical Impression:
 
#1:  Inflammatory arthritis - possibly RA - with response noted to
Hydroxychloroquine along with Prednisone.  He has stopped the
Prednisone, and I would not restart it yet.  
 
#2:  New onset of symptoms suspicious for right-sided carotid
disease.  Will arrange for carotid ultrasound studies.  Patient
advised to call me if he develops any worsening symptoms.  He has
been taking 1 aspirin per day prophylaxis long-term, and I stressed
that he continue to do so.  He will follow-up with me shortly after
the ultrasound study.
 
 
 
Xzavian G. Tavares, M.D.
XGT:holmes
 
DD: 04/07/69
DT: 04/15/69
DV: 04/07/69
 ******** Approved but not reviewed by Attending Provider ******** '''

# Chat-format conversation: system prompt first, then the note as the user turn
test_messages = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": user_message},
]

print(test_messages)

[{'role': 'system', 'content': 'Task: Please anonymize the following clinical note. Replace all the Protected health information (PHI) text with the [censored].'}, {'role': 'user', 'content': 'Record date: 2069-04-07\n\nMr. Villegas is seen today.  I have not seen him since November. \nAbout three weeks ago he stopped his Prednisone on his own because\nhe was gaining weight.  He does feel that his shoulders are\ndefinitely improved.  It is unclear what he is actually taking, but\nI think based on the color of his pills and the timing of the\nmedication that he continues taking his Atenolol for hypertension\nand 1 Hydroxychloroquine tablet.  He is concerned because of the\nrelatively recent onset of difficulties turning his head to the\nright.  When he does this, he will note that he feels as though he\nis going to pass out although this has not actually happened.  This\nonly occurs when he turns to the right and not to the left.  He has\nno visual changes otherwise and denies any heada

In [65]:
# Send the sample conversation to the fine-tuned model and print its reply text
response = openai.ChatCompletion.create(
    messages=test_messages,
    model=fine_tuned_model_id,  # can test it against gpt-3.5-turbo to see difference
    temperature=0.05,
)
first_choice = response["choices"][0]
print(first_choice["message"]["content"])


Record date: [censored]

Mr. [censored] is seen today.  I have not seen him since [censored]. 
About three weeks ago he stopped his Prednisone on his own because
he was gaining weight.  He does feel that his shoulders are
definitely improved.  It is unclear what he is actually taking, but
I think based on the color of his pills and the timing of the
medication that he continues taking his Atenolol for hypertension
and 1 Hydroxychloroquine tablet.  He is concerned because of the
relatively recent onset of difficulties turning his head to the
right.  When he does this, he will note that he feels as though he
is going to pass out although this has not actually happened.  This
only occurs when he turns to the right and not to the left.  He has
no visual changes otherwise and denies any headache or other
cranial complaints.  
 
On examination today, BP 120/80.  He has no bruits over the
carotid.  He has no tenderness in this region either.  He has good
peripheral pulses at the arms.  His j

### Fine tuning GPT-3.5 model with detailed prompts

In [68]:
# Detailed-prompt datasets: each cleaned-up CSV is paired with the JSONL file generated from it.
detailedprompt_training_file_path, detailedprompt_jsonl_training_file_path = (
    '/Users/yashwanthys/PersonalProjects/ML_Proj/De-Identification/fine_tuning/TestFineTune_detailed.csv',
    '/Users/yashwanthys/PersonalProjects/ML_Proj/De-Identification/fine_tuning/TestFineTune_detailed.jsonl',
)
detailedprompt_validation_file_path, detailedprompt_jsonl_validation_file_path = (
    '/Users/yashwanthys/PersonalProjects/ML_Proj/De-Identification/fine_tuning/Validate_detailed.csv',
    '/Users/yashwanthys/PersonalProjects/ML_Proj/De-Identification/fine_tuning/Validate_detailed.jsonl',
)

# Convert the training set first, then the validation set
for csv_path, jsonl_path in (
    (detailedprompt_training_file_path, detailedprompt_jsonl_training_file_path),
    (detailedprompt_validation_file_path, detailedprompt_jsonl_validation_file_path),
):
    convert_csv_to_jsonl(csv_path, jsonl_path)

Conversion complete. JSONL file created at '/Users/yashwanthys/PersonalProjects/ML_Proj/De-Identification/fine_tuning/TestFineTune_detailed.jsonl'.
Conversion complete. JSONL file created at '/Users/yashwanthys/PersonalProjects/ML_Proj/De-Identification/fine_tuning/Validate_detailed.jsonl'.


In [75]:
# Upload the detailed-prompt training data for fine-tuning

# Open the JSONL file in a with-block so the handle is closed once the upload call returns
with open(detailedprompt_jsonl_training_file_path, "rb") as training_file:
    training_response = openai.File.create(file=training_file, purpose="fine-tune")
training_file_id = training_response["id"]

# Gives training file id
print("Training file id:", training_file_id)

Training file id: file-mgtyQOZSModYOT0d4NaeaS1W


In [74]:
# Upload the detailed-prompt validation data for fine-tuning

# Open the JSONL file in a with-block so the handle is closed once the upload call returns
with open(detailedprompt_jsonl_validation_file_path, "rb") as validation_file:
    validation_response = openai.File.create(file=validation_file, purpose="fine-tune")
# Take the id from the validation upload itself; reading it from training_response
# would make the fine-tuning job validate on the training file.
validation_file_id = validation_response["id"]

# Gives validation file id
print("Validation file id:", validation_file_id)

Validation file id: file-g0AqQODugJFUcwf1c1gfACMa


In [76]:
# Launch the fine-tuning job for the detailed-prompt dataset
suffix_name = "detailedpromptdeid"

# Collect the job settings in one place, then pass them as keyword arguments
job_settings = {
    "training_file": training_file_id,
    "validation_file": validation_file_id,
    "model": "gpt-3.5-turbo",
    "suffix": suffix_name,
    "hyperparameters": {"n_epochs": 5},
}
response = openai.FineTuningJob.create(**job_settings)

# Keep the job id: the event-listing and retrieval cells look the job up by it
job_id = response["id"]

print(response)

{
  "object": "fine_tuning.job",
  "id": "ftjob-daEfytwWqIDmMuTZ6hXJdI9i",
  "model": "gpt-3.5-turbo-0613",
  "created_at": 1699301580,
  "finished_at": null,
  "fine_tuned_model": null,
  "organization_id": "org-y9pJa2gk1YU06Wi6jGNSdSWm",
  "result_files": [],
  "status": "validating_files",
  "validation_file": "file-g0AqQODugJFUcwf1c1gfACMa",
  "training_file": "file-mgtyQOZSModYOT0d4NaeaS1W",
  "hyperparameters": {
    "n_epochs": 5
  },
  "trained_tokens": null,
  "error": null
}


In [77]:
# Print the fine-tuning job's event messages (limit=50 is passed to the listing call)
response = openai.FineTuningJob.list_events(id=job_id, limit=50)

events = response["data"]
events.reverse()  # flip the returned order in place before printing

for message in (event["message"] for event in events):
    print(message)

Created fine-tuning job: ftjob-daEfytwWqIDmMuTZ6hXJdI9i
Validating training file: file-mgtyQOZSModYOT0d4NaeaS1W and validation file: file-g0AqQODugJFUcwf1c1gfACMa
Files validated, moving job to queued state
Fine-tuning job started
Step 1/150: training loss=0.05, validation loss=0.05
Step 11/150: training loss=0.03, validation loss=0.03
Step 21/150: training loss=0.03, validation loss=0.00
Step 31/150: training loss=0.01, validation loss=0.00
Step 41/150: training loss=0.00, validation loss=0.00
Step 51/150: training loss=0.03, validation loss=0.00
Step 61/150: training loss=0.01, validation loss=0.00
Step 71/150: training loss=0.00, validation loss=0.00
Step 81/150: training loss=0.00, validation loss=0.00
Step 91/150: training loss=0.00, validation loss=0.00
Step 101/150: training loss=0.00, validation loss=0.00
Step 111/150: training loss=0.00, validation loss=0.00
Step 121/150: training loss=0.00, validation loss=0.00
Step 131/150: training loss=0.00, validation loss=0.00
Step 141/1

In [78]:
# Look the job up again and show its full record
response = openai.FineTuningJob.retrieve(job_id)
print(response)

# The "fine_tuned_model" field carries the id of the resulting model
fine_tuned_model_id = response["fine_tuned_model"]
print("\nFine-tuned model id:", fine_tuned_model_id)

{
  "object": "fine_tuning.job",
  "id": "ftjob-daEfytwWqIDmMuTZ6hXJdI9i",
  "model": "gpt-3.5-turbo-0613",
  "created_at": 1699301580,
  "finished_at": 1699302051,
  "fine_tuned_model": "ft:gpt-3.5-turbo-0613:personal:detailedpromptdeid:8I0NZ50z",
  "organization_id": "org-y9pJa2gk1YU06Wi6jGNSdSWm",
  "result_files": [
    "file-aqoJvg6Se55TnEwr0PJcfvrN"
  ],
  "status": "succeeded",
  "validation_file": "file-g0AqQODugJFUcwf1c1gfACMa",
  "training_file": "file-mgtyQOZSModYOT0d4NaeaS1W",
  "hyperparameters": {
    "n_epochs": 5,
    "batch_size": 1,
    "learning_rate_multiplier": 2
  },
  "trained_tokens": 221995,
  "error": null
}

Fine-tuned model id: ft:gpt-3.5-turbo-0613:personal:detailedpromptdeid:8I0NZ50z
