# **Fine-Tuning GPT-4o mini on Custom Dataset**

**The fine-tuned model is designed to assist real estate agents in efficiently searching for property information, providing concise and relevant responses to their queries.**


## Step 1: Install all the necessary libraries

In [2]:
!pip install openai json requests os time tiktoken



ERROR: Could not find a version that satisfies the requirement json (from versions: none)
ERROR: No matching distribution found for json


In [6]:
import openai
import pandas as pd
import json
import tiktoken
import numpy as np
from collections import defaultdict
import os
from openai import AzureOpenAI
from dotenv import load_dotenv

## Step 2: Please set up environment variables

In [22]:


load_dotenv("azure.env")


openai.api_type: str = "azure"
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_version = "2024-07-18"

In [13]:
import os

current_directory = os.getcwd()
print(f"Current working directory: {current_directory}")

# Set the current working directory to the desired path
desired_directory = "c:\\Users\\akaruparti\\Documents\\gpt-4o-mini-FT\\GPT4o-mini-fine-tuning\\" # Replace this with the actual path if different
os.chdir(desired_directory)
print(f"New working directory: {os.getcwd()}")

Current working directory: c:\Users\akaruparti\Documents\gpt-4o-mini-FT\GPT4o-mini-fine-tuning\notebooks
New working directory: c:\Users\akaruparti\Documents\gpt-4o-mini-FT\GPT4o-mini-fine-tuning


## Step 3: Load the training data, validation data and testing data provided

In [17]:
# Load the training set
with open('data/training_data.jsonl', 'r', encoding='utf-8-sig') as f:
    training_dataset = [json.loads(line) for line in f]

# Training dataset stats
print("Number of examples in training set:", len(training_dataset))
print("First example in training set:")
for message in training_dataset[0]["messages"]:
    print(message)

# Load the validation set
with open('data/validation_data.jsonl', 'r', encoding='utf-8-sig') as f:
    validation_dataset = [json.loads(line) for line in f]


# Training dataset stats
print("Number of examples in training set:", len(training_dataset))
print("First example in training set:")
for message in training_dataset[0]["messages"]:
    print(message)


# Validation dataset stats
print("\nNumber of examples in validation set:", len(validation_dataset))
print("First example in validation set:")
for message in validation_dataset[0]["messages"]:
    print(message)


Number of examples in training set: 10
First example in training set:
{'role': 'system', 'content': 'You are a real estate customer support agent whose primary goal is to help users find and inquire about properties. You are friendly and concise. You only provide factual answers to queries and assist users in finding property information.'}
{'role': 'user', 'content': 'What is the price of the property at 123 Main Street?'}
{'role': 'assistant', 'content': 'The property at 123 Main Street is listed at $450,000. Would you like more details or schedule a viewing?'}
Number of examples in training set: 10
First example in training set:
{'role': 'system', 'content': 'You are a real estate customer support agent whose primary goal is to help users find and inquire about properties. You are friendly and concise. You only provide factual answers to queries and assist users in finding property information.'}
{'role': 'user', 'content': 'What is the price of the property at 123 Main Street?'}
{'

## Step 4: Token Count Validation

In [20]:

encoding = tiktoken.get_encoding("cl100k_base") # default encoding used by gpt-4, turbo, and text-embedding-ada-002 models

def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

files = ['data/training_data.jsonl', 'data/validation_data.jsonl']

for file in files:
    print(f"Processing file: {file}")
    with open(file, 'r', encoding='utf-8-sig') as f:
        dataset = [json.loads(line) for line in f]

    total_tokens = []
    assistant_tokens = []

    for ex in dataset:
        messages = ex.get("messages", {})
        total_tokens.append(num_tokens_from_messages(messages))
        assistant_tokens.append(num_assistant_tokens_from_messages(messages))
    
    print_distribution(total_tokens, "total tokens")
    print_distribution(assistant_tokens, "assistant tokens")
    print('*' * 50)

Processing file: data/training_data.jsonl

#### Distribution of total tokens:
min / max: 81, 104
mean / median: 92.4, 93.0
p5 / p95: 85.5, 98.6

#### Distribution of assistant tokens:
min / max: 23, 33
mean / median: 27.6, 26.5
p5 / p95: 23.9, 32.1
**************************************************
Processing file: data/validation_data.jsonl

#### Distribution of total tokens:
min / max: 82, 100
mean / median: 93.18181818181819, 95.0
p5 / p95: 83.0, 99.0

#### Distribution of assistant tokens:
min / max: 25, 37
mean / median: 29.636363636363637, 30.0
p5 / p95: 26.0, 36.0
**************************************************


## Step 5: Upload training files to Azure Open AI

In [34]:
# Initialize AzureOpenAI client

load_dotenv("azure.env")

print("Azure API Base:", os.getenv("OPENAI_API_BASE"))
print("Azure API Key:", os.getenv("OPENAI_API_KEY"))


client = AzureOpenAI(
  azure_endpoint=os.getenv("OPENAI_API_BASE"), 
  api_key=os.getenv("OPENAI_API_KEY"),  
  api_version="2024-05-01-preview"  # This API version or later is required to access fine-tuning for turbo/babbage-002/davinci-002
)

training_file_name = 'data/training_data.jsonl'
validation_file_name = 'data/validation_data.jsonl'

# Upload the training and validation dataset files to Azure OpenAI with the SDK.

training_response = client.files.create(
    file=open(training_file_name, "rb"), purpose="fine-tune"
)
training_file_id = training_response.id

validation_response = client.files.create(
    file=open(validation_file_name, "rb"), purpose="fine-tune"
)
validation_file_id = validation_response.id

print("Training file ID:", training_file_id)
print("Validation file ID:", validation_file_id)


Azure API Base: https://aoaiswedencentral0312.openai.azure.com/
Azure API Key: dbe46fb62095427b82e7fd020712f684
Training file ID: file-6d2cc21ba5ca4484b72e285aa45ef584
Validation file ID: file-09e046e935914ec19d7e227c361f9445


## Step 6: Submit your fine-tuning job

In [38]:
response = client.fine_tuning.jobs.create(
    training_file=training_file_id,
    validation_file=validation_file_id,
    model="gpt-4o-mini", # Enter base model name. Note that in Azure OpenAI the model name contains dashes and cannot contain dot/period characters. 

)

job_id = response.id

# You can use the job ID to monitor the status of the fine-tuning job.
# The fine-tuning job will take some time to start and complete.

print("Job ID:", response.id)
print("Status:", response.status)
print(response.model_dump_json(indent=2))

Job ID: ftjob-7153a4e1c0fd4ab3af8a3b8fec7fe2c5
Status: pending
{
  "id": "ftjob-7153a4e1c0fd4ab3af8a3b8fec7fe2c5",
  "created_at": 1724090046,
  "error": null,
  "fine_tuned_model": null,
  "finished_at": null,
  "hyperparameters": {
    "n_epochs": -1,
    "batch_size": -1,
    "learning_rate_multiplier": 1
  },
  "model": "gpt-4o-mini-2024-07-18",
  "object": "fine_tuning.job",
  "organization_id": null,
  "result_files": null,
  "status": "pending",
  "trained_tokens": null,
  "training_file": "file-6d2cc21ba5ca4484b72e285aa45ef584",
  "validation_file": "file-09e046e935914ec19d7e227c361f9445",
  "seed": 1105642612
}


## Step 7: Track Training Status

In [39]:
# Track training status

from IPython.display import clear_output
import time

start_time = time.time()

# Get the status of our fine-tuning job.
response = client.fine_tuning.jobs.retrieve(job_id)

status = response.status

# If the job isn't done yet, poll it every 10 seconds.
while status not in ["succeeded", "failed"]:
    time.sleep(10)
    
    response = client.fine_tuning.jobs.retrieve(job_id)
    print(response.model_dump_json(indent=2))
    print("Elapsed time: {} minutes {} seconds".format(int((time.time() - start_time) // 60), int((time.time() - start_time) % 60)))
    status = response.status
    print(f'Status: {status}')
    clear_output(wait=True)

print(f'Fine-tuning job {job_id} finished with status: {status}')

# List all fine-tuning jobs for this resource.
print('Checking other fine-tune jobs for this resource.')
response = client.fine_tuning.jobs.list()
print(f'Found {len(response.data)} fine-tune jobs.')

{
  "id": "ftjob-7153a4e1c0fd4ab3af8a3b8fec7fe2c5",
  "created_at": 1724090046,
  "error": null,
  "fine_tuned_model": null,
  "finished_at": null,
  "hyperparameters": {
    "n_epochs": -1,
    "batch_size": -1,
    "learning_rate_multiplier": 1
  },
  "model": "gpt-4o-mini-2024-07-18",
  "object": "fine_tuning.job",
  "organization_id": null,
  "result_files": null,
  "status": "pending",
  "trained_tokens": null,
  "training_file": "file-6d2cc21ba5ca4484b72e285aa45ef584",
  "validation_file": "file-09e046e935914ec19d7e227c361f9445",
  "seed": 1105642612
}
Elapsed time: 17 minutes 17 seconds
Status: pending


## Step 8: Retrieve Fine Tuned Model Name

In [11]:
response = client.fine_tuning.jobs.retrieve(job_id)

print(response.model_dump_json(indent=2))
fine_tuned_model = response.fine_tuned_model

{
  "id": "ftjob-1478253d632e49b5a4b9d7ba4878c093",
  "created_at": 1709233806,
  "error": null,
  "fine_tuned_model": "gpt-35-turbo-0613.ft-1478253d632e49b5a4b9d7ba4878c093",
  "finished_at": 1709237242,
  "hyperparameters": {
    "n_epochs": -1,
    "batch_size": -1,
    "learning_rate_multiplier": 1
  },
  "model": "gpt-35-turbo-0613",
  "object": "fine_tuning.job",
  "organization_id": null,
  "result_files": [
    "file-8b260538c8874385a100fed1c0df0e61"
  ],
  "status": "succeeded",
  "trained_tokens": -13313,
  "training_file": "file-1c4a7c22278a49428e8dbedee4b77818",
  "validation_file": "file-c794fc59686f49a6a4b0a7fa5f198c7f",
  "updated_at": 1709237242
}


## Step 9 : Deploy a fine-tuned model

In [19]:
import json
import requests

token= os.getenv("TEMP_AUTH_TOKEN") 
subscription = "559395b4-36ba-437a-a7c1-224ff54723e0"  
resource_group = "AOAI-Shared"
resource_name = "AOAI-SwedenCentral-4All"
model_deployment_name ="gpt-35-turbo"

deploy_params = {'api-version': "2023-10-01-preview"} 
deploy_headers = {'Authorization': 'Bearer {}'.format(token), 'Content-Type': 'application/json'}

deploy_data = {
    "sku": {"name": "standard", "capacity": 1}, 
    "properties": {
        "model": {
            "format": "OpenAI",
            "name": "gpt-35-turbo-0613.ft-1478253d632e49b5a4b9d7ba4878c093", #retrieve this value from the previous call, it will look like gpt-35-turbo-0613.ft-b044a9d3cf9c4228b5d393567f693b83
            "version": "3"
        }
    }
}
deploy_data = json.dumps(deploy_data)

request_url = f'https://management.azure.com/subscriptions/{subscription}/resourceGroups/{resource_group}/providers/Microsoft.CognitiveServices/accounts/{resource_name}/deployments/{model_deployment_name}'

print('Creating a new deployment...')

r = requests.put(request_url, params=deploy_params, headers=deploy_headers, data=deploy_data)

print(r)
print(r.reason)
print(r.json())

Creating a new deployment...
<Response [201]>
Created
{'id': '/subscriptions/559395b4-36ba-437a-a7c1-224ff54723e0/resourceGroups/AOAI-Shared/providers/Microsoft.CognitiveServices/accounts/AOAI-SwedenCentral-4All/deployments/gpt-35-turbo', 'type': 'Microsoft.CognitiveServices/accounts/deployments', 'name': 'gpt-35-turbo', 'sku': {'name': 'standard', 'capacity': 1}, 'properties': {'model': {'format': 'OpenAI', 'name': 'gpt-35-turbo-0613.ft-1478253d632e49b5a4b9d7ba4878c093', 'version': '3'}, 'versionUpgradeOption': 'NoAutoUpgrade', 'currentCapacity': 1, 'capabilities': {'chatCompletion': 'true'}, 'provisioningState': 'Creating', 'rateLimits': [{'key': 'request', 'renewalPeriod': 10, 'count': 1}, {'key': 'token', 'renewalPeriod': 60, 'count': 1000}]}, 'systemData': {'createdBy': 'anurag.sirish@gmail.com', 'createdByType': 'User', 'createdAt': '2024-02-29T20:34:38.2610458Z', 'lastModifiedBy': 'anurag.sirish@gmail.com', 'lastModifiedByType': 'User', 'lastModifiedAt': '2024-02-29T20:34:38.261

## Sample Testing 1

### Classifies a complex question as 'Legal' accurately

In [25]:
import os
from openai import AzureOpenAI

client = AzureOpenAI(
  azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT"), 
  api_key=os.getenv("AZURE_OPENAI_API_KEY"),  
  api_version="2023-10-01-preview"
)

response = client.chat.completions.create(
    model="gpt-35-turbo", # model = "Custom deployment name you chose for your fine-tuning model"
    messages=[
        {"role": "system", "content": "You are a classification model. Classify questions into different domains."},
        {"role": "user", "content": "Is a verbal agreement enforceable when no written documentation exists, and under what circumstances might it be recognized?"}
    ]
)

print(response.choices[0].message.content)

Legal


## Sample Testing 2

### Classifies a complex question as 'HR' accurately

In [26]:
response = client.chat.completions.create(
    model="gpt-35-turbo", # model = "Custom deployment name you chose for your fine-tuning model"
    messages=[
        {"role": "system", "content": "You are a classification model. Classify questions into different domains."},
        {"role": "user", "content": "In a situation where an individual reports feeling targeted due to a characteristic protected by federal standards and experiences conduct contributing to an unwelcoming work atmosphere, what steps should be taken to initiate a thorough internal review to ensure adherence to relevant statutes, while also protecting the privacy and rights of everyone involved, and what actions are necessary to maintain team cohesion and efficiency throughout this period?"}
    ]
)

print(response.choices[0].message.content)

HR


## Challenge: Now use the test dataset questions provided and evaluate the model for accuracy