In [1]:
!pip install openai requests tiktoken numpy

## Create Dataset

In [32]:
import pandas as pd
import dotenv
import json
dotenv.load_dotenv()

In [33]:
dataset = pd.read_csv("data/balanced-merged-data.csv")

In [34]:
dataset.head()

Fine-tuning gpt-4o-mini-2024-07-18 requires a specially formatted JSONL training file. OpenAI provides the following example in their documentation:

In [35]:
# separate train and validation sets
train, val = dataset.sample(frac=0.8), dataset.sample(frac=0.2)

In [36]:
train.to_json("train.jsonl", orient="records", lines=True)
val.to_json("val.jsonl", orient="records", lines=True)

In [37]:
import re
def anonymize_text(text):
    text = re.sub(r'@\w+', '@user', text)          # usernames
    text = re.sub(r'\b[A-Z][a-z]+\b', '[NAME]', text)  # names (very naive)
    return text

train["text"] = train["text"].apply(anonymize_text)
val["text"] = val["text"].apply(anonymize_text)

In [40]:
def to_chat_format(jsonlpath, type="train"):
    # Load dataset
    df = pd.read_json(jsonlpath, lines=True)

    # Define system prompt
    system_prompt = {
        "role": "system",
        "content": (
            "You are an AI assistant specialized in analyzing text for hateful or abusive language. "
            "Your task is to classify whether the input reflects harmful, bullying, or offensive speech in an online context."
            "You are a classifier AI for hate speech. You do not generate any content yourself."
        )
    }

    # Convert each row into the chat-style format
    def row_to_chat_format(row):
        return {
            "messages": [
                system_prompt,
                {"role": "user", "content": row["text"]},
                {"role": "assistant", "content": row["label"], "weight": 1}
            ]
        }

    # Apply transformation
    chat_data = df.apply(row_to_chat_format, axis=1)
    
    # Save to JSONL
    with open(f"formatted_{type}.jsonl", "w") as f:
        for item in chat_data:
            f.write(json.dumps(item) + "\n")



In [41]:
to_chat_format("data/training_set.jsonl", "train")
to_chat_format("data/validation_set.jsonl", "val")

In [21]:
# # Run preliminary checks
# 
# import json
# 
# # Load the training set
# with open('data/train.jsonl', 'r', encoding='utf-8') as f:
#     training_dataset = [json.loads(line) for line in f]
# 
# # Training dataset stats
# print("Number of examples in training set:", len(training_dataset))
# print("First example in training set:")
# for message in training_dataset:
#     print(message)

# # Load the validation set
# with open('data/val.jsonl', 'r', encoding='utf-8') as f:
#     validation_dataset = [json.loads(line) for line in f]
# 
# # Validation dataset stats
# print("\nNumber of examples in validation set:", len(validation_dataset))
# print("First example in validation set:")
# for message in validation_dataset[0]["text"]:
#     print(message)

In [24]:
# Validate token counts

import json
import tiktoken
import numpy as np
from collections import defaultdict

encoding = tiktoken.get_encoding("o200k_base") # default encoding for gpt-4o models. This requires the latest version of tiktoken to be installed.

def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

files = ['data/training_set.jsonl', 'data/validation_set.jsonl']

for file in files:
    print(f"Processing file: {file}")
    with open(file, 'r', encoding='utf-8') as f:
        dataset = [json.loads(line) for line in f]

    total_tokens = []
    assistant_tokens = []

    for ex in dataset:
        messages = ex.get("messages", {})
        total_tokens.append(num_tokens_from_messages(messages))
        assistant_tokens.append(num_assistant_tokens_from_messages(messages))

    print_distribution(total_tokens, "total tokens")
    print_distribution(assistant_tokens, "assistant tokens")
    print('*' * 50)

In [30]:
import os

In [33]:
print(os.getenv("AZURE_OPENAI_API_KEY"))
print(os.getenv("AZURE_OPENAI_ENDPOINT"))
print(os.getenv("API_VERSION"))

In [26]:
!pip install azure-ai-inference

In [42]:
# Upload fine-tuning files
import dotenv
import os
from openai import AzureOpenAI
from azure.ai.inference import ChatCompletionsClient
from azure.core.credentials import AzureKeyCredential

dotenv.load_dotenv()

endpoint = "https://hubdemor3dai7450370013.openai.azure.com/"
model_name = "gpt-4o"
deployment = "gpt-4o"

subscription_key = os.getenv("AZURE_OPENAI_API_KEY")
print(subscription_key)

api_version = "2024-12-01-preview"

client = AzureOpenAI(
    api_version=api_version,
    azure_endpoint=endpoint,
    api_key=subscription_key,
)

# client = ChatCompletionsClient(
#     endpoint="https://hubdemor3dai7450370013.services.ai.azure.com/models",
#     credential=AzureKeyCredential(subscription_key)
# )


In [43]:

training_file_name = 'data/formatted_train.jsonl'
validation_file_name = 'data/formatted_val.jsonl'

# Upload the training and validation dataset files to Azure OpenAI with the SDK.

training_response = client.files.create(
    file = open(training_file_name, "rb"), purpose="fine-tune"
)
training_file_id = training_response.id

validation_response = client.files.create(
    file = open(validation_file_name, "rb"), purpose="fine-tune"
)
validation_file_id = validation_response.id

print("Training file ID:", training_file_id)
print("Validation file ID:", validation_file_id)

## Create a customized model


In [44]:
response = client.fine_tuning.jobs.create(
    training_file=training_file_id,
    validation_file=validation_file_id,
    model="gpt-35-turbo-0125", # Enter base model name. Note that in Azure OpenAI the model name contains dashes and cannot contain dot/period characters. 
    seed = 105  # seed parameter controls reproducibility of the fine-tuning job. If no seed is specified one will be generated automatically.
)

job_id = response.id

# You can use the job ID to monitor the status of the fine-tuning job.
# The fine-tuning job will take some time to start and complete.

print("Job ID:", response.id)
print("Status:", response.id)
print(response.model_dump_json(indent=2))

## Check fine-tuning job status


In [45]:
response = client.fine_tuning.jobs.retrieve(job_id)

print("Job ID:", response.id)
print("Status:", response.status)
print(response.model_dump_json(indent=2))

In [11]:
response = client.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id, limit=10)
print(response.model_dump_json(indent=2))

In [12]:
response = client.fine_tuning.jobs.checkpoints.list(job_id)
print(response.model_dump_json(indent=2))

## Analyze your customized model

Azure OpenAI attaches a result file named results.csv to each fine-tune job after it completes. You can use the result file to analyze the training and validation performance of your customized model. The file ID for the result file is listed for each customized model, and you can use the Python SDK to retrieve the file ID and download the result file for analysis.

The following Python example retrieves the file ID of the first result file attached to the fine-tuning job for your customized model, and then uses the Python SDK to download the file to your working directory for analysis.

In [None]:
# Retrieve the file ID of the first result file from the fine-tuning job
# for the customized model.
response = client.fine_tuning.jobs.retrieve(job_id)
if response.status == 'succeeded':
    result_file_id = response.result_files[0]

retrieve = client.files.retrieve(result_file_id)

# Download the result file.
print(f'Downloading result file: {result_file_id}')

with open(retrieve.filename, "wb") as file:
    result = client.files.content(result_file_id).read()
    file.write(result)