## This notebook performs fine-tuning of gpt-3.5-turbo to set the style of response when asked whether a habit is healthy or unhealthy

In [None]:
!pip install openai
!pip install numpy
!pip install tiktoken
!pip install Gradio

In [7]:
import openai
import numpy as np
import tiktoken
import csv
import json
import os
from collections import defaultdict
import gradio as gr

In [8]:
# Keep the secret out of the notebook: the key is read from the OPENAI_API_KEY
# environment variable, and the cell fails early with a clear message when it is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this notebook")

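A dataset of 93 healthy and unhealthy human habits is used to train the model. The cell below reads it from `habits_dataset.csv`, cleans each cell, and writes the examples to a JSONL file.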

In [9]:
csv_file = 'habits_dataset.csv'
cleaned_data = []

# Every cell of the CSV is parsed as one JSON document; cells that fail to parse are
# reported and left out of cleaned_data.
with open(csv_file, 'r', encoding='utf-8-sig') as file:
    for row in csv.reader(file):
        for cell in row:
            # Drop the '["' / '"]' sequences and turn escaped \" back into plain quotes
            cell = cell.replace('["', '').replace('"]', '').replace('\\"', '"')
            try:
                cell_json = json.loads(cell)
            except json.JSONDecodeError as e:
                print(f"JSON decode error for cell '{cell}': {e}")
            else:
                cleaned_data.append(cell_json)

jsonl_file_path = 'habits_dataset_json.jsonl'

# JSONL output: one serialized example per line
with open(jsonl_file_path, 'w', encoding='utf-8') as jsonl_file:
    jsonl_file.writelines(json.dumps(item) + '\n' for item in cleaned_data)

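The data is converted to the JSONL chat format described in the fine-tuning guide: https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset. The cell below loads the converted file and prints the first example.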

In [10]:
data_path = 'habits_dataset_json.jsonl'

# Parse the JSONL file: every line is decoded as one JSON document
with open(data_path) as f:
    dataset = list(map(json.loads, f))

# Quick look at what was loaded
print("Num examples:", len(dataset))
print("First example:")
first_example = dataset[0]
for message in first_example["messages"]:
    print(message)

Num examples: 93
First example:
{'role': 'system', 'content': 'You are a very helpful assistant who helps people understand what is healthy and what is unhealthy'}
{'role': 'user', 'content': 'I drink very less water'}
{'role': 'assistant', 'content': 'Unhealthy'}


In [11]:
# Format error checks: tally every structural problem found in the dataset, keyed by problem type
format_errors = defaultdict(int)

required_keys = ("role", "content")
allowed_keys = ("role", "content", "name")
known_roles = ("system", "user", "assistant")

for example in dataset:
    # An example that is not a dict cannot be inspected any further
    if not isinstance(example, dict):
        format_errors["data_type"] += 1
        continue

    # A missing or empty "messages" entry also ends the checks for this example
    conversation = example.get("messages")
    if not conversation:
        format_errors["missing_messages_list"] += 1
        continue

    for msg in conversation:
        if not all(key in msg for key in required_keys):
            format_errors["message_missing_key"] += 1

        if not all(key in allowed_keys for key in msg):
            format_errors["message_unrecognized_key"] += 1

        if msg.get("role") not in known_roles:
            format_errors["unrecognized_role"] += 1

        # Content must be a non-empty string
        text = msg.get("content")
        if not (text and isinstance(text, str)):
            format_errors["missing_content"] += 1

    # Each example needs at least one assistant message
    if all(msg.get("role") != "assistant" for msg in conversation):
        format_errors["example_missing_assistant_message"] += 1

if not format_errors:
    print("No errors found")
else:
    print("Found errors:")
    for error_type, count in format_errors.items():
        print(f"{error_type}: {count}")

No errors found


In [12]:
# Token counting functions
# Tokenizer shared by the token-counting helpers defined in this cell; it is looked up
# once, by name ("cl100k_base"), through tiktoken.
encoding = tiktoken.get_encoding("cl100k_base")

# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    """Estimate the token count of one conversation.

    messages: list of message dicts; every value in each dict is tokenized with `encoding`.
    tokens_per_message: fixed overhead added once for each message.
    tokens_per_name: extra overhead added for a message that has a "name" key.

    Returns the encoded length of all values plus the per-message overheads and a
    constant 3 tokens added once for the whole conversation.
    """
    total = 3  # constant added once per conversation
    for msg in messages:
        total += tokens_per_message
        total += sum(len(encoding.encode(text)) for text in msg.values())
        if "name" in msg:
            total += tokens_per_name
    return total

def num_assistant_tokens_from_messages(messages):
    """Return the total number of tokens in the content of all assistant messages.

    Messages with any other role are ignored; the content is tokenized with `encoding`.
    """
    return sum(
        len(encoding.encode(msg["content"]))
        for msg in messages
        if msg["role"] == "assistant"
    )

def print_distribution(values, name):
    """Print summary statistics of `values` under a header that mentions `name`.

    values: non-empty sequence of numbers.
    name: label shown in the header line.

    Prints min/max, mean/median and the 5th/95th percentiles.
    """
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    # The quantile levels must match the "p5 / p95" label: 0.05 and 0.95, not 0.1 and 0.9
    print(f"p5 / p95: {np.quantile(values, 0.05)}, {np.quantile(values, 0.95)}")


# Warnings and token counts
conversations = [example["messages"] for example in dataset]

# Number of examples without any system / user message
n_missing_system = sum(
    1 for convo in conversations if all(m["role"] != "system" for m in convo)
)
n_missing_user = sum(
    1 for convo in conversations if all(m["role"] != "user" for m in convo)
)

# Per-example measurements, in dataset order
n_messages = [len(convo) for convo in conversations]
convo_lens = [num_tokens_from_messages(convo) for convo in conversations]
assistant_message_lens = [num_assistant_tokens_from_messages(convo) for convo in conversations]

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
for series, label in (
    (n_messages, "num_messages_per_example"),
    (convo_lens, "num_total_tokens_per_example"),
    (assistant_message_lens, "num_assistant_tokens_per_example"),
):
    print_distribution(series, label)

# Count the conversations longer than 4096 tokens
n_too_long = sum(1 for length in convo_lens if length > 4096)
print(f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning")

Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 36, 47
mean / median: 40.61290322580645, 41.0
p5 / p95: 38.0, 44.0

#### Distribution of num_assistant_tokens_per_example:
min / max: 1, 9
mean / median: 1.881720430107527, 1.0
p5 / p95: 1.0, 2.0

0 examples may be over the 4096 token limit, they will be truncated during fine-tuning


In [13]:
# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 4096

TARGET_EPOCHS = 3
MIN_EPOCHS = 1
MAX_EPOCHS = 25
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000

# Start from TARGET_EPOCHS and adjust when epochs * examples falls outside
# [MIN_TARGET_EXAMPLES, MAX_TARGET_EXAMPLES]; the result is clamped to MAX_EPOCHS / MIN_EPOCHS.
n_train_examples = len(dataset)
examples_seen = n_train_examples * TARGET_EPOCHS
if examples_seen < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif examples_seen > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)
else:
    n_epochs = TARGET_EPOCHS

# Each example contributes at most MAX_TOKENS_PER_EXAMPLE tokens to the billed total
n_billing_tokens_in_dataset = sum(
    length if length < MAX_TOKENS_PER_EXAMPLE else MAX_TOKENS_PER_EXAMPLE
    for length in convo_lens
)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

# Calculate the estimated cost for fine-tuning
cost_per_100k_tokens = 0.80  # Cost for every 100,000 tokens
total_billed_tokens = n_epochs * n_billing_tokens_in_dataset
estimated_cost = (total_billed_tokens / 100000) * cost_per_100k_tokens
print(f"Estimated cost for fine-tuning: approximately ${estimated_cost:.2f}")  # dollar estimate from the rate above

Dataset has ~3777 tokens that will be charged for during training
By default, you'll train for 3 epochs on this dataset
By default, you'll be charged for ~11331 tokens
Estimated cost for fine-tuning: approximately $0.09


In [14]:
# Function to save the dataset as a JSONL file
def save_to_jsonl(conversations, file_path):
    """Write every item of `conversations` to `file_path`, one JSON document per line.

    An existing file at `file_path` is overwritten.
    """
    with open(file_path, 'w') as out:
        for serialized in map(json.dumps, conversations):
            out.write(serialized + '\n')

# Output file for the checked dataset (a path relative to the current working directory)
jsonl_file_path = 'habits_dataset_json_clean.jsonl'
# Write the in-memory dataset to that file
save_to_jsonl(conversations=dataset, file_path=jsonl_file_path)

In [None]:
# Upload data for training
training_file_name = 'habits_dataset_json_clean.jsonl'

# The file is opened in a context manager so the handle is closed as soon as the
# upload call returns (or raises), instead of being left open for the kernel's lifetime.
with open(training_file_name, "rb") as training_file:
    training_response = openai.File.create(
        file=training_file,
        purpose='fine-tune'
    )
training_file_id = training_response["id"]

# Gives training file id
print("Training file id:", training_file_id)

In [None]:
# Create Fine-Tuning Job
suffix_name = "habit-checker"

# Parameters of the job: the uploaded training file, the base model and the name suffix
job_params = {
    "training_file": training_file_id,
    "model": "gpt-3.5-turbo-1106",
    "suffix": suffix_name,
}
response = openai.FineTuningJob.create(**job_params)

# The job id is needed by the cells that list events and retrieve the job
job_id = response["id"]

print(response)

In [None]:
# list events as fine-tuning progresses (at most 50 are requested)
response = openai.FineTuningJob.list_events(id=job_id, limit=50)

# Reverse the returned list in place before printing
# (presumably the API returns newest first, so this prints oldest first; confirm)
events = response["data"]
events.reverse()

for text in (event["message"] for event in events):
    print(text)

In [None]:
# retrieve fine-tune model id
response = openai.FineTuningJob.retrieve(job_id)
fine_tuned_model_id = response["fine_tuned_model"]

print(response)
print("\nFine-tuned model id:", fine_tuned_model_id)

# The job only carries a model id once it has finished; without one, the cells below
# would call the chat API with model=None. Say so explicitly instead of failing later.
if not fine_tuned_model_id:
    print(
        f"Fine-tuning job {job_id} has not produced a model yet "
        f"(status: {response.get('status')}). "
        "Re-run this cell once the job has succeeded before running the cells below."
    )

# Test it out!

In [19]:
# Conversation used to query the models: the system prompt followed by one user message
system_message = "You are a very helpful assistant who helps people understand what is healthy and what is unhealthy"
user_message = "Eating oats for breakfast"

test_messages = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": user_message},
]

print(test_messages)

[{'role': 'system', 'content': 'You are a very helpful assistant who helps people understand what is healthy and what is unhealthy'}, {'role': 'user', 'content': 'Eating oats for breakfast'}]


### response from the fine-tuned gpt-3.5-turbo-1106 model

In [20]:
# Send the test conversation to the fine-tuned model
# (use gpt-3.5-turbo as the model instead to see the difference)
request_args = dict(
    model=fine_tuned_model_id,
    messages=test_messages,
    temperature=0,
    max_tokens=500,
)
response = openai.ChatCompletion.create(**request_args)

# Print the text of the first returned choice
first_choice = response["choices"][0]
print(first_choice["message"]["content"])

Healthy


### response from gpt-3.5-turbo-1106 without fine-tuning

In [21]:
# Same test conversation, sent to the base model to see the difference
base_model_name = "gpt-3.5-turbo-1106"
response = openai.ChatCompletion.create(
    messages=test_messages,
    model=base_model_name,
    max_tokens=500,
    temperature=0,
)
reply = response["choices"][0]["message"]
print(reply["content"])

Eating oats for breakfast is a healthy choice. Oats are a good source of fiber, which can help with digestion and keep you feeling full throughout the morning. They also contain important nutrients such as magnesium, iron, and B vitamins. Additionally, oats are a whole grain, which means they provide long-lasting energy and can help regulate blood sugar levels. Overall, starting your day with a bowl of oats can be a nutritious and satisfying way to begin your morning.


## Gradio UI for trying out the fine-tuned model

In [None]:
def generate_completion(user_prompt):
    """Ask the fine-tuned model about a habit and return its reply as text.

    user_prompt: the text typed by the user.

    Returns the content of the first choice with surrounding whitespace removed.
    An empty or whitespace-only prompt is answered locally with a short hint, so
    no API request is made for it.
    """
    if not user_prompt or not user_prompt.strip():
        return "Please enter a habit to check."

    # Same system prompt as in the test conversation built earlier in the notebook
    system_message = "You are a very helpful assistant who helps people understand what is healthy and what is unhealthy"
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_prompt}
    ]
    response = openai.ChatCompletion.create(
        model = fine_tuned_model_id,
        messages = messages,
        max_tokens = 100,
        temperature = 0
    )
    return response['choices'][0]['message']['content'].strip()


# Multi-line text box in which the user types the habit
habit_input = gr.Textbox(
    lines=5,
    placeholder='Type a habit to check if it is healthy or not',
)

# Wire the text box to generate_completion; the result is shown as plain text
iface = gr.Interface(
    fn=generate_completion,
    inputs=habit_input,
    outputs='text',
    title="Habit checker",
)

iface.launch()