In [6]:
import json
from collections import defaultdict
import os
import openai

# Path to the training data: a JSON Lines file (one JSON object per line),
# resolved relative to the notebook's working directory.
file_path = 'train.jsonl'

# Parsed training examples, one entry per non-blank line of the file.
dataset = []

# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, which differs between operating systems.
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Skip blank lines (for example a trailing newline at the end of the
        # file); json.loads raises on an empty string.
        if not line.strip():
            continue
        dataset.append(json.loads(line))

# 'dataset' now holds one parsed object per record in the JSONL file
dataset


[{'messages': [{'role': 'system',
    'content': "You are a language model that responds with 'blablabla'."},
   {'role': 'user', 'content': 'What is the capital of France?'},
   {'role': 'assistant', 'content': 'blablabla'},
   {'role': 'user', 'content': 'Can you tell me a joke?'},
   {'role': 'assistant', 'content': 'blablabla'}]},
 {'messages': [{'role': 'system',
    'content': "You are a language model that responds with 'blablabla'."},
   {'role': 'user', 'content': "What's your favorite color?"},
   {'role': 'assistant', 'content': 'blablabla'},
   {'role': 'user', 'content': 'Tell me about your hobbies.'},
   {'role': 'assistant', 'content': 'blablabla'}]},
 {'messages': [{'role': 'system',
    'content': "You are a language model that responds with 'blablabla'."},
   {'role': 'user', 'content': 'xyz123'},
   {'role': 'assistant', 'content': 'blablabla'}]},
 {'messages': [{'role': 'system',
    'content': "You are a language model that responds with 'blablabla'."},
   {'role':

### Check errors

In [7]:
# Format error checks.
# Walk every example in `dataset` and count, per category, how many structural
# problems are found. Malformed entries are counted rather than allowed to
# raise, so a single bad record cannot abort the whole validation pass.
format_errors = defaultdict(int)

for ex in dataset:
    # Each example must be a JSON object.
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    # Each example needs a non-empty list under "messages". A non-list value
    # (e.g. a string or a dict) is treated the same as a missing list, because
    # iterating it would yield characters/keys instead of message objects.
    messages = ex.get("messages", None)
    if not messages or not isinstance(messages, list):
        format_errors["missing_messages_list"] += 1
        continue

    for message in messages:
        # A message that is not an object cannot be inspected with .get();
        # count it and move on instead of raising AttributeError.
        if not isinstance(message, dict):
            format_errors["message_data_type"] += 1
            continue

        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1

        # Only "role", "content" and "name" are accepted as message keys.
        if any(k not in ("role", "content", "name") for k in message):
            format_errors["message_unrecognized_key"] += 1

        if message.get("role", None) not in ("system", "user", "assistant"):
            format_errors["unrecognized_role"] += 1

        # Content must be a non-empty string.
        content = message.get("content", None)
        if not content or not isinstance(content, str):
            format_errors["missing_content"] += 1

    # Every example must contain at least one assistant message.
    if not any(
        isinstance(message, dict) and message.get("role", None) == "assistant"
        for message in messages
    ):
        format_errors["example_missing_assistant_message"] += 1

# Report the per-category counts, or confirm that the data set is clean.
if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


In [8]:
# System prompt used when building the test conversation further down.
# NOTE(review): the training examples displayed earlier in this notebook end
# this prompt with a period, while this string does not - confirm whether the
# difference is intentional.
system_message = "You are a language model that responds with 'blablabla'"

In [None]:
# Read the API key from the environment. Writing os.getenv(...) back into
# os.environ is a no-op when the variable is set and raises an opaque
# TypeError when it is not, so check explicitly and fail with a clear message.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("OPENAI_API_KEY is not set in the environment")
openai.api_key = api_key

# Upload the training file for fine-tuning. The file is opened in binary mode
# so the bytes are sent unchanged, and inside a `with` block so the handle is
# closed as soon as the upload call returns.
with open("train.jsonl", "rb") as training_file:
    res = openai.File.create(
        file=training_file,
        purpose='fine-tune'
    )
res

<File file id=file-pNxWxYDewbWLRaDtb8TWkVXu at 0x10a922db0> JSON: {
  "object": "file",
  "id": "file-pNxWxYDewbWLRaDtb8TWkVXu",
  "purpose": "fine-tune",
  "filename": "file",
  "bytes": 2343,
  "created_at": 1699599676,
  "status": "processed",
  "status_details": null
}

In [11]:
# Keep the identifier of the uploaded file; it is passed as `training_file`
# when the fine-tuning job is created.
# NOTE(review): `res` is reassigned by the job-creation cell, so this cell must
# be run before that one - re-running it afterwards would pick up the job id.
file_id = res["id"]
file_id

'file-pNxWxYDewbWLRaDtb8TWkVXu'

In [6]:
# Start a fine-tuning job on the uploaded training file, using gpt-3.5-turbo
# as the base model. Note that this rebinds `res` to the job object.
res = openai.FineTuningJob.create(
    model="gpt-3.5-turbo",
    training_file=file_id,
)
res

<FineTuningJob fine_tuning.job id=ftjob-bFiYK9RQUSY1dcXgsCbVywIz at 0x10ab62630> JSON: {
  "object": "fine_tuning.job",
  "id": "ftjob-bFiYK9RQUSY1dcXgsCbVywIz",
  "model": "gpt-3.5-turbo-0613",
  "created_at": 1699592721,
  "finished_at": null,
  "fine_tuned_model": null,
  "organization_id": "org-ViQITQ1IoZfuzh7CgKfIWIe7",
  "result_files": [],
  "status": "validating_files",
  "validation_file": null,
  "training_file": "file-ZY8LjYSG1kMmrF6YytbtOCIw",
  "hyperparameters": {
    "n_epochs": "auto",
    "batch_size": "auto",
    "learning_rate_multiplier": "auto"
  },
  "trained_tokens": null,
  "error": null
}

In [11]:
# Keep the fine-tuning job's identifier so its status can be retrieved later.
# `res` here is the object returned by openai.FineTuningJob.create.

job_id = res["id"]
job_id

'ftjob-bFiYK9RQUSY1dcXgsCbVywIz'

In [1]:
# Fetch the current state of the fine-tuning job identified by `job_id`.
# NOTE(review): the saved output for this cell is a NameError about `res`,
# a name this cell does not use - it appears to come from an out-of-order run.
# Re-run after the cells above so the stored output matches the code.
response = openai.FineTuningJob.retrieve(job_id)
response

NameError: name 'res' is not defined

In [21]:
# Name of the model produced by the fine-tuning job. The job object shown
# earlier in this notebook reports null for this field while the job is still
# in progress, so stop here with a clear message rather than handing None to
# the model calls that follow.
ft_model = response["fine_tuned_model"]
if ft_model is None:
    raise RuntimeError(
        "Fine-tuning job has not produced a model yet "
        f"(status: {response.get('status')}); "
        "retrieve the job again once it has finished."
    )
ft_model

'ft:gpt-3.5-turbo-0613:personal::8JE5b7CB'

# Using Fine-Tuned Models in LangChain

In [50]:
from langchain.chat_models import ChatOpenAI

# LangChain chat wrapper pointed at the fine-tuned model, with a low sampling
# temperature.
llm = ChatOpenAI(model_name=ft_model, temperature=0.3)

In [21]:
# Build a minimal two-message conversation (system prompt followed by one user
# turn) to send to the fine-tuned model.
user_message = "Hi"
test_messages = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": user_message},
]

print(test_messages)

[{'role': 'system', 'content': "You are a language model that responds with 'blablabla'"}, {'role': 'user', 'content': 'Hi'}]


In [22]:
# Send the test conversation to the fine-tuned model with temperature 0 and
# print the text of the first returned choice.
# Note that this rebinds `response`, replacing the fine-tuning job object.
response = openai.ChatCompletion.create(
    model=ft_model,
    messages=test_messages,
    temperature=0,
    max_tokens=500,
)
print(response["choices"][0]["message"]["content"])

blablabla
