In [None]:
!pip install datasets openai

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting openai
  Downloading openai-1.42.0-py3-none-any.whl.metadata (22 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl.met

## **Preprocessing the Dataset**

In [None]:
from datasets import load_dataset

In [None]:
dataset = load_dataset("garage-bAInd/Open-Platypus")

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction', 'data_source'],
        num_rows: 24926
    })
})

In [None]:
train_dataset = dataset['train']

In [None]:
train_dataset

Dataset({
    features: ['input', 'output', 'instruction', 'data_source'],
    num_rows: 24926
})

In [None]:
# System prompt whose "content" is placed first in every conversation built by convert_to_conversation.
system_message = {"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}

In [None]:
def convert_to_conversation(row):
    """Turn one dataset row into a chat-format training example.

    The returned dict has a single ``"messages"`` key holding three turns:
    the shared system prompt, the row's ``instruction`` as the user turn,
    and the row's ``output`` as the assistant turn.
    """
    turns = [
        ("system", system_message["content"]),
        ("user", row['instruction']),
        ("assistant", row['output']),
    ]
    return {"messages": [{"role": speaker, "content": text} for speaker, text in turns]}


In [None]:
# Convert every row to chat format and drop all original columns, leaving only `messages`.
conversations = train_dataset.map(convert_to_conversation, remove_columns=train_dataset.column_names)

In [None]:
for i in range(3):
    print(conversations[i])

{'messages': [{'content': 'Marv is a factual chatbot that is also sarcastic.', 'role': 'system'}, {'content': 'A board game spinner is divided into three parts labeled $A$, $B$  and $C$. The probability of the spinner landing on $A$ is $\\frac{1}{3}$ and the probability of the spinner landing on $B$ is $\\frac{5}{12}$.  What is the probability of the spinner landing on $C$? Express your answer as a common fraction.', 'role': 'user'}, {'content': 'To find the probability of the spinner landing on $C$, I need to subtract the probabilities of the spinner landing on $A$ and $B$ from $1$, since the sum of the probabilities of all possible outcomes is $1$. I can write this as an equation: $P(C) = 1 - P(A) - P(B)$. I know that $P(A) = \\frac{1}{3}$ and $P(B) = \\frac{5}{12}$, so I can plug those values into the equation and simplify. I get: $P(C) = 1 - \\frac{1}{3} - \\frac{5}{12} = \\frac{12}{12} - \\frac{4}{12} - \\frac{5}{12} = \\frac{3}{12}$. I can reduce this fraction by dividing the num

In [None]:
conversations

Dataset({
    features: ['messages'],
    num_rows: 24926
})

In [None]:
# Randomly select 1000 rows to keep fine-tuning costs down.
# The shuffle is seeded, and the sample is the first 1000 rows of the shuffled dataset.
shuffled_conversations = conversations.shuffle(seed=42)
sampled_conversations = shuffled_conversations.select(range(1000))

In [None]:
for i in range(3):
    print(sampled_conversations[i])

{'messages': [{'content': 'Marv is a factual chatbot that is also sarcastic.', 'role': 'system'}, {'content': 'Given a list of 24-hour clock time points in **"HH:MM "** format, return _the minimum **minutes** difference between any two time-points in the list_.\n\n**Example 1:**\n\n**Input:** timePoints = \\["23:59","00:00"\\]\n**Output:** 1\n\n**Example 2:**\n\n**Input:** timePoints = \\["00:00","23:59","00:00"\\]\n**Output:** 0\n\n**Constraints:**\n\n*   `2 <= timePoints.length <= 2 * 104`\n*   `timePoints[i]` is in the format **"HH:MM "**.\n', 'role': 'user'}, {'content': "\ndef findMinDifference(timePoints):\n    timePoints.sort()\n    min_difference = float('inf')\n\n    for i in range(len(timePoints)):\n        h1, m1 = map(int, timePoints[i].split(':'))\n        h2, m2 = map(int, timePoints[(i+1) % len(timePoints)].split(':'))\n        cur_difference = abs((h1 * 60 + m1) - (h2 * 60 + m2))\n        min_difference = min(min_difference, 1440 - cur_difference if cur_difference > 720

In [None]:
sampled_conversations

Dataset({
    features: ['messages'],
    num_rows: 1000
})

## **Validating the Preprocessed Dataset**

In [None]:
from collections import defaultdict

def validate_dataset(dataset):
    """Count chat-format problems in a fine-tuning dataset.

    Args:
        dataset: Iterable of examples. Each example is expected to be a dict
            with a non-empty ``"messages"`` sequence of message dicts that
            carry at least ``"role"`` and ``"content"``.

    Returns:
        A ``defaultdict(int)`` mapping an error name to the number of times it
        occurred. It is empty (falsy) when no problems were found.
    """
    allowed_keys = ("role", "content", "name", "function_call", "weight")
    allowed_roles = ("system", "user", "assistant", "function")
    format_errors = defaultdict(int)

    for ex in dataset:
        if not isinstance(ex, dict):
            format_errors["data_type"] += 1
            continue

        messages = ex.get("messages", None)
        if not messages:
            format_errors["missing_messages_list"] += 1
            continue

        for message in messages:
            # A non-dict message (e.g. a bare string or None) cannot be checked
            # key by key; count it instead of crashing on `message.get`.
            if not isinstance(message, dict):
                format_errors["message_data_type"] += 1
                continue

            if "role" not in message or "content" not in message:
                format_errors["message_missing_key"] += 1

            if any(k not in allowed_keys for k in message):
                format_errors["message_unrecognized_key"] += 1

            if message.get("role", None) not in allowed_roles:
                format_errors["unrecognized_role"] += 1

            content = message.get("content", None)
            function_call = message.get("function_call", None)

            # Content must be a string; it may only be empty when a
            # function_call is present.
            if (not content and not function_call) or not isinstance(content, str):
                format_errors["missing_content"] += 1

        # Each example needs at least one assistant turn; non-dict entries
        # were already counted above and are ignored here.
        if not any(
            isinstance(message, dict) and message.get("role", None) == "assistant"
            for message in messages
        ):
            format_errors["example_missing_assistant_message"] += 1

    return format_errors

# Validate the sampled conversations and print a per-error-type tally.
validation_errors = validate_dataset(sampled_conversations)

if not validation_errors:
    print("No errors found")
else:
    print("Found errors:")
    for error_name, count in validation_errors.items():
        print(f"{error_name}: {count}")

No errors found


## **Saving the Dataset as JSONL (for Upload to OpenAI)**

In [None]:
import json

# File path for JSONL output
jsonl_file_path = "converted_train_dataset.jsonl"

# Function to write dataset to JSONL
def save_as_jsonl(dataset, file_path):
    """Write ``dataset`` to ``file_path`` as JSON Lines.

    Each example is serialized with ``json.dumps`` and written on its own
    line. An existing file at ``file_path`` is overwritten.
    """
    with open(file_path, 'w') as out:
        out.writelines(json.dumps(record) + '\n' for record in dataset)

# Convert and save the dataset
save_as_jsonl(sampled_conversations, jsonl_file_path)

print(f"Dataset has been saved to {jsonl_file_path}")

Dataset has been saved to converted_train_dataset.jsonl


## **Fine Tuning**

In [None]:
import os
from getpass import getpass

from openai import OpenAI

# Read the API key from the environment, or prompt for it, so the key is never
# hard-coded and the cell works on a fresh kernel (OPENAI_KEY was previously
# used without being defined anywhere in the notebook).
OPENAI_KEY = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
client = OpenAI(api_key=OPENAI_KEY)

In [None]:
# Upload the JSONL training file. The context manager closes the file handle
# once the upload call returns, and the response is kept in `training_file`
# so its `id` is available programmatically rather than only in the output.
with open("converted_train_dataset.jsonl", "rb") as training_fh:
    training_file = client.files.create(
        file=training_fh,
        purpose="fine-tune"
    )
training_file

FileObject(id='file-Mvv9ZpelR9EHnCV2jUlLLepo', bytes=1358784, created_at=1724340438, filename='converted_train_dataset.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [None]:
# Start the fine-tuning job. "your_file_id" is a placeholder: replace it with the
# `id` of the FileObject returned by the upload cell (it starts with "file-").
# NOTE(review): the recorded output below reports model='gpt-4o-mini-2024-07-18',
# which differs from the model named here — confirm which base model is intended.
client.fine_tuning.jobs.create(
  training_file="your_file_id",
  model="gpt-4o-2024-08-06"
)

FineTuningJob(id='ftjob-26sPS8L0l6PKz4BpllNPAyix', created_at=1724340459, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-AprgvJhrjCa6ZNkSKpE8gSDt', result_files=[], seed=1768581849, status='validating_files', trained_tokens=None, training_file='file-Mvv9ZpelR9EHnCV2jUlLLepo', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix=None)

In [None]:
# "your_job_id" is a placeholder: replace it with the `id` of the FineTuningJob
# returned when the job was created (it starts with "ftjob-").
client.fine_tuning.jobs.retrieve("your_job_id") # Returns the job, including its current `status`

FineTuningJob(id='ftjob-26sPS8L0l6PKz4BpllNPAyix', created_at=1724340459, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=3, batch_size=2, learning_rate_multiplier=1.8), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-AprgvJhrjCa6ZNkSKpE8gSDt', result_files=[], seed=1768581849, status='validating_files', trained_tokens=None, training_file='file-Mvv9ZpelR9EHnCV2jUlLLepo', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix=None)

In [None]:
# Baseline request to the base model with a simple greeting.
message = [
    dict(role="system", content="You are a helpful assistant."),
    dict(role="user", content="Hello!"),
]
completion = client.chat.completions.create(model="gpt-4o-2024-08-06", messages=message)
print(completion.choices[0].message)

ChatCompletionMessage(content='Hi there! How can I assist you today?', refusal=None, role='assistant', function_call=None, tool_calls=None)


In [None]:
query = "A fair coin is flipped 7 times. What is the probability that at least 5 of the flips come up heads?"

In [None]:
# Build a new list instead of appending to `message` in place, so re-running
# this cell does not stack duplicate copies of the query onto the conversation.
fine_tuned_messages = message + [{"role": "user", "content": query}]
completion = client.chat.completions.create(
  # Placeholder: replace with the `fine_tuned_model` name reported by the job once it has finished.
  model="fine-tuning-id",
  messages=fine_tuned_messages
)
print(completion.choices[0].message.content)

To find the probability that at least 5 out of 7 flips of a fair coin are heads, we can calculate the probabilities of getting exactly 5 heads, exactly 6 heads, and exactly 7 heads, then add them together. 

Let H represent heads and T represent tails. The total number of outcomes for 7 flips is \(2^7 = 128\).

1. **Exactly 5 heads (and 2 tails):**
   There are \(\binom{7}{5} = 21\) ways to choose which 5 out of 7 flips are heads. Each of these outcomes has a probability of \(\left(\frac{1}{2}\right)^7 = \frac{1}{128}\). So the probability of getting exactly 5 heads is:
   \[P(\text{exactly 5 heads}) = 21 \cdot \frac{1}{128} = \frac{21}{128}\]

2. **Exactly 6 heads (and 1 tail):**
   There are \(\binom{7}{6} = 7\) ways to choose which 6 out of 7 flips are heads. Each of these outcomes has a probability of \(\left(\frac{1}{2}\right)^7 = \frac{1}{128}\). So the probability of getting exactly 6 heads is:
   \[P(\text{exactly 6 heads}) = 7 \cdot \frac{1}{128} = \frac{7}{128}\]

3. **Exactl