In [None]:
import pandas as pd

splits = {
    "train": "data/train-00000-of-00001.parquet",
    "validation": "data/validation-00000-of-00001.parquet",
    "test": "data/test-00000-of-00001.parquet",
}
df_train = pd.read_parquet("hf://datasets/stanfordnlp/sst2/" + splits["train"])
df_val = pd.read_parquet("hf://datasets/stanfordnlp/sst2/" + splits["validation"])
df_test = pd.read_parquet("hf://datasets/stanfordnlp/sst2/" + splits["test"])

In [27]:
df_train

Unnamed: 0,idx,sentence,label
0,0,hide new secretions from the parental units,0
1,1,"contains no wit , only labored gags",0
2,2,that loves its characters and communicates som...,1
3,3,remains utterly satisfied to remain the same t...,0
4,4,on the worst revenge-of-the-nerds clichés the ...,0
...,...,...,...
67344,67344,a delightful comedy,1
67345,67345,"anguish , anger and frustration",0
67346,67346,"at achieving the modest , crowd-pleasing goals...",1
67347,67347,a patient viewer,1


In [28]:
df_train["sentence"]

0             hide new secretions from the parental units 
1                     contains no wit , only labored gags 
2        that loves its characters and communicates som...
3        remains utterly satisfied to remain the same t...
4        on the worst revenge-of-the-nerds clichés the ...
                               ...                        
67344                                 a delightful comedy 
67345                     anguish , anger and frustration 
67346    at achieving the modest , crowd-pleasing goals...
67347                                    a patient viewer 
67348    this new jangle of noise , mayhem and stupidit...
Name: sentence, Length: 67349, dtype: object

In [29]:
import tiktoken

encoding = tiktoken.encoding_for_model("gpt-4o")
encoding

<Encoding 'o200k_base'>

In [30]:
def num_tokens_from_string(df: pd.DataFrame, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.encoding_for_model(encoding_name)
    token_counts = [len(encoding.encode(text)) for text in df_train["sentence"]]
    num_tokens = sum(token_counts)
    return num_tokens

In [31]:
num_tokens_from_string(df_train, "gpt-4o")

809346

$2.02 dollars for input token given above from our training split

if we do batch results and wait 24 hours for our response we get 50% off so around $1.01

Cached training is also a thing but I haven't looked into it yet, cheaper option as well


In [91]:
import wandb
from openai import OpenAI
import os
from wandb.integration.openai import autolog
from dotenv import load_dotenv

load_dotenv()

openai_api_key = os.getenv("OPA_KEY")

In [92]:
client = OpenAI(api_key=openai_api_key)

In [24]:
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {
            "role": "system",
            "content": "Analyze these sentences and provide the sentiment it is trying to convey. Only use the following two sentiments, positive as 1, negative as 0, as your reply. ",
        },
        {"role": "user", "content": "contains no wit , only labored gags"},
    ],
)
print(response.choices[0].message.content)

0


In [46]:
# Baseline for 4o mini without finetuning
#
#
#
#
#

pred_senti = []


def baseline4o(test, model):

    for _, row in test.iterrows():
        response = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "system",
                    "content": "Analyze these sentences and provide the sentiment it is trying to convey. Only use the following two sentiments, positive as 1, negative as 0, as your reply. ",
                },
                {"role": "user", "content": row["sentence"]},
            ],
        )

        answer = response.choices[0].message.content
        pred_senti.append(answer)


baseline4o(df_val, "gpt-4o-mini")
print(pred_senti)

['1', '0', '1', '1', '0', '1', '0', '0', '1', '0', '1', '0', '0', '0', '0', '1', '0', '1', '0', '0', '0', '0', '0', '1', '1', '0', '0', '1', '0', '0', '1', '0', '1', '0', '0', '0', '1', '0', '1', '1', '1', '1', '0', '1', '0', '0', '0', '1', '1', '0', '0', '1', '1', '1', '0', '1', '0', '0', '0', '0', '1', '0', '1', '1', '0', '0', '1', '1', '1', '0', '0', '1', '1', '0', '0', '0', '0', '1', '1', '0', '1', '0', '0', '1', '1', '1', '0', '1', '0', '1', '1', '1', '0', '1', '1', '0', '0', '1', '0', '0', '1', '0', '0', '1', '1', '0', '1', '0', '0', '1', '0', '0', '0', '0', '1', '0', '1', '0', '0', '0', '1', '0', '1', '1', '1', '1', '0', '0', '1', '1', '0', '0', '1', '0', '0', '0', '0', '0', '1', '0', '0', '0', '1', '0', '0', '0', '1', '0', '0', '1', '1', '1', '1', '0', '1', '0', '1', '0', '0', '1', '0', '0', '0', '0', '1', '0', '0', '0', '1', '1', '1', '0', '1', '0', '1', '1', '0', '0', '0', '1', '0', '0', '0', '0', '0', '1', '1', '1', '0', '0', '0', '1', '1', '1', '1', '1', '1', '0', '1', '0',

In [57]:
pred_sentiInt = [int(value) for value in pred_senti]

In [63]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
)

y_true = df_val["label"].to_list()

accuracy = accuracy_score(y_true, pred_sentiInt)
precision = precision_score(y_true, pred_sentiInt, pos_label=1)
recall = recall_score(y_true, pred_sentiInt, pos_label=1)
f1 = f1_score(y_true, pred_sentiInt, pos_label=1)
conf_matrix = confusion_matrix(y_true, pred_sentiInt)

# Display results
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

# Confusion Matrix
print("\nConfusion Matrix:")
print(conf_matrix)

# Detailed Report
print("\nClassification Report:")
print(
    classification_report(y_true, pred_sentiInt, target_names=["Negative", "Positive"])
)

Accuracy: 0.93
Precision: 0.98
Recall: 0.87
F1-Score: 0.92

Confusion Matrix:
[[421   7]
 [ 57 387]]

Classification Report:
              precision    recall  f1-score   support

    Negative       0.88      0.98      0.93       428
    Positive       0.98      0.87      0.92       444

    accuracy                           0.93       872
   macro avg       0.93      0.93      0.93       872
weighted avg       0.93      0.93      0.93       872



In [64]:
# Baseline 4o without finetuning
#
#
#
#

pred_senti_4o = []


def baseline4ofull(test, model):

    for _, row in test.iterrows():
        response = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "system",
                    "content": "Analyze these sentences and provide the sentiment it is trying to convey. Only use the following two sentiments, positive as 1, negative as 0, as your reply. ",
                },
                {"role": "user", "content": row["sentence"]},
            ],
        )

        answer = response.choices[0].message.content
        pred_senti_4o.append(answer)


baseline4ofull(df_val, "gpt-4o")

In [None]:
y_true4o = df_val["label"].to_list()
pred_sent4o = [int(value) for value in pred_senti_4o]


accuracy4o = accuracy_score(y_true4o, pred_sent4o)
precision4o = precision_score(y_true4o, pred_sent4o, pos_label=1)
recall4o = recall_score(y_true4o, pred_sent4o, pos_label=1)
f14o = f1_score(y_true4o, pred_sent4o, pos_label=1)
conf_matrix4o = confusion_matrix(y_true4o, pred_sent4o)

# Display results
print(f"Accuracy: {accuracy4o:.2f}")
print(f"Precision: {precision4o:.2f}")
print(f"Recall: {recall4o:.2f}")
print(f"F1-Score: {f14o:.2f}")

# Confusion Matrix
print("\nConfusion Matrix:")
print(conf_matrix4o)

# Detailed Report
print("\nClassification Report:")
print(
    classification_report(y_true4o, pred_sent4o, target_names=["Negative", "Positive"])
)

Accuracy: 0.93
Precision: 0.98
Recall: 0.89
F1-Score: 0.93

Confusion Matrix:
[[418  10]
 [ 47 397]]

Classification Report:
              precision    recall  f1-score   support

    Negative       0.90      0.98      0.94       428
    Positive       0.98      0.89      0.93       444

    accuracy                           0.93       872
   macro avg       0.94      0.94      0.93       872
weighted avg       0.94      0.93      0.93       872



In [94]:
# FINETUNING 4o with sst2
#
#
#
#
#
#

import json


def save_to_jsonl(data, output_file_path):
    jsonl_data = []
    for index, row in data.iterrows():
        jsonl_data.append(
            {
                "messages": [
                    {
                        "role": "system",
                        "content": "Analyze these sentences and provide the sentiment it is trying to convey. Only use the following two sentiments, positive as 1, negative as 0, as your reply. ",
                    },
                    {"role": "user", "content": row["sentence"]},
                    {"role": "assistant", "content": f"\"{row['label']}\""},
                ]
            }
        )

    # Save to JSONL format
    with open(output_file_path, "w") as f:
        for item in jsonl_data:
            f.write(json.dumps(item) + "\n")

In [95]:
save_to_jsonl(
    df_train,
    "/Users/arsalan/Desktop/GWU/GCS/Fall 24/CSCI-LLM/FInal Project/W2S_Safety/trainSst2.jsonl",
)

In [96]:
save_to_jsonl(
    df_val,
    "/Users/arsalan/Desktop/GWU/GCS/Fall 24/CSCI-LLM/FInal Project/W2S_Safety/valSst2.jsonl",
)

In [99]:
from collections import defaultdict
import numpy as np


def openai_validate_data(dataset_path):
    data_path = dataset_path

    # Load dataset
    with open(data_path) as f:
        dataset = [json.loads(line) for line in f]

    # We can inspect the data quickly by checking the number of examples and the first item

    # Initial dataset stats
    print("Num examples:", len(dataset))
    print("First example:")
    for message in dataset[0]["messages"]:
        print(message)

    # Now that we have a sense of the data, we need to go through all the different examples and check to make sure the formatting is correct and matches the Chat completions message structure

    # Format error checks
    format_errors = defaultdict(int)

    for ex in dataset:
        if not isinstance(ex, dict):
            format_errors["data_type"] += 1
            continue

        messages = ex.get("messages", None)
        if not messages:
            format_errors["missing_messages_list"] += 1
            continue

        for message in messages:
            if "role" not in message or "content" not in message:
                format_errors["message_missing_key"] += 1

            if any(k not in ("role", "content", "name") for k in message):
                format_errors["message_unrecognized_key"] += 1

            if message.get("role", None) not in ("system", "user", "assistant"):
                format_errors["unrecognized_role"] += 1

            content = message.get("content", None)
            if not content or not isinstance(content, str):
                format_errors["missing_content"] += 1

        if not any(message.get("role", None) == "assistant" for message in messages):
            format_errors["example_missing_assistant_message"] += 1

    if format_errors:
        print("Found errors:")
        for k, v in format_errors.items():
            print(f"{k}: {v}")
    else:
        print("No errors found")

    # Beyond the structure of the message, we also need to ensure that the length does not exceed the 4096 token limit.

    # Token counting functions
    encoding = tiktoken.get_encoding("cl100k_base")

    # not exact!
    # simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
    def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
        num_tokens = 0
        for message in messages:
            num_tokens += tokens_per_message
            for key, value in message.items():
                num_tokens += len(encoding.encode(value))
                if key == "name":
                    num_tokens += tokens_per_name
        num_tokens += 3
        return num_tokens

    def num_assistant_tokens_from_messages(messages):
        num_tokens = 0
        for message in messages:
            if message["role"] == "assistant":
                num_tokens += len(encoding.encode(message["content"]))
        return num_tokens

    def print_distribution(values, name):
        print(f"\n#### Distribution of {name}:")
        print(f"min / max: {min(values)}, {max(values)}")
        print(f"mean / median: {np.mean(values)}, {np.median(values)}")
        print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

    # Last, we can look at the results of the different formatting operations before proceeding with creating a fine-tuning job:

    # Warnings and tokens counts
    n_missing_system = 0
    n_missing_user = 0
    n_messages = []
    convo_lens = []
    assistant_message_lens = []

    for ex in dataset:
        messages = ex["messages"]
        if not any(message["role"] == "system" for message in messages):
            n_missing_system += 1
        if not any(message["role"] == "user" for message in messages):
            n_missing_user += 1
        n_messages.append(len(messages))
        convo_lens.append(num_tokens_from_messages(messages))
        assistant_message_lens.append(num_assistant_tokens_from_messages(messages))

    print("Num examples missing system message:", n_missing_system)
    print("Num examples missing user message:", n_missing_user)
    print_distribution(n_messages, "num_messages_per_example")
    print_distribution(convo_lens, "num_total_tokens_per_example")
    print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
    n_too_long = sum(l > 4096 for l in convo_lens)
    print(
        f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning"
    )

    # Pricing and default n_epochs estimate
    MAX_TOKENS_PER_EXAMPLE = 4096

    MIN_TARGET_EXAMPLES = 100
    MAX_TARGET_EXAMPLES = 25000
    TARGET_EPOCHS = 3
    MIN_EPOCHS = 1
    MAX_EPOCHS = 25

    n_epochs = TARGET_EPOCHS
    n_train_examples = len(dataset)
    if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
        n_epochs = min(MAX_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
    elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
        n_epochs = max(MIN_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

    n_billing_tokens_in_dataset = sum(
        min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens
    )
    print(
        f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training"
    )
    print(f"By default, you'll train for {n_epochs} epochs on this dataset")
    print(
        f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens"
    )
    print("See pricing page to estimate total costs")

In [71]:
openai_validate_data(
    "/Users/arsalan/Desktop/GWU/GCS/Fall 24/CSCI-LLM/FInal Project/W2S_Safety/trainSst2.jsonl"
)

Num examples: 67349
First example:
{'role': 'system', 'content': 'Analyze these sentences and provide the sentiment it is trying to convey. Only use the following two sentiments, positive as 1, negative as 0, as your reply. '}
{'role': 'user', 'content': 'hide new secretions from the parental units '}
{'role': 'assistant', 'content': '"0"'}
No errors found
Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 56, 117
mean / median: 66.17782001217539, 63.0
p5 / p95: 58.0, 80.0

#### Distribution of num_assistant_tokens_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

0 examples may be over the 4096 token limit, they will be truncated during fine-tuning
Dataset has ~4457010 tokens that will be charged for during training
By default, you'll train for 1 epochs on this dataset
B

In [72]:
openai_validate_data(
    "/Users/arsalan/Desktop/GWU/GCS/Fall 24/CSCI-LLM/FInal Project/W2S_Safety/valSst2.jsonl"
)

Num examples: 872
First example:
{'role': 'system', 'content': 'Analyze these sentences and provide the sentiment it is trying to convey. Only use the following two sentiments, positive as 1, negative as 0, as your reply. '}
{'role': 'user', 'content': "it 's a charming and often affecting journey . "}
{'role': 'assistant', 'content': '"1"'}
No errors found
Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 57, 113
mean / median: 77.70412844036697, 77.0
p5 / p95: 65.0, 91.0

#### Distribution of num_assistant_tokens_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

0 examples may be over the 4096 token limit, they will be truncated during fine-tuning
Dataset has ~67758 tokens that will be charged for during training
By default, you'll train for 3 epochs on this dataset
By

In [97]:
wandb.init(
    project="4o-SentimentFT",
    save_code=True,
    job_type="log-data",
)

wandb.log_artifact(
    "/Users/arsalan/Desktop/GWU/GCS/Fall 24/CSCI-LLM/FInal Project/W2S_Safety/trainSst2.jsonl",
    "datasets-stanfordnlp-sst2-train",
    type="train-data",
)

wandb.log_artifact(
    "/Users/arsalan/Desktop/GWU/GCS/Fall 24/CSCI-LLM/FInal Project/W2S_Safety/valSst2.jsonl",
    "datasets-stanfordnlp-sst2-validation",
    type="validation-data",
)
entity = wandb.run.entity

wandb.finish()

VBox(children=(Label(value='0.158 MB of 0.158 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [98]:
wandb.init(
    project="4o-SentimentFT",
    job_type="finetune",
)

artifact_train = wandb.use_artifact(
    f"{entity}/4o-SentimentFT/datasets-stanfordnlp-sst2-train:v0",
    type="train-data",
)
train_file = artifact_train.download("my_data")

train_file

[34m[1mwandb[0m:   1 of 1 files downloaded.  


'my_data'

In [84]:
client.files.create(file=open("trainSst2.jsonl", "rb"), purpose="fine-tune")

FileObject(id='file-5kGqb2vd5ZAQXBbZ71mpHv', bytes=22733019, created_at=1733124770, filename='trainSst2.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [88]:
client.fine_tuning.jobs.create(
    training_file="file-5kGqb2vd5ZAQXBbZ71mpHv",
    model="gpt-4o-2024-08-06",
)

FineTuningJob(id='ftjob-VTYg76FFGBPjVFzRtnXhJEnR', created_at=1733125008, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-4o-2024-08-06', object='fine_tuning.job', organization_id='org-fgT7muXOaba1Ffqr7T0GNYkO', result_files=[], seed=1327836970, status='validating_files', trained_tokens=None, training_file='file-5kGqb2vd5ZAQXBbZ71mpHv', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix=None)

In [90]:
# Bugged that still hasn't been fixed from their side
# from wandb.integration.openai import autolog
# autolog({"project": "4o-SentimentFT"})

from wandb.integration.openai.fine_tuning import WandbLogger

WandbLogger.sync(
    fine_tune_job_id="ftjob-VTYg76FFGBPjVFzRtnXhJEnR",
    openai_client=client,
    project="4o-SentimentFT",
)

[34m[1mwandb[0m: Retrieving fine-tune job...
[34m[1mwandb[0m: Waiting for the OpenAI fine-tuning job to finish training...
[34m[1mwandb[0m: To avoid blocking, you can call `WandbLogger.sync` with `wait_for_job_success=False` after OpenAI training completes.
[34m[1mwandb[0m: Fine-tuning finished, logging metrics, model metadata, and run metadata to Weights & Biases
[34m[1mwandb[0m: Logging training/validation files...


VBox(children=(Label(value='53.505 MB of 53.505 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train_accuracy,▄▇▁▆█▆▃▃▄▇▆▆▇▆▆▆█▇▇▇▄█▄▆▆▇█▇▆█▆▇▇▃▇█▇▇▇▃
train_loss,█▃▂▂▄▁▁▂▂▂▂▁▂▁▁▃▅▂▁▄▂▃▄▂▅▃▁▁▄▁▄▁▃▂▃▃▃▃▃▃

0,1
fine_tuned_model,ft:gpt-4o-2024-08-06...
status,succeeded
train_accuracy,0.9931
train_loss,0.01092


'🎉 wandb sync completed successfully'

In [None]:
# Use this block for a complete evaluation of a model and it spits out the metrics given a model name on the validation dataset
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
)

from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff


def metricsHelper(predictions):
    y_true = df_val["label"].to_list()
    pred_sent4o = [int(value) for value in predictions]

    accuracy = accuracy_score(y_true, pred_sent4o)
    precision = precision_score(y_true, pred_sent4o, pos_label=1)
    recall = recall_score(y_true, pred_sent4o, pos_label=1)
    f1 = f1_score(y_true, pred_sent4o, pos_label=1)
    conf_matrix = confusion_matrix(y_true, pred_sent4o)

    # Display results
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")

    # Confusion Matrix
    print("\nConfusion Matrix:")
    print(conf_matrix)

    # Detailed Report
    print("\nClassification Report:")
    print(
        classification_report(
            y_true, pred_sent4o, target_names=["Negative", "Positive"]
        )
    )


@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def metricsFineTuned(test, model, predictions):

    for _, row in test.iterrows():
        response = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "system",
                    "content": "Analyze this sentences and provide its sentiment. Only use the following two sentiments, positive as 1, negative as 0, as your reply. ",
                },
                {"role": "user", "content": row["sentence"]},
            ],
        )

        answer = response.choices[0].message.content
        predictions.append(answer)


fouro_ft = []
metricsFineTuned(
    df_val, "ft:gpt-4o-2024-08-06:george-washington-university::AZwauCTk", fouro_ft
)
metricsHelper(fouro_ft)

RetryError: RetryError[<Future at 0x31bbe98d0 state=finished raised RateLimitError>]