In [1]:
import os
import json
import pandas as pd
from sklearn.metrics import accuracy_score,classification_report
from sklearn.model_selection import train_test_split
from openai import OpenAI


client = OpenAI(
  api_key=api_key,
)

In [None]:
# https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset

In [2]:
train = pd.read_csv('train.csv',encoding='utf-8').sample(100) # change here to use all data
test = pd.read_csv('test.csv',encoding='utf-8')

In [3]:
train,val = train_test_split(train, test_size=0.20, random_state=42)

In [4]:
train['label_text'] = train['label'].apply(lambda x: 'canonical' if x==0 else 'noncanonical')
val['label_text'] = val['label'].apply(lambda x: 'canonical' if x==0 else 'noncanonical')

In [5]:
prepare_text = """You are an Italian language expert. Analyze the following text and tell me whether it is canonical or noncanonical.
TEXT: {text}

Reply with either 'canonical' or 'noncanonical' as a single word.
""".strip()


In [6]:
training_file_name = "training_data.jsonl"
validation_file_name = "validation_data.jsonl"

def prepare_data(df_data, final_file_name):
    with open(final_file_name, 'w',encoding='utf-8') as outfile:
        for _,line in df_data.iterrows():
            temp = {'prompt': prepare_text.format(text=line['text']),
                   'completion':line['label_text']}
            json.dump(temp, outfile)
            outfile.write('\n')

prepare_data(train, "training_data.jsonl")
prepare_data(val, "validation_data.jsonl")

In [7]:
training_file_id = client.files.create(
  file=open(training_file_name, "rb"),
  purpose="fine-tune"
)

validation_file_id = client.files.create(
  file=open(validation_file_name, "rb"),
  purpose="fine-tune"
)

print(f"Training File ID: {training_file_id}")
print(f"Validation File ID: {validation_file_id}")

Training File ID: FileObject(id='file-osyD8EMpiaX3UrfSzZrRdlim', bytes=33838, created_at=1723755249, filename='training_data.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)
Validation File ID: FileObject(id='file-cbfzXN6JM2X2SxlRNJa0Fa46', bytes=7553, created_at=1723755250, filename='validation_data.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)


In [19]:
response = client.fine_tuning.jobs.create(
    training_file=training_file_id.id, 
    validation_file=validation_file_id.id,
    model="davinci-002"
#   hyperparameters={
#     "n_epochs": 4,
#     "batch_size": 5,
#     "learning_rate_multiplier": 0.3
#   }
)
job_id = response.id
status = response.status

print(f'Fine-tunning model with jobID: {job_id}.')
print(f"Training Response: {response}")
print(f"Training Status: {status}")

Fine-tunning model with jobID: ftjob-BGVNvsB6VCzBGf95UJJ2eoK8.
Training Response: FineTuningJob(id='ftjob-BGVNvsB6VCzBGf95UJJ2eoK8', created_at=1723756035, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='davinci-002', object='fine_tuning.job', organization_id='org-IHJydIZsYemK29w90Hh71BHX', result_files=[], seed=1582627931, status='validating_files', trained_tokens=None, training_file='file-osyD8EMpiaX3UrfSzZrRdlim', validation_file='file-cbfzXN6JM2X2SxlRNJa0Fa46', estimated_finish=None, integrations=[], user_provided_suffix=None)
Training Status: validating_files


In [20]:
import signal
import datetime

def signal_handler(sig, frame):
    status = client.fine_tuning.jobs.retrieve(job_id).status
    print(f"Stream interrupted. Job is still {status}.")
    return


print(f"Streaming events for the fine-tuning job: {job_id}")

signal.signal(signal.SIGINT, signal_handler)

events = client.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id)
try:
    for event in events:
        print(
            f'{datetime.datetime.fromtimestamp(event.created_at)} {event.message}'
        )
except Exception:
    print("Stream interrupted (client disconnected).")

Streaming events for the fine-tuning job: ftjob-BGVNvsB6VCzBGf95UJJ2eoK8
2024-08-16 02:37:15 Validating training file: file-osyD8EMpiaX3UrfSzZrRdlim and validation file: file-cbfzXN6JM2X2SxlRNJa0Fa46
2024-08-16 02:37:15 Created fine-tuning job: ftjob-BGVNvsB6VCzBGf95UJJ2eoK8


In [21]:
import time

status = client.fine_tuning.jobs.retrieve(job_id).status
if status not in ["succeeded", "failed"]:
    print(f"Job not in terminal status: {status}. Waiting.")
    while status not in ["succeeded", "failed"]:
        time.sleep(2)
        status = client.fine_tuning.jobs.retrieve(job_id).status
        print(f"Status: {status}")
else:
    print(f"Finetune job {job_id} finished with status: {status}")
print("Checking other finetune jobs in the subscription.")
result = client.fine_tuning.jobs.list()
print(f"Found {len(result.data)} finetune jobs.")

Job not in terminal status: validating_files. Waiting.
Status: validating_files
Status: validating_files
Status: validating_files
Status: validating_files
Status: validating_files
Status: validating_files
Status: validating_files
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
Status: running
St

In [22]:
# Retrieve the finetuned model
fine_tuned_model = result.data[0].fine_tuned_model
print(fine_tuned_model)

ft:davinci-002:personal::9wc2wRCT


# Test using finetuned model

In [23]:
def classify_text_with_context(tuned_model,text):
    answer = client.completions.create(
      model= tuned_model,
      prompt=prepare_text.format(text=text)
    )
    return 'canonical' if answer.choices[0].text.startswith('canonical') else 'noncanonical'

In [24]:
query = test['text'][14]
classify_text_with_context(fine_tuned_model,query)

'canonical'

# Test on 20 test data

In [25]:
from tqdm import tqdm

In [26]:
pred = []
for query in tqdm(test['text'][:20]): # change here to use all test data
    result = classify_text_with_context(fine_tuned_model,query)
    pred.append(result)

100%|███████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.03s/it]


In [27]:
pred = [0 if i=='canonical' else 1 for i in pred]

In [28]:
pred

[0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0]

In [29]:
print(classification_report(test['label'][:20],pred)) # change here to use all test data

              precision    recall  f1-score   support

           0       0.69      0.69      0.69        13
           1       0.43      0.43      0.43         7

    accuracy                           0.60        20
   macro avg       0.56      0.56      0.56        20
weighted avg       0.60      0.60      0.60        20

