In [None]:
!pip install -U openai

Collecting openai
  Downloading openai-1.45.1-py3-none-any.whl.metadata (22 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading openai-1.45.1-py3-none-any.whl (374 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.2/374.2 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx-0.27.2-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K   [90m━━

In [None]:
import pandas as pd
# Load the labelled support queries (columns: 'Support Query', 'Top Category',
# 'Sub Category') and preview the first rows.
df = pd.read_csv("/content/bank_support_train.csv")
df.head()

Unnamed: 0,Support Query,Top Category,Sub Category
0,Can you explain the monthly maintenance fee on...,Fees and Charges,Understanding Fees
1,"I was charged a fee for an ATM withdrawal, why?",Fees and Charges,Understanding Fees
2,How do I dispute a transaction fee I believe i...,Fees and Charges,Dispute Charges
3,Are there any fees for using online banking?,Fees and Charges,Understanding Fees
4,What are the charges for a wire transfer?,Fees and Charges,Understanding Fees


## Format the Data

In [None]:
def convert_to_gpt35_format(dataset):
    """Convert labelled support queries into chat-format fine-tuning examples.

    Each row becomes one example with a user message (the query text) and an
    assistant message whose content is a JSON object string holding the row's
    "Top Category" and "Sub Category".

    Args:
        dataset: pandas DataFrame with 'Support Query', 'Top Category' and
            'Sub Category' columns.

    Returns:
        list[dict]: one ``{"messages": [...]}`` dict per row, in row order.

    Raises:
        KeyError: if a required column is missing.
        ValueError: if a category value is NaN (i.e. a missing label).
    """
    # Local import: in a top-to-bottom run this function is first called before
    # the notebook's `import json` cell has executed.
    import json

    fine_tuning_data = []
    for _, row in dataset.iterrows():
        # Serialize with json.dumps instead of string concatenation so quotes,
        # backslashes and control characters in a label are escaped and the
        # assistant reply is always valid JSON. ensure_ascii=False keeps
        # non-ASCII label text verbatim; allow_nan=False rejects missing labels
        # instead of emitting the non-standard token NaN.
        json_response = json.dumps(
            {"Top Category": row['Top Category'], "Sub Category": row['Sub Category']},
            ensure_ascii=False,
            allow_nan=False,
        )
        fine_tuning_data.append({
            "messages": [
                {"role": "user", "content": row['Support Query']},
                {"role": "assistant", "content": json_response}
            ]
        })
    return fine_tuning_data

# Build the chat-format examples from the training CSV and inspect the first one.
# `dataset` is kept around because the stratified split later uses its
# 'Top Category' column as the stratification labels.
dataset = pd.read_csv('/content/bank_support_train.csv')
converted_data = convert_to_gpt35_format(dataset)
converted_data[0]['messages']

[{'role': 'user',
  'content': 'Can you explain the monthly maintenance fee on my account?'},
 {'role': 'assistant',
  'content': '{"Top Category": "Fees and Charges", "Sub Category": "Understanding Fees"}'}]

## Creating Training and Validation Sets

In [None]:
import json
# Sanity check: the assistant reply of the first example must parse as JSON.
json.loads(converted_data[0]['messages'][-1]['content'])

{'Top Category': 'Fees and Charges', 'Sub Category': 'Understanding Fees'}

In [None]:
from sklearn.model_selection import train_test_split

# Stratified split: hold out 20% of the examples for validation, stratifying on
# 'Top Category'. `converted_data` was built row by row from `dataset`, so the
# two are aligned and dataset['Top Category'] can serve as the labels.
train_data, val_data = train_test_split(
    converted_data,
    test_size=0.2,
    stratify=dataset['Top Category'],
    random_state=42  # for reproducibility
)

In [None]:
# Check that the split elements are still plain dicts, as json.dump will need below.
type(train_data[0])

dict

## Creating JSONL Files

In [None]:
def write_to_jsonl(data, file_path):
    """Write an iterable of JSON-serializable records to a JSON Lines file.

    The file at `file_path` is created or truncated; every record is written
    as one JSON document followed by a newline.
    """
    with open(file_path, 'w') as sink:
        for record in data:
            sink.write(f"{json.dumps(record)}\n")


# Output paths (relative to the current working directory) for the two splits;
# the same names are reused when the files are uploaded.
training_file_name = "train.jsonl"
validation_file_name = "val.jsonl"

# One JSON object per line, one file per split.
write_to_jsonl(train_data, training_file_name)
write_to_jsonl(val_data, validation_file_name)

## Uploading Data and Starting the Fine-Tuning Job

In [None]:
from openai import OpenAI
# Never embed the API key in the notebook: it would be saved with the .ipynb.
# Read it from the OPENAI_API_KEY environment variable, and fall back to a
# non-echoing interactive prompt when the variable is not set.
import os
from getpass import getpass

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: "))

In [None]:
# Upload both JSONL files for fine-tuning. Each file is opened in a `with`
# block so the handle is closed as soon as the upload call returns instead of
# staying open for the lifetime of the kernel.
with open(training_file_name, "rb") as training_handle:
    training_file = client.files.create(file=training_handle, purpose="fine-tune")

with open(validation_file_name, "rb") as validation_handle:
    validation_file = client.files.create(file=validation_handle, purpose="fine-tune")

# The returned ids are what the fine-tuning job references.
print("Training file id:", training_file.id)
print("Validation file id:", validation_file.id)

Training file id: file-uir9A6fJMHkiX3BjMNpmM2Ya
Validation file id: file-J5he5Rlth02BmPuRu8pnBbDy


In [None]:
# Label passed as `suffix`; the API echoes it back as `user_provided_suffix`.
suffix_name = "fine-tuned"

# Start the fine-tuning job on the uploaded files. No hyperparameters are
# passed, so the job reports them as 'auto'.
response = client.fine_tuning.jobs.create(
    training_file=training_file.id,
    validation_file=validation_file.id,
    model="gpt-3.5-turbo",
    suffix=suffix_name,
)
response

FineTuningJob(id='ftjob-IQ1wRlJQ2ax6Ao8n7XOtEqw3', created_at=1726590707, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-CSD8fR7sjUBiRL5Gou4dcnmv', result_files=[], seed=1126837919, status='validating_files', trained_tokens=None, training_file='file-uir9A6fJMHkiX3BjMNpmM2Ya', validation_file='file-J5he5Rlth02BmPuRu8pnBbDy', estimated_finish=None, integrations=[], user_provided_suffix='fine-tuned')

In [None]:
# List up to 10 fine-tuning jobs on this account to check their status.
client.fine_tuning.jobs.list(limit=10)

SyncCursorPage[FineTuningJob](data=[FineTuningJob(id='ftjob-IQ1wRlJQ2ax6Ao8n7XOtEqw3', created_at=1726590707, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-CSD8fR7sjUBiRL5Gou4dcnmv', result_files=[], seed=1126837919, status='validating_files', trained_tokens=None, training_file='file-uir9A6fJMHkiX3BjMNpmM2Ya', validation_file='file-J5he5Rlth02BmPuRu8pnBbDy', estimated_finish=None, integrations=[], user_provided_suffix='fine-tuned'), FineTuningJob(id='ftjob-B8ho5ldUmDIO9PP9Cli8wlQG', created_at=1721144035, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-3.5-turbo-0125:personal:fine-tuned:9lectI5X', finished_at=1721144609, hyperparameters=Hyperparameters(n_epochs=3, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-0125', object=

In [None]:
# Fetch the current state of a fine-tuning job by id.
# NOTE(review): this id is hard-coded and is NOT the job created above (whose id
# is in the earlier `response.id`); it points at a previously finished job, and
# it overwrites `response`. Confirm this is intended before reusing the notebook.
response = client.fine_tuning.jobs.retrieve("ftjob-ydO2LPutqXBUx1vQWX68yhNl")
response

FineTuningJob(id='ftjob-ydO2LPutqXBUx1vQWX68yhNl', created_at=1719119215, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-3.5-turbo-0125:personal:yt-tutorial:9d9sERCU', finished_at=1719119772, hyperparameters=Hyperparameters(n_epochs=3, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-CSD8fR7sjUBiRL5Gou4dcnmv', result_files=['file-1lOM4nrdpGczBOQfhOdQyvGE'], seed=819080530, status='succeeded', trained_tokens=9123, training_file='file-SrUTl7yVJVPK1JEQc2H9cksk', validation_file='file-8MmKKz895uTtn6gNrD4dTVkN', estimated_finish=None, integrations=[], user_provided_suffix='yt_tutorial')

In [None]:
# Model name to use for inference, taken from the retrieved job.
# `fine_tuned_model` was None on the freshly created job while it was still
# validating files, so this is only usable for a job that has finished.
fine_tuned_model_id = response.fine_tuned_model
print("\nFine-tuned model id:", fine_tuned_model_id)


Fine-tuned model id: ft:gpt-3.5-turbo-0125:personal:yt-tutorial:9d9sERCU


## Testing the Fine-Tuned Model

In [None]:
def format_test(row):
    """Wrap a row's 'Support Query' text in a single-message chat list.

    Returns a list containing one user message dict, mirroring the user turn
    of the training examples.
    """
    return [{"role": "user", "content": row['Support Query']}]

def predict(test_messages, fine_tuned_model_id):
    """Send one chat-completion request and return the reply text.

    Uses the notebook-level `client`. The request asks for temperature 0 and
    at most 50 completion tokens; the content of the first choice is returned.
    """
    request_options = dict(
        model=fine_tuned_model_id,
        messages=test_messages,
        temperature=0,
        max_tokens=50,
    )
    completion = client.chat.completions.create(**request_options)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
def store_predictions(test_df, fine_tuned_model_id):
    """Predict every row of `test_df` and save the result to predictions.csv.

    Modifies `test_df` in place: a 'Prediction' column is (re)set to None and
    then filled row by row with the model's reply. One request is made per
    row. The frame, including its index, is written to "predictions.csv" in
    the current working directory. Returns None.
    """
    print("fine_tuned_model_id",fine_tuned_model_id)

    # Start from an empty column so the per-row writes below have a target.
    test_df['Prediction'] = None

    for row_label, query_row in test_df.iterrows():
        reply = predict(format_test(query_row), fine_tuned_model_id)
        test_df.at[row_label, 'Prediction'] = reply

    test_df.to_csv("predictions.csv")

In [None]:
# Run the fine-tuned model over the test queries (one API request per row).
# store_predictions adds a 'Prediction' column to test_df and writes predictions.csv.
test_df = pd.read_csv("/content/test_queries.csv")
store_predictions(test_df, fine_tuned_model_id)

fine_tuned_model_id ft:gpt-3.5-turbo-0125:personal:yt-tutorial:9d9sERCU


In [2]:
# Reload the saved predictions; index_col=0 restores the index that
# store_predictions wrote as the first CSV column.
df_accuracy = pd.read_csv("/content/predictions.csv", index_col=0)
df_accuracy.head()

Unnamed: 0,Support Query,Top Category,Sub Category,Prediction
0,Can you explain the monthly maintenance fee on...,Fees and Charges,Understanding Fees,"{""Top Category"": ""Fees and Charges"", ""Sub Cate..."
1,I was charged a fee for an ATM withdrawal,Fees and Charges,Understanding Fees,"{""Top Category"": ""Fees and Charges"", ""Sub Cate..."
2,What are the current interest rates for home l...,Loans and Mortgages,Loan Application,"{""Top Category"": ""Loans and Mortgages"", ""Sub C..."
3,Can I apply for a mortgage loan online?,Loans and Mortgages,Loan Application,"{""Top Category"": ""Loans and Mortgages"", ""Sub C..."
4,Where can I provide feedback about your services?,Customer Service,Feedback and Suggestions,"{""Top Category"": ""Customer Service"", ""Sub Cate..."


In [None]:
# Score the predictions and report accuracy for each label level.
# predictions.csv only carries the raw model reply in 'Prediction', so the
# 'Correct ...' flag columns are derived here when they are not already present,
# instead of relying on columns left behind by an earlier kernel session.
import json


def _parse_prediction(raw):
    """Return the model reply parsed as a dict, or {} if it is not a JSON object."""
    try:
        parsed = json.loads(raw)
    except (TypeError, ValueError):  # missing reply, or malformed/truncated JSON
        return {}
    return parsed if isinstance(parsed, dict) else {}


df_scored = df_accuracy.copy()  # keep the frame displayed earlier unchanged
levels_to_score = [
    level for level in ('Top Category', 'Sub Category')
    if f'Correct {level}' not in df_scored.columns
]
if levels_to_score:
    parsed_predictions = [_parse_prediction(raw) for raw in df_scored['Prediction']]
    for level in levels_to_score:
        # Exact string match: an unparseable reply or a missing key counts as wrong.
        df_scored[f'Correct {level}'] = [
            prediction.get(level) == expected
            for prediction, expected in zip(parsed_predictions, df_scored[level])
        ]

# Mean of the boolean flags is the fraction correct; scale to a percentage.
top_category_accuracy = df_scored['Correct Top Category'].mean() * 100
sub_category_accuracy = df_scored['Correct Sub Category'].mean() * 100

# Human-readable labels for the two accuracies
top_category_accuracy_label = f"Top Category Accuracy: {top_category_accuracy}%"
sub_category_accuracy_label = f"Sub Category Accuracy: {sub_category_accuracy}%"

top_category_accuracy_label, sub_category_accuracy_label

('Top Category Accuracy: 85.0%', 'Sub Category Accuracy: 80.0%')