<a href="https://colab.research.google.com/github/avikumart/LLM-GenAI-Transformers-Notebooks/blob/main/GPT_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -U openai

Collecting openai
  Downloading openai-1.6.1-py3-none-any.whl (225 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 225.4/225.4 kB 4.3 MB/s eta 0:00:00
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.26.0-py3-none-any.whl (75 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 75.9/75.9 kB 7.4 MB/s eta 0:00:00
Collecting typing-extensions<5,>=4.7 (from openai)
  Downloading typing_extensions-4.9.0-py3-none-any.whl (32 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.2-py3-none-any.whl (76 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 76.9/76.9 kB 5.4 MB/s eta 0:00:00
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 58.3/58.3 kB 7.2 MB/s eta 0:00:00

In [3]:
import pandas as pd
# Read the labelled bank-support queries (path under /content, presumably the
# Colab working directory) and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="/content/bank_support_train.csv")
df.head(n=5)

Unnamed: 0,Support Query,Top Category,Sub Category
0,Can you explain the monthly maintenance fee on...,Fees and Charges,Understanding Fees
1,"I was charged a fee for an ATM withdrawal, why?",Fees and Charges,Understanding Fees
2,How do I dispute a transaction fee I believe i...,Fees and Charges,Dispute Charges
3,Are there any fees for using online banking?,Fees and Charges,Understanding Fees
4,What are the charges for a wire transfer?,Fees and Charges,Understanding Fees


In [4]:
# Convert a DataFrame to the gpt-3.5-turbo chat fine-tuning format
def convert_to_gpt35_format(dataset):
    """Turn each row of ``dataset`` into one chat-format fine-tuning example.

    Args:
        dataset: DataFrame with 'Support Query', 'Top Category' and
            'Sub Category' columns (values are expected to be strings).

    Returns:
        A list with one dict per row, each of the form
        ``{"messages": [user_message, assistant_message]}``. The user message
        carries the support query; the assistant message carries the target
        label as a JSON object string with "Top Category" and "Sub Category"
        keys.

    Raises:
        ValueError: if a category value is NaN (``allow_nan=False``).
        TypeError: if a category value is not JSON-serializable.
    """
    # Local import so the function does not depend on a later cell having run.
    import json

    fine_tuning_data = []
    for _, row in dataset.iterrows():
        # json.dumps escapes quotes/backslashes inside the labels, so the
        # target is always valid JSON; ensure_ascii=False keeps non-ASCII
        # label text verbatim instead of \u-escaping it.
        json_response = json.dumps(
            {
                "Top Category": row['Top Category'],
                "Sub Category": row['Sub Category'],
            },
            ensure_ascii=False,
            allow_nan=False,
        )
        fine_tuning_data.append({
            "messages": [
                {"role": "user", "content": row['Support Query']},
                # The expected completion must be an assistant message: that
                # is the role the model is trained to produce and the one
                # read back from chat completions at prediction time.
                {"role": "assistant", "content": json_response},
            ]
        })
    return fine_tuning_data

In [8]:
# Build one fine-tuning example (a {"messages": [...]} dict) per row of df.
converted_data = convert_to_gpt35_format(df)

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the examples for validation, stratifying on 'Top Category'
# so both splits keep the same category mix (assumes that column is a suitable
# stratification key). The fixed random_state makes the split reproducible.
train_data, val_data = train_test_split(
    converted_data, test_size=0.2, random_state=42, stratify=df['Top Category']
)


In [13]:
import json

In [14]:
# Serialize the final examples as JSON Lines files
def write_to_jsonl(data, file_path):
    """Write each entry of ``data`` to ``file_path`` as one JSON document per line.

    The file is opened in text write mode, so any existing content is replaced.
    """
    with open(file_path, 'w') as handle:
        for record in data:
            handle.write(json.dumps(record) + '\n')

# Output locations for the two splits (reused later when uploading the files).
training_file_name, validation_file_name = "train.jsonl", "val.jsonl"

write_to_jsonl(file_path=training_file_name, data=train_data)
write_to_jsonl(file_path=validation_file_name, data=val_data)

In [15]:
# Fine-tuning job: upload the JSONL splits and start a gpt-3.5-turbo fine-tune
import os
from getpass import getpass

from openai import OpenAI

# Take the key from the OPENAI_API_KEY environment variable, or prompt for it,
# rather than hardcoding it: a literal key would be saved with the notebook
# and leaked whenever the file is shared or committed.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
)

# Upload Training and Validation Files.
# The context managers close each file handle once its upload call returns.
with open(training_file_name, "rb") as training_handle:
    training_file = client.files.create(file=training_handle, purpose="fine-tune")
with open(validation_file_name, "rb") as validation_handle:
    validation_file = client.files.create(file=validation_handle, purpose="fine-tune")

# Create Fine-Tuning Job
suffix_name = "yt_tutorial"
response = client.fine_tuning.jobs.create(
    training_file=training_file.id,
    validation_file=validation_file.id,
    model="gpt-3.5-turbo",
    suffix=suffix_name,
)

In [17]:
# evaluate the predictions
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def format_test(row):
    """Wrap a row's 'Support Query' as a one-element list holding a user chat message."""
    return [dict(role="user", content=row['Support Query'])]

def predict(test_messages, fine_tuned_model_id):
    """Send ``test_messages`` to the given model and return the first choice's text.

    Uses the module-level ``client``; the request is made with temperature 0
    and a cap of 50 completion tokens.
    """
    request_options = {
        "model": fine_tuned_model_id,
        "messages": test_messages,
        "temperature": 0,
        "max_tokens": 50,
    }
    completion = client.chat.completions.create(**request_options)
    first_choice = completion.choices[0]
    return first_choice.message.content

def store_predictions(test_df, fine_tuned_model_id):
    """Predict a label for every row of ``test_df`` and save the frame to CSV.

    A 'Prediction' column is added to ``test_df`` in place (initialised to
    None) and filled row by row with the model's reply; the frame is then
    written to "predictions.csv".
    """
    test_df['Prediction'] = None
    for row_label, record in test_df.iterrows():
        model_reply = predict(format_test(record), fine_tuned_model_id)
        test_df.at[row_label, 'Prediction'] = model_reply

    test_df.to_csv("predictions.csv")