In [1]:
import os
import openai
import json
import pandas as pd
from sklearn.model_selection import train_test_split

labelled_sentences = pd.read_excel('training_data.xlsx')
labelled_sentences = labelled_sentences[['Sentence', 'Score']]

# align data type for Score column to string
score_map = {-1:'1', -0.5:'2', 0:'3', 0.5:'4', 1:'5', 'Remove':'Remove'}
labelled_sentences['Score'] = labelled_sentences['Score'].apply(lambda x: score_map[x])

In [2]:
# functions to perform data augmentation

import copy
import nlpaug.augmenter.word as naw
aug = naw.SynonymAug()

def word_substitution(sentence, augmenter):
    augmented_text = augmenter.augment(sentence)
    return augmented_text[0]

def data_augmentation(train_data):
    train_temp = copy.deepcopy(train_data)
    train_temp['Sentence'] = train_temp['Sentence'].apply(lambda x: word_substitution(x, aug))
    train_data = pd.concat([train_data, train_temp], ignore_index=True, axis=0)
    return train_data

  from .autonotebook import tqdm as notebook_tqdm


In [24]:
# function to add prompt to sentences

def preprocess(df):
    new_df = pd.DataFrame(columns=['prompt', 'completion'])
    # new_df['prompt'] = df['Sentence'].apply(lambda x: 'Give a score to the following sentence; give a score of 9 when the sentence is not relevant for financial sentiment analysis; give a score of 1 to 5, in step of 1, where 1 represents the most dovish sentiment, and 5 representing the most hawkish sentiment. ' + x + ' \n\n###\n\n')
    new_df['prompt'] = df['Sentence'].apply(lambda x: x + ' \n\n###\n\n')
    new_df['completion'] = df['Score']
    new_df['completion'] = new_df['completion'].apply(lambda x: ' ' + str(x) + '###')  

    return new_df

### Ada - No data augmentation

In [4]:
# split data
train, test = train_test_split(labelled_sentences, test_size=0.2, random_state=23, stratify=labelled_sentences['Score'])
train_new = preprocess(train)

# split train data for train and validate
train_gpt, test_gpt = train_test_split(train_new, test_size=0.2, random_state=23)

# save training and validation data to jsonl files
train_gpt.to_json('train_gpt_no_aug.jsonl', orient='records', lines=True)
test_gpt.to_json('val_gpt_no_aug.jsonl', orient='records', lines=True)

In [6]:
# create fine tunes model
!openai api fine_tunes.create -t "train_gpt_no_aug.jsonl" -v "val_gpt_no_aug.jsonl" --compute_classification_metrics --classification_n_classes 6 -m ada --n_epochs 5 --suffix "nlp_ada_no_aug"

Uploaded file from train_gpt_no_aug.jsonl: file-mmwJWcOkSEQ8qMfUdjmdj7rE
Uploaded file from val_gpt_no_aug.jsonl: file-X7G0h06fFMmA67dpoWLed6Zl
Created fine-tune: ft-weBlhPNmrqDGI86s9muuJYiL
Streaming events until fine-tuning is complete...

(Ctrl-C will interrupt the stream, but not cancel the fine-tune)
[2023-06-16 13:02:06] Created fine-tune: ft-weBlhPNmrqDGI86s9muuJYiL
[2023-06-16 13:03:00] Fine-tune costs $0.01
[2023-06-16 13:03:00] Fine-tune enqueued. Queue number: 1




Upload progress:   0%|          | 0.00/30.6k [00:00<?, ?it/s]
Upload progress: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 30.6k/30.6k [00:00<00:00, 30.7Mit/s]

Upload progress:   0%|          | 0.00/8.31k [00:00<?, ?it/s]
Upload progress: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8.31k/8.31k [00:00<00:00, 9.04Mit/s]


In [9]:
!openai api fine_tunes.follow -i ft-weBlhPNmrqDGI86s9muuJYiL

[2023-06-16 13:02:06] Created fine-tune: ft-weBlhPNmrqDGI86s9muuJYiL
[2023-06-16 13:03:00] Fine-tune costs $0.01
[2023-06-16 13:03:00] Fine-tune enqueued. Queue number: 1
[2023-06-16 13:03:17] Fine-tune is in the queue. Queue number: 0
[2023-06-16 13:03:22] Fine-tune started
[2023-06-16 13:03:57] Completed epoch 1/5
[2023-06-16 13:04:19] Completed epoch 2/5
[2023-06-16 13:04:39] Completed epoch 3/5
[2023-06-16 13:04:58] Completed epoch 4/5
[2023-06-16 13:05:18] Completed epoch 5/5
[2023-06-16 13:05:36] Uploaded model: ada:ft-personal:nlp-ada-no-aug-2023-06-16-05-05-35
[2023-06-16 13:05:37] Uploaded result file: file-YsjQiZsArgdY4g2iLrUSeVNu
[2023-06-16 13:05:37] Fine-tune succeeded

Job complete! Status: succeeded ðŸŽ‰
Try out your fine-tuned model:

openai api completions.create -m ada:ft-personal:nlp-ada-no-aug-2023-06-16-05-05-35 -p <YOUR_PROMPT>


In [10]:
!openai api fine_tunes.results -i ft-weBlhPNmrqDGI86s9muuJYiL > result_no_augment.csv

In [11]:
results = pd.read_csv('result_no_augment.csv')
results[results['classification/accuracy'].notnull()]

Unnamed: 0,step,elapsed_tokens,elapsed_examples,training_loss,training_sequence_accuracy,training_token_accuracy,validation_loss,validation_sequence_accuracy,validation_token_accuracy,classification/accuracy,classification/weighted_f1_score
128,129,6217,129,0.033701,1.0,1.0,0.061983,0.0,0.5,0.21875,0.089744
258,259,12371,259,0.046682,0.0,0.5,,,,0.40625,0.371429
385,386,18570,386,0.022546,1.0,1.0,,,,0.4375,0.404474
516,517,24709,517,0.022717,1.0,1.0,,,,0.34375,0.358681
641,642,30706,642,0.009186,1.0,1.0,,,,0.40625,0.447868


In [None]:
ft_model = 'ada:ft-personal:nlp-prod-ada-2023-06-14-03-38-06'
test_sentences = test_new['Sentence'].tolist()
y_pred = []

for s in test_sentences:
    res = openai.Completion.create(model=ft_model, prompt=s, max_tokens=1, temperature=0, logprobs=2)
    classify = res['choices'][0]['text'].strip()
    y_pred.append(classify)

### Ada + Data augmentation: Paraphrase

In [1]:
import copy
import torch
import pandas as pd
import openai
from sklearn.model_selection import train_test_split
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

model_name = 'tuner007/pegasus_paraphrase'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
labelled_sentences = pd.read_excel('training_data.xlsx')
labelled_sentences = labelled_sentences[['Sentence', 'Score']]

# align data type for Score column to string
score_map = {-1:'1', -0.5:'2', 0:'3', 0.5:'4', 1:'5', 'Remove':'Remove'}
labelled_sentences['Score'] = labelled_sentences['Score'].apply(lambda x: score_map[x])

# split data
train, test = train_test_split(labelled_sentences, test_size=0.2, random_state=23, stratify=labelled_sentences['Score'])

In [3]:
train['Sentence'].str.split().str.len().max()

99

In [16]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

device = "cpu"
tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base").to(device)

def paraphrase(
    sentence,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=1,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7,
    max_length=128
):
    input_ids = tokenizer(
        f'paraphrase: {sentence}',
        return_tensors="pt", padding="longest",
        max_length=max_length,
        truncation=True,
    ).input_ids
    
    outputs = model.generate(
        input_ids, temperature=temperature, repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams, num_beam_groups=num_beam_groups,
        max_length=max_length, diversity_penalty=diversity_penalty
    )

    res = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return res[0]

In [19]:
train_temp = copy.deepcopy(train)
train_temp['Sentence'] = train_temp['Sentence'].apply(lambda x: paraphrase(x))

In [22]:
train= pd.concat([train, train_temp], ignore_index=True, axis=0)
train.to_csv('train_paraphrase.csv', index=False)

In [25]:
train_new = preprocess(train)

# split train data for train and validate
train_gpt, test_gpt = train_test_split(train_new, test_size=0.2, random_state=23)

# save training and validation data to jsonl files
train_gpt.to_json('train_gpt_paraphrase.jsonl', orient='records', lines=True)
test_gpt.to_json('val_gpt_paraphrase.jsonl', orient='records', lines=True)

In [27]:
# create fine tunes model
!openai api fine_tunes.create -t "train_gpt_paraphrase.jsonl" -v "val_gpt_paraphrase.jsonl" --compute_classification_metrics --classification_n_classes 6 -m ada --n_epochs 5 --suffix "nlp_ada_para"

Uploaded file from train_gpt_paraphrase.jsonl: file-fYMjZv8tiw9A5z8Vdg2X9yg7
Uploaded file from val_gpt_paraphrase.jsonl: file-rrjOKshDbZXoAtoXfzfwPnI5
Created fine-tune: ft-JLwO9bVsAC5GzeE8ZgOldJxt
Streaming events until fine-tuning is complete...

(Ctrl-C will interrupt the stream, but not cancel the fine-tune)
[2023-06-16 14:31:49] Created fine-tune: ft-JLwO9bVsAC5GzeE8ZgOldJxt
[2023-06-16 14:32:35] Fine-tune costs $0.02
[2023-06-16 14:32:35] Fine-tune enqueued. Queue number: 0




Upload progress:   0%|          | 0.00/57.9k [00:00<?, ?it/s]
Upload progress: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 57.9k/57.9k [00:00<?, ?it/s]

Upload progress:   0%|          | 0.00/15.1k [00:00<?, ?it/s]
Upload progress: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 15.1k/15.1k [00:00<00:00, 15.3Mit/s]


In [37]:
!openai api fine_tunes.follow -i ft-JLwO9bVsAC5GzeE8ZgOldJxt

[2023-06-16 14:31:49] Created fine-tune: ft-JLwO9bVsAC5GzeE8ZgOldJxt
[2023-06-16 14:32:35] Fine-tune costs $0.02
[2023-06-16 14:32:35] Fine-tune enqueued. Queue number: 0
[2023-06-16 14:42:37] Fine-tune started
[2023-06-16 14:43:33] Completed epoch 1/5
[2023-06-16 14:44:14] Completed epoch 2/5
[2023-06-16 14:44:55] Completed epoch 3/5
[2023-06-16 14:45:36] Completed epoch 4/5
[2023-06-16 14:46:16] Completed epoch 5/5
[2023-06-16 14:46:41] Uploaded model: ada:ft-personal:nlp-ada-para-2023-06-16-06-46-41
[2023-06-16 14:46:42] Uploaded result file: file-g6RlFS3uJ4vjttH8nwz15XpV
[2023-06-16 14:46:42] Fine-tune succeeded

Job complete! Status: succeeded ðŸŽ‰
Try out your fine-tuned model:

openai api completions.create -m ada:ft-personal:nlp-ada-para-2023-06-16-06-46-41 -p <YOUR_PROMPT>


In [38]:
!openai api fine_tunes.results -i ft-JLwO9bVsAC5GzeE8ZgOldJxt > result_paraphrase.csv

In [44]:
results = pd.read_csv('result_paraphrase.csv')
results[results['classification/accuracy'].notnull()]

Unnamed: 0,step,elapsed_tokens,elapsed_examples,training_loss,training_sequence_accuracy,training_token_accuracy,validation_loss,validation_sequence_accuracy,validation_token_accuracy,classification/accuracy,classification/weighted_f1_score
257,258,11378,258,0.045614,1.0,1.0,,,,0.34375,0.262419
513,514,22770,514,0.026385,1.0,1.0,,,,0.515625,0.435863
771,772,34148,772,0.041208,0.0,0.5,,,,0.59375,0.579313
1029,1030,45534,1030,0.009556,1.0,1.0,,,,0.71875,0.710382
1280,1281,56601,1281,0.015705,1.0,1.0,0.083623,0.0,0.5,0.6875,0.663262


### Ada + Data augmentation: Random word substitution

In [60]:
labelled_sentences = pd.read_excel('training_data.xlsx')
labelled_sentences = labelled_sentences[['Sentence', 'Score']]

# align data type for Score column to string
score_map = {-1:'1', -0.5:'2', 0:'3', 0.5:'4', 1:'5', 'Remove':'Remove'}
labelled_sentences['Score'] = labelled_sentences['Score'].apply(lambda x: score_map[x])

# split data
train, test = train_test_split(labelled_sentences, test_size=0.2, random_state=23, stratify=labelled_sentences['Score'])

In [61]:
# Random Word Augmenter
# swaps words randomly in sentence
import nlpaug.augmenter.word as naw
aug = naw.RandomWordAug(action="swap")

train_temp = copy.deepcopy(train)
train_temp['Sentence'] = train_temp['Sentence'].apply(lambda x: aug.augment(x)[0])
train= pd.concat([train, train_temp], ignore_index=True, axis=0)

In [62]:
train_new = preprocess(train)

# split train data for train and validate
train_gpt, test_gpt = train_test_split(train_new, test_size=0.2, random_state=23)

# save training and validation data to jsonl files
train_gpt.to_json('train_gpt_random_swap.jsonl', orient='records', lines=True)
test_gpt.to_json('val_gpt_random_swap.jsonl', orient='records', lines=True)

In [43]:
# create fine tunes model
!openai api fine_tunes.create -t "train_gpt_random_swap.jsonl" -v "val_gpt_random_swap.jsonl" --compute_classification_metrics --classification_n_classes 6 -m ada --n_epochs 5 --suffix "nlp_ada_random_swap"

Uploaded file from train_gpt_random_swap.jsonl: file-XXY97Qadf8qr8AGeL18PTtDg
Uploaded file from val_gpt_random_swap.jsonl: file-wLxbe8dI43l2mBs2ozBgvKn9
Created fine-tune: ft-33DwIEilkbEeYis4grqFv0mr
Streaming events until fine-tuning is complete...

(Ctrl-C will interrupt the stream, but not cancel the fine-tune)
[2023-06-16 14:51:41] Created fine-tune: ft-33DwIEilkbEeYis4grqFv0mr




Upload progress:   0%|          | 0.00/61.8k [00:00<?, ?it/s]
Upload progress: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 61.8k/61.8k [00:00<00:00, 61.8Mit/s]

Upload progress:   0%|          | 0.00/16.0k [00:00<?, ?it/s]
Upload progress: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 16.0k/16.0k [00:00<?, ?it/s]


In [54]:
!openai api fine_tunes.follow -i ft-33DwIEilkbEeYis4grqFv0mr

[2023-06-16 14:51:41] Created fine-tune: ft-33DwIEilkbEeYis4grqFv0mr
[2023-06-16 14:53:03] Fine-tune costs $0.02
[2023-06-16 14:53:04] Fine-tune enqueued. Queue number: 0
[2023-06-16 15:03:08] Fine-tune started
[2023-06-16 15:04:03] Completed epoch 1/5
[2023-06-16 15:04:44] Completed epoch 2/5
[2023-06-16 15:05:24] Completed epoch 3/5
[2023-06-16 15:06:05] Completed epoch 4/5
[2023-06-16 15:06:45] Completed epoch 5/5
[2023-06-16 15:07:12] Uploaded model: ada:ft-personal:nlp-ada-random-swap-2023-06-16-07-07-12
[2023-06-16 15:07:13] Uploaded result file: file-XYhukLHDQniO6rxenn0LIWau
[2023-06-16 15:07:13] Fine-tune succeeded

Job complete! Status: succeeded ðŸŽ‰
Try out your fine-tuned model:

openai api completions.create -m ada:ft-personal:nlp-ada-random-swap-2023-06-16-07-07-12 -p <YOUR_PROMPT>


In [55]:
!openai api fine_tunes.results -i ft-33DwIEilkbEeYis4grqFv0mr > result_random_swap.csv

In [56]:
results = pd.read_csv('result_random_swap.csv')
results[results['classification/accuracy'].notnull()]

Unnamed: 0,step,elapsed_tokens,elapsed_examples,training_loss,training_sequence_accuracy,training_token_accuracy,validation_loss,validation_sequence_accuracy,validation_token_accuracy,classification/accuracy,classification/weighted_f1_score
256,257,12369,257,0.059696,0.0,0.5,0.062358,0.0,0.5,0.203125,0.09278
513,514,24698,514,0.044197,1.0,1.0,,,,0.671875,0.620056
769,770,37074,770,0.02531,1.0,1.0,,,,0.671875,0.635237
1026,1027,49387,1027,0.005591,1.0,1.0,,,,0.828125,0.818468
1281,1282,61634,1282,0.022434,1.0,1.0,,,,0.859375,0.857551


### Ada + Data augmentation: Wordnet word substitution

In [63]:
labelled_sentences = pd.read_excel('training_data.xlsx')
labelled_sentences = labelled_sentences[['Sentence', 'Score']]

# align data type for Score column to string
score_map = {-1:'1', -0.5:'2', 0:'3', 0.5:'4', 1:'5', 'Remove':'Remove'}
labelled_sentences['Score'] = labelled_sentences['Score'].apply(lambda x: score_map[x])

# split data
train, test = train_test_split(labelled_sentences, test_size=0.2, random_state=23, stratify=labelled_sentences['Score'])

In [64]:
# Synonym Augmenter
# substitution by WordNet's synonym
aug = naw.SynonymAug(aug_src='wordnet')

train_temp = copy.deepcopy(train)
train_temp['Sentence'] = train_temp['Sentence'].apply(lambda x: aug.augment(x)[0])
train= pd.concat([train, train_temp], ignore_index=True, axis=0)

In [65]:
train_new = preprocess(train)

# split train data for train and validate
train_gpt, test_gpt = train_test_split(train_new, test_size=0.2, random_state=23)

# save training and validation data to jsonl files
train_gpt.to_json('train_gpt_wordnet_swap.jsonl', orient='records', lines=True)
test_gpt.to_json('val_gpt_wordnet_swap.jsonl', orient='records', lines=True)

In [66]:
# create fine tunes model
!openai api fine_tunes.create -t "train_gpt_wordnet_swap.jsonl" -v "val_gpt_wordnet_swap.jsonl" --compute_classification_metrics --classification_n_classes 6 -m ada --n_epochs 5 --suffix "nlp_ada_wordnet_swap"

Uploaded file from train_gpt_wordnet_swap.jsonl: file-UKlKBKQLMMsqZqtaJXZ0rfhf
Uploaded file from val_gpt_wordnet_swap.jsonl: file-9qFuTpmBWmB4fXVOSbsHlFDF
Created fine-tune: ft-X1H6TIUiv20CjEBEs7uI1sPH
Streaming events until fine-tuning is complete...

(Ctrl-C will interrupt the stream, but not cancel the fine-tune)
[2023-06-16 15:11:36] Created fine-tune: ft-X1H6TIUiv20CjEBEs7uI1sPH
[2023-06-16 15:12:19] Fine-tune costs $0.02
[2023-06-16 15:12:19] Fine-tune enqueued. Queue number: 0
[2023-06-16 15:12:20] Fine-tune started




Upload progress:   0%|          | 0.00/63.1k [00:00<?, ?it/s]
Upload progress: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 63.1k/63.1k [00:00<00:00, 63.2Mit/s]

Upload progress:   0%|          | 0.00/16.4k [00:00<?, ?it/s]
Upload progress: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 16.4k/16.4k [00:00<?, ?it/s]


In [69]:
!openai api fine_tunes.follow -i ft-X1H6TIUiv20CjEBEs7uI1sPH

[2023-06-16 15:11:36] Created fine-tune: ft-X1H6TIUiv20CjEBEs7uI1sPH
[2023-06-16 15:12:19] Fine-tune costs $0.02
[2023-06-16 15:12:19] Fine-tune enqueued. Queue number: 0
[2023-06-16 15:12:20] Fine-tune started
[2023-06-16 15:13:16] Completed epoch 1/5
[2023-06-16 15:13:58] Completed epoch 2/5
[2023-06-16 15:14:38] Completed epoch 3/5
[2023-06-16 15:15:18] Completed epoch 4/5
[2023-06-16 15:15:58] Completed epoch 5/5
[2023-06-16 15:16:25] Uploaded model: ada:ft-personal:nlp-ada-wordnet-swap-2023-06-16-07-16-24
[2023-06-16 15:16:26] Uploaded result file: file-TxLaU4z4RgPTHX9xnM2mvvKd
[2023-06-16 15:16:26] Fine-tune succeeded

Job complete! Status: succeeded ðŸŽ‰
Try out your fine-tuned model:

openai api completions.create -m ada:ft-personal:nlp-ada-wordnet-swap-2023-06-16-07-16-24 -p <YOUR_PROMPT>


In [70]:
!openai api fine_tunes.results -i ft-X1H6TIUiv20CjEBEs7uI1sPH > result_wordnet_swap.csv

In [71]:
results = pd.read_csv('result_wordnet_swap.csv')
results[results['classification/accuracy'].notnull()]

Unnamed: 0,step,elapsed_tokens,elapsed_examples,training_loss,training_sequence_accuracy,training_token_accuracy,validation_loss,validation_sequence_accuracy,validation_token_accuracy,classification/accuracy,classification/weighted_f1_score
258,259,12795,259,0.052101,0.0,0.5,,,,0.390625,0.310119
516,517,25445,517,0.066325,0.0,0.5,,,,0.609375,0.542972
771,772,38140,772,0.021665,1.0,1.0,,,,0.65625,0.582568
1028,1029,50781,1029,0.019316,1.0,1.0,,,,0.8125,0.811083
1282,1283,63363,1283,0.022246,1.0,1.0,,,,0.828125,0.825057


### Ada + Data augmentation: Google word substitution

In [72]:
labelled_sentences = pd.read_excel('training_data.xlsx')
labelled_sentences = labelled_sentences[['Sentence', 'Score']]

# align data type for Score column to string
score_map = {-1:'1', -0.5:'2', 0:'3', 0.5:'4', 1:'5', 'Remove':'Remove'}
labelled_sentences['Score'] = labelled_sentences['Score'].apply(lambda x: score_map[x])

# split data
train, test = train_test_split(labelled_sentences, test_size=0.2, random_state=23, stratify=labelled_sentences['Score'])

In [73]:
# Word Embeddings Augmenter
# substitution by word similarity
aug = naw.WordEmbsAug(
    model_type='word2vec', model_path='data/GoogleNews-vectors-negative300.bin',
    action="substitute")

train_temp = copy.deepcopy(train)
train_temp['Sentence'] = train_temp['Sentence'].apply(lambda x: aug.augment(x)[0])
train= pd.concat([train, train_temp], ignore_index=True, axis=0)

In [74]:
train_new = preprocess(train)

# split train data for train and validate
train_gpt, test_gpt = train_test_split(train_new, test_size=0.2, random_state=23)

# save training and validation data to jsonl files
train_gpt.to_json('train_gpt_google_swap.jsonl', orient='records', lines=True)
test_gpt.to_json('val_gpt_google_swap.jsonl', orient='records', lines=True)

In [75]:
# create fine tunes model
!openai api fine_tunes.create -t "train_gpt_google_swap.jsonl" -v "val_gpt_google_swap.jsonl" --compute_classification_metrics --classification_n_classes 6 -m ada --n_epochs 5 --suffix "nlp_ada_google"

Uploaded file from train_gpt_google_swap.jsonl: file-12MnQn3zQ41wBz9r0HbZGA9M
Uploaded file from val_gpt_google_swap.jsonl: file-iJGa6vzmz8PJOEuFPwyLRgeU
Created fine-tune: ft-uzVc3K7S90WgvMgifKU0fbyP
Streaming events until fine-tuning is complete...

(Ctrl-C will interrupt the stream, but not cancel the fine-tune)
[2023-06-16 15:21:17] Created fine-tune: ft-uzVc3K7S90WgvMgifKU0fbyP

Stream interrupted (client disconnected).
To resume the stream, run:

  openai api fine_tunes.follow -i ft-uzVc3K7S90WgvMgifKU0fbyP




Upload progress:   0%|          | 0.00/67.7k [00:00<?, ?it/s]
Upload progress: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 67.7k/67.7k [00:00<?, ?it/s]

Upload progress:   0%|          | 0.00/17.6k [00:00<?, ?it/s]
Upload progress: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 17.6k/17.6k [00:00<00:00, 17.0Mit/s]


In [76]:
!openai api fine_tunes.follow -i ft-uzVc3K7S90WgvMgifKU0fbyP

[2023-06-16 15:21:17] Created fine-tune: ft-uzVc3K7S90WgvMgifKU0fbyP
[2023-06-16 15:22:30] Fine-tune costs $0.03
[2023-06-16 15:22:30] Fine-tune enqueued. Queue number: 1
[2023-06-16 15:22:32] Fine-tune started
[2023-06-16 15:23:28] Completed epoch 1/5
[2023-06-16 15:24:10] Completed epoch 2/5
[2023-06-16 15:24:50] Completed epoch 3/5
[2023-06-16 15:25:30] Completed epoch 4/5
[2023-06-16 15:26:11] Completed epoch 5/5
[2023-06-16 15:26:36] Uploaded model: ada:ft-personal:nlp-ada-google-2023-06-16-07-26-36
[2023-06-16 15:26:37] Uploaded result file: file-ltBcgrOrJFSeUHs0XIyE62Qo
[2023-06-16 15:26:37] Fine-tune succeeded

Job complete! Status: succeeded ðŸŽ‰
Try out your fine-tuned model:

openai api completions.create -m ada:ft-personal:nlp-ada-google-2023-06-16-07-26-36 -p <YOUR_PROMPT>


In [77]:
!openai api fine_tunes.results -i ft-uzVc3K7S90WgvMgifKU0fbyP > result_google_swap.csv

In [78]:
results = pd.read_csv('result_google_swap.csv')
results[results['classification/accuracy'].notnull()]

Unnamed: 0,step,elapsed_tokens,elapsed_examples,training_loss,training_sequence_accuracy,training_token_accuracy,validation_loss,validation_sequence_accuracy,validation_token_accuracy,classification/accuracy,classification/weighted_f1_score
256,257,14857,257,0.087228,0.0,0.5,0.041054,1.0,1.0,0.234375,0.106061
513,514,29706,514,0.082291,0.0,0.5,,,,0.296875,0.186984
770,771,44563,771,0.008254,1.0,1.0,,,,0.5625,0.510378
1026,1027,59379,1027,0.034236,1.0,1.0,,,,0.703125,0.637904
1281,1282,74082,1282,0.007746,1.0,1.0,,,,0.78125,0.777509


### Evaluate

In [138]:
from sklearn.metrics import classification_report, balanced_accuracy_score
import numpy as np

def evaluate_model(model_name, ft_model, test_data):
    test_sentences = test_data['Sentence'].tolist()
    score_true = test_data['Score'].tolist()
    score_pred = []

    for s in test_sentences:
        res = openai.Completion.create(model = ft_model, 
                                       prompt = s + ' \n\n###\n\n', 
                                       max_tokens = 1, 
                                       temperature = 0)
        classify = res['choices'][0]['text'].strip()
        score_pred.append(classify)

    # map back to original labels
    mapping = {'1':'-1', '2':'-0.5', '3':'0', '4':'0.5', '5':'1', 'Remove':'Remove'}
    score_true = [mapping[score] for score in score_true]
    score_pred = [mapping[score] for score in score_pred]

    print(model_name)
    print(classification_report(score_true, score_pred))
    print('balanced accuracy: ', balanced_accuracy_score(score_true, score_pred))

    # calculate mean squared error by excluding 'Remove' classes
    ordinal_label_idx = []
    for idx, cls in enumerate(list):
        if cls != 'Remove':
            ordinal_label_idx.append(idx)

    score_true_modified = [float(cls) for index, cls in enumerate(score_true) if index in ordinal_label_idx]
    score_pred_modified = [cls for index, cls in enumerate(score_pred) if index in ordinal_label_idx]
    score_pred_modified = [float(cls) if cls != 'Remove' else 0 for cls in score_pred_modified]

    squared_diff = [(actual - predicted) ** 2 for actual, predicted in zip(score_true_modified, score_pred_modified)]
    mse = np.mean(squared_diff)
    print('Mean squared error:', round(mse,2))

    print('_ '*30)

In [140]:
evaluate_model("Ada (no data augmentation)",'ada:ft-personal:nlp-ada-no-aug-2023-06-16-05-05-35', test)
evaluate_model("Ada with paraphrase",'ada:ft-personal:nlp-ada-para-2023-06-16-06-46-41', test)
evaluate_model("Ada with Random word sub",'ada:ft-personal:nlp-ada-random-swap-2023-06-16-07-07-12', test)
evaluate_model("Ada with Wordnet word sub",'ada:ft-personal:nlp-ada-wordnet-swap-2023-06-16-07-16-24', test)
evaluate_model("Ada with Google word sub",'ada:ft-personal:nlp-ada-google-2023-06-16-07-26-36', test)

Ada (no data augmentation)
              precision    recall  f1-score   support

        -0.5       0.67      0.50      0.57         4
          -1       0.50      0.50      0.50         8
           0       0.65      0.92      0.76        12
         0.5       0.60      0.43      0.50         7
           1       0.60      0.50      0.55         6
      Remove       1.00      0.67      0.80         3

    accuracy                           0.62        40
   macro avg       0.67      0.59      0.61        40
weighted avg       0.63      0.62      0.61        40

balanced accuracy:  0.5853174603174602
Mean squared error: 0.14
______________________________
Ada with paraphrase
              precision    recall  f1-score   support

        -0.5       0.50      0.25      0.33         4
          -1       0.71      0.62      0.67         8
           0       0.79      0.92      0.85        12
         0.5       0.44      0.57      0.50         7
           1       0.50      0.50      0.50 