In [None]:
!pip install transformers
!pip install openai
from transformers import GPT2TokenizerFast
import numpy as np
import pandas as pd
import os
import openai
import time
from transformers import GPT2TokenizerFast
# GPT-2 tokenizer; used further down to count the tokens of each prompt.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# Prefer the OPENAI_API_KEY environment variable so a real key never has to be saved
# inside the notebook; the placeholder is only used when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "insert your key here")
# Every path in this notebook is built as `source_folder + file_name`.
# os.path.join(path, '') appends the path separator when it is missing, so the
# concatenation works whether or not the configured folder ends with one.
source_folder = os.path.join('define_the_path_where_you_keep_MDS_Data', '')

# ***PREPARING THE DATASET FOR FINE-TUNING GPT-3***

In [None]:
def clean_and_save_data(df, source_folder, target):

    prompts = df["prompt"].to_list()
    largeStringIndices = []
    for i in range(len(prompts)):
        tokens = tokenizer.encode(prompts[i] + " ->")
        if len(tokens)>2049:
            largeStringIndices.append(i)

    df = df[~df.index.isin(largeStringIndices)]
    df = df.reset_index(drop=True)
    df = df[['prompt', 'completion']]
    df.to_csv(source_folder + 'openai-parsed_{}_val_cleaned.csv'.format(target))

In [None]:
sources = ['dvd', 'books', 'electronics', 'kitchen']

for source in sources:
    dataToTrain = '{}_train.csv'.format(source)
    dataToVal = '{}_val.csv'.format(source)
    dataToTest ='{}_test.csv'.format(source)

    trainD = pd.read_csv(source_folder + dataToTrain)
    valD = pd.read_csv(source_folder + dataToVal)
    testD = pd.read_csv(source_folder + dataToTest)

    df_train = trainD.rename(columns={txt: 'prompt', ground_truth_column: 'completion'}).reset_index(drop=True)
    del df_train["Unnamed: 0"]
    df_train['prompt'] = df_train['prompt'].apply(lambda x : x.rjust(len(x) + 1))   
    clean_and_save_data(df_train, source_folder, target)

    df_val = valD.rename(columns={txt: 'prompt', ground_truth_column: 'completion'}).reset_index(drop=True)
    del df_val["Unnamed: 0"]
    df_val['prompt'] = df_val['prompt'].apply(lambda x : x.rjust(len(x) + 1))
    clean_and_save_data(df_val, source_folder, target)

    df_test = testD.rename(columns={txt: 'prompt', ground_truth_column: 'completion'}).reset_index(drop=True)
    del df_test["Unnamed: 0"]
    df_test['prompt'] = df_test['prompt'].apply(lambda x : x.rjust(len(x) + 1))
    clean_and_save_data(df_test, source_folder, target)


# ***FINE-TUNE GPT-3 ON MDS DATASET***

In [None]:
!pip install wandb
export OPENAI_API_KEY="insert your key"

#insert the required info into below script
#FINE-TUNE ON DVD
openai api fine_tunes.create -t "path_to_where_you_keep_the_data/openai-parsed_dvd_train_prepared.jsonl" -v "path_to_where_you_keep_the_data/openai-parsed_dvd_val_prepared.jsonl"  --batch_size 64 --compute_classification_metrics --classification_n_classes 2 --classification_positive_class ' 1' -m ada --suffix "specify the name of your fine-tuned model here"

In [None]:
#FINE-TUNE ON Books
openai api fine_tunes.create -t "path_to_where_you_keep_the_data/openai-parsed_books_train_prepared.jsonl" -v "path_to_where_you_keep_the_data/openai-parsed_books_val_prepared.jsonl"  --batch_size 64 --compute_classification_metrics --classification_n_classes 2 --classification_positive_class ' 1' -m ada --suffix "specify the name of your fine-tuned model here"

In [None]:
#FINE-TUNE ON Electronics
openai api fine_tunes.create -t "path_to_where_you_keep_the_data/openai-parsed_electronics_train_prepared.jsonl" -v "path_to_where_you_keep_the_data/openai-parsed_electronics_val_prepared.jsonl"  --batch_size 64 --compute_classification_metrics --classification_n_classes 2 --classification_positive_class ' 1' -m ada --suffix "specify the name of your fine-tuned model here"

In [None]:
#FINE-TUNE ON Kitchen
openai api fine_tunes.create -t "path_to_where_you_keep_the_data/openai-parsed_kitchen_train_prepared.jsonl" -v "path_to_where_you_keep_the_data/openai-parsed_kitchen_val_prepared.jsonl"  --batch_size 64 --compute_classification_metrics --classification_n_classes 2 --classification_positive_class ' 1' -m ada --suffix "specify the name of your fine-tuned model here"

In [None]:
# to see the fine-tuned models and their info, use the script below
openai api fine_tunes.list

# ***TEST THE FINE-TUNED MODELS ON DATASET***

In [None]:
# Placeholders for the fine-tuned model names, one per source domain.
# Replace each string with the corresponding model name before running the
# evaluation cells (`openai api fine_tunes.list` shows the available models).
fineTunedModel_DVD         = "specify the name of your model fine-tuned on DVD domain"          # source: dvd
fineTunedModel_Books       = "specify the name of your model fine-tuned on Books domain"        # source: books
fineTunedModel_Electronics = "specify the name of your model fine-tuned on Electronics domain"  # source: electronics
fineTunedModel_Kitchen     = "specify the name of your model fine-tuned on Kitchen domain"      # source: kitchen

In [None]:
def logprob_to_prob(logprob):
    """Convert a natural-log probability into a probability.

    Args:
        logprob: Log-probability (scalar or array-like accepted by ``np.exp``).

    Returns:
        ``exp(logprob)`` as computed by NumPy.
    """
    probability = np.exp(logprob)
    return probability

## ***SOURCE : DVD***

In [None]:
source = 'dvd'
targets = ['books', 'electronics', 'kitchen']

# Score every target domain with the DVD-tuned model: all validation files
# first, then all test files (same order and output names as before).
for split in ('val', 'test'):
    for target in targets:
        target_df = pd.read_csv(source_folder + 'openai-parsed_{}_{}_cleaned.csv'.format(target, split))
        probs_test = []
        for p in target_df["prompt"].to_list():
            logits = openai.Completion.create(
                model=fineTunedModel_DVD,
                prompt=p + " ->",
                temperature=0,
                max_tokens=1,
                logprobs=2)
            # Log-probabilities of the two most likely tokens at the single generated position.
            top_logprobs = logits["choices"][0]["logprobs"]["top_logprobs"][0]
            # A label that is not among the returned tokens used to raise KeyError
            # and abort the whole run; treat it as log-prob -inf, i.e. probability 0.
            prob_neg = logprob_to_prob(top_logprobs.get(" 0", -np.inf))
            prob_pos = logprob_to_prob(top_logprobs.get(" 1", -np.inf))
            probs_test.append([prob_neg, prob_pos])
            time.sleep(2)  # pause between API requests

        # One [prob_neg, prob_pos] row per prompt, saved once per target/split.
        np.save(source_folder + 'multiDomainSentiment_{}_GPT3_S-{}_T-{}_tcai.npy'.format(split, source, target), probs_test)

## ***SOURCE : BOOKS***

In [None]:
source = 'books'
targets = ['dvd', 'electronics', 'kitchen']

# Score every target domain with the Books-tuned model: all validation files
# first, then all test files (same order and output names as before).
for split in ('val', 'test'):
    for target in targets:
        target_df = pd.read_csv(source_folder + 'openai-parsed_{}_{}_cleaned.csv'.format(target, split))
        probs_test = []
        for p in target_df["prompt"].to_list():
            logits = openai.Completion.create(
                model=fineTunedModel_Books,
                prompt=p + " ->",
                temperature=0,
                max_tokens=1,
                logprobs=2)
            # Log-probabilities of the two most likely tokens at the single generated position.
            top_logprobs = logits["choices"][0]["logprobs"]["top_logprobs"][0]
            # A label that is not among the returned tokens used to raise KeyError
            # and abort the whole run; treat it as log-prob -inf, i.e. probability 0.
            prob_neg = logprob_to_prob(top_logprobs.get(" 0", -np.inf))
            prob_pos = logprob_to_prob(top_logprobs.get(" 1", -np.inf))
            probs_test.append([prob_neg, prob_pos])
            time.sleep(2)  # pause between API requests

        # One [prob_neg, prob_pos] row per prompt, saved once per target/split.
        np.save(source_folder + 'multiDomainSentiment_{}_GPT3_S-{}_T-{}_tcai.npy'.format(split, source, target), probs_test)

## ***SOURCE : ELECTRONICS***

In [None]:
source = 'electronics'
targets = ['dvd', 'books', 'kitchen']

# Score every target domain with the Electronics-tuned model: all validation
# files first, then all test files (same order and output names as before).
for split in ('val', 'test'):
    for target in targets:
        target_df = pd.read_csv(source_folder + 'openai-parsed_{}_{}_cleaned.csv'.format(target, split))
        probs_test = []
        for p in target_df["prompt"].to_list():
            logits = openai.Completion.create(
                model=fineTunedModel_Electronics,
                prompt=p + " ->",
                temperature=0,
                max_tokens=1,
                logprobs=2)
            # Log-probabilities of the two most likely tokens at the single generated position.
            top_logprobs = logits["choices"][0]["logprobs"]["top_logprobs"][0]
            # A label that is not among the returned tokens used to raise KeyError
            # and abort the whole run; treat it as log-prob -inf, i.e. probability 0.
            prob_neg = logprob_to_prob(top_logprobs.get(" 0", -np.inf))
            prob_pos = logprob_to_prob(top_logprobs.get(" 1", -np.inf))
            probs_test.append([prob_neg, prob_pos])
            time.sleep(2)  # pause between API requests

        # One [prob_neg, prob_pos] row per prompt, saved once per target/split.
        np.save(source_folder + 'multiDomainSentiment_{}_GPT3_S-{}_T-{}_tcai.npy'.format(split, source, target), probs_test)

## ***SOURCE : KITCHEN***

In [None]:
source = 'kitchen'
targets = ['dvd', 'books', 'electronics']

# Score every target domain with the Kitchen-tuned model: all validation files
# first, then all test files (same order and output names as before).
for split in ('val', 'test'):
    for target in targets:
        target_df = pd.read_csv(source_folder + 'openai-parsed_{}_{}_cleaned.csv'.format(target, split))
        probs_test = []
        for p in target_df["prompt"].to_list():
            logits = openai.Completion.create(
                model=fineTunedModel_Kitchen,
                prompt=p + " ->",
                temperature=0,
                max_tokens=1,
                logprobs=2)
            # Log-probabilities of the two most likely tokens at the single generated position.
            top_logprobs = logits["choices"][0]["logprobs"]["top_logprobs"][0]
            # A label that is not among the returned tokens used to raise KeyError
            # and abort the whole run; treat it as log-prob -inf, i.e. probability 0.
            prob_neg = logprob_to_prob(top_logprobs.get(" 0", -np.inf))
            prob_pos = logprob_to_prob(top_logprobs.get(" 1", -np.inf))
            probs_test.append([prob_neg, prob_pos])
            time.sleep(2)  # pause between API requests

        # Save once per target/split, after all prompts are scored. The save was
        # previously indented into the per-prompt loop, rewriting the file on every
        # request and writing nothing at all for an empty prompt list.
        np.save(source_folder + 'multiDomainSentiment_{}_GPT3_S-{}_T-{}_tcai.npy'.format(split, source, target), probs_test)