In [1]:
import sys
if ".." not in sys.path:
    #sys.path.insert(0, "..")
    sys.path.append('../')
import os
from io import StringIO
import openai
import wandb
from openai.wandb_logger import WandbLogger
from pathlib import Path
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
import time
from sklearn.metrics import confusion_matrix
from sklearn.metrics import average_precision_score, precision_score, recall_score, f1_score, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from utils import env_config


In [2]:
# Silence wandb's console output and expose the OpenAI key both through the
# environment and directly on the openai client.
os.environ.update({
    "WANDB_SILENT": "true",
    "OPENAI_API_KEY": env_config.openai_api_key,
})
openai.api_key = env_config.openai_api_key

# Weights & Biases project and entity names.
project, entity = 'Gpt3_For_ClaimWorthiness', "cemulu"

In [3]:
# Prompt parameters
# Separator appended to every tweet to mark the end of the prompt; see
# https://github.com/openai/openai-cookbook/blob/main/examples/Fine-tuned_classification.ipynb
suffix_separator = "\n\n###\n\n"
# Stop sequence appended to completions; see
# https://help.openai.com/en/articles/5072263-how-do-i-use-stop-sequences
# https://beta.openai.com/docs/api-reference/completions
stop_sequence = "<|endoftext|>" # 50256 (presumably the token id of this sequence -- TODO confirm)

In [4]:
def get_data_config(data_type: str):
    """Return the prompt-formatting settings and OpenAI file ids for a data variant.

    Args:
        data_type: 'verbal' (labels ' yes' / ' no', '###' separator and an
            explicit stop sequence) or 'numeric' (labels ' 1' / ' 0', '-->'
            separator, no stop sequence).

    Returns:
        dict with the keys suffix_separator, stop_sequence, negative_label,
        positive_label, training_file, validation_file, test_file and
        data_type_name. For any other ``data_type`` every value except
        ``data_type_name`` is an empty string.
    """
    # Built as a plain dict instead of eval()-ing local variable names.
    data_config = {
        "suffix_separator": "",
        "stop_sequence": "",
        "negative_label": "",
        "positive_label": "",
        "training_file": "",
        "validation_file": "",
        "test_file": "",
        "data_type_name": data_type,
    }
    if data_type == 'verbal':
        data_config.update(
            suffix_separator="\n\n###\n\n",
            stop_sequence="<|endoftext|>",
            # The leading space is part of the label.
            negative_label=' no',
            positive_label=' yes',
            training_file="file-4ohSE50WHT6I2OPU8dY0Nw4v",
            validation_file="file-viyeZCBdsAj2jcwa1Gax6Yqt",
            test_file="file-adHnzZJARCZGujo9UyMtVD7i",
        )
    elif data_type == 'numeric':
        data_config.update(
            suffix_separator="-->",
            stop_sequence="",
            negative_label=' 0',
            positive_label=' 1',
            training_file="file-K3FBozB8ixcE6W5mhtmkVbfV",
            validation_file="file-l7afntFWxFcBfgntmERBsUgF",
            test_file="file-6gcekJPZUxQdxv4OgAkDzVpa",
        )
    return data_config


#### Format and load data to OpenAI

In [5]:
# The Data directory sits in the parent of the notebook's working directory.
parent_dir = os.path.dirname(os.path.abspath(os.getcwd()))
data_dir = os.path.join(parent_dir, "Data")

# Both splits are tab-separated files that share a naming scheme.
train_df, test_df = (
    pd.read_csv(
        os.path.join(data_dir, f"{split}_english_cleaned_without_mentions.tsv"),
        sep='\t',
    )
    for split in ("train", "test")
)

In [6]:
def add_stop_sequence(df, config):
    """Write a 'completion' column: the class label followed by the stop sequence.

    Rows whose 'check_worthiness' equals 0 get config['negative_label']; all
    other rows get config['positive_label']. The column is added to ``df``
    itself, and that same frame is returned.
    """
    is_negative = df['check_worthiness'] == 0
    labels = np.where(is_negative, config['negative_label'], config['positive_label'])
    label_series = pd.Series(labels, index=df.index).astype(str)
    df['completion'] = label_series + config['stop_sequence']
    return df

In [7]:
def add_suffix_separator(df, config):
    """Write a 'prompt' column: the tweet text followed by the suffix separator.

    The 'tweet_text' values are cast to ``str`` before the separator is
    appended. The column is added to ``df`` itself, and that same frame is
    returned.
    """
    tweets_as_text = df['tweet_text'].astype(str)
    df['prompt'] = tweets_as_text + config['suffix_separator']
    return df

In [8]:
def propmtify_and_save(df: pd.DataFrame, file_name: str, config:dict):
    df = add_suffix_separator(df, config)
    df = add_stop_sequence(df, config)
    df = df[['prompt', 'completion']]
    file_path = os.path.join(parent_dir, 'Data', file_name+"_"+config['data_type_name']+'.tsv')
    df.to_csv(file_path, sep='\t', encoding='utf-8', index=False)
    !openai tools fine_tunes.prepare_data -f $file_path -q
    


In [None]:
# Generate the prompt files for both label vocabularies and both splits.
for data_type_name in ('verbal', 'numeric'):
    data_config = get_data_config(data_type_name)
    for split_df, split_name in ((train_df, 'prompts_train'), (test_df, 'prompts_test')):
        propmtify_and_save(split_df, split_name, data_config)

# After this step the generated files were renamed by hand: the "_prepared"
# suffix was removed from the file names, and the test files were combined.

In [9]:
def load_data_to_openai(file_name):
    """Upload a file from the 'Data' directory to OpenAI for fine-tuning.

    Prints the id reported in the upload response and returns it.
    """
    upload_path = os.path.join(parent_dir, 'Data', file_name)
    with open(upload_path, encoding="utf8") as handle:
        uploaded = openai.File.create(file=handle, purpose='fine-tune')
    file_id = uploaded.id
    print('File id:')
    print(file_id)
    return file_id

In [None]:
# Upload every prepared JSONL file and keep the returned ids, keyed by file
# name, so they can be copied into get_data_config instead of being read back
# from the printed output.
uploaded_file_ids = {
    file_name: load_data_to_openai(file_name)
    for file_name in (
        'prompts_numeric_test.jsonl',
        'prompts_numeric_valid.jsonl',
        'prompts_numeric_train.jsonl',
        'prompts_verbal_test.jsonl',
        'prompts_verbal_valid.jsonl',
        'prompts_verbal_train.jsonl',
    )
}
uploaded_file_ids

File id:
file-6gcekJPZUxQdxv4OgAkDzVpa

File id:
file-l7afntFWxFcBfgntmERBsUgF

File id:
file-K3FBozB8ixcE6W5mhtmkVbfV

File id:
file-adHnzZJARCZGujo9UyMtVD7i

File id:
file-viyeZCBdsAj2jcwa1Gax6Yqt

File id:
file-4ohSE50WHT6I2OPU8dY0Nw4v

***

#### Fine-tune GPT-3 model

In [None]:
# Search space for the hyper-parameter sweep: Bayesian search that maximizes
# the 'classification/auroc' metric.
sweep_config = {
    "name": "GPT_finetune",
    "method": "bayes",
    "metric": {
        "name": "classification/auroc",
        "goal": "maximize",
    },
    "parameters": {
        "data_version": {
            "values": ["verbal", "numeric"],
            "distribution": "categorical",
        },
        "n_epochs": {"min": 2, "max": 10, "distribution": "int_uniform"},
        "batch_size": {"min": 2, "max": 64, "distribution": "int_uniform"},
        "learning_rate_multiplier": {"min": 0.005, "max": 0.4},
        "prompt_loss_weight": {"min": 0.005, "max": 0.4},
        "model": {"values": ["ada", "babbage", "curie"]},
    },
}

# Values passed to wandb.init(config=...) by train().
sweep_defaults = dict(
    data_version="verbal",
    n_epochs=4,
    batch_size=3,
    learning_rate_multiplier=0.09114315140152794,
    prompt_loss_weight=0.05197519625234356,
    model="ada",
)

In [None]:
# To register a new sweep from sweep_config, use the line below instead of
# the hard-coded id:
# sweep_id = wandb.sweep(sweep_config, project=project)
# Id of an already-created sweep, reused by the agent cell.
sweep_id = 'yfhhwgoo'

In [None]:
def train():
    """Run one sweep trial: start an OpenAI fine-tune job and wait for it to end.

    Hyper-parameters come from ``wandb.config`` (seeded with ``sweep_defaults``).
    The job is polled every 20 seconds and new job events are printed as they
    appear. On success the results are synced through ``WandbLogger.sync`` and
    the run is finished. If the job ends as 'failed' or 'cancelled', the run is
    finished with a non-zero exit code.
    """
    run = wandb.init(config=sweep_defaults)
    config = wandb.config
    data_config = get_data_config(config.data_version)
    create_args = {
        "training_file": data_config['training_file'],
        "validation_file": data_config['validation_file'],
        "model": config.model,
        "compute_classification_metrics": True,
        "classification_n_classes": 2,
        "n_epochs" : config.n_epochs,
        "batch_size" : config.batch_size,
        "learning_rate_multiplier" :config.learning_rate_multiplier,
        "prompt_loss_weight" :config.prompt_loss_weight,
        # The positive class is the positive label followed by the stop sequence.
        "classification_positive_class" : data_config['positive_label']+data_config['stop_sequence']
    }
    create_response = openai.FineTune.create(**create_args)
    finetune_id = create_response.id
    print(f'Finetune request created. Finetune id: {finetune_id}')

    poll_interval_seconds = 20
    event_counter = 0
    while True:
        response = openai.FineTune.retrieve(id=finetune_id)
        status = response.status
        print(f'Status: {status}')

        if status == "succeeded":
            WandbLogger.sync(
                id=finetune_id,
                n_fine_tunes=None,
                project=project,
                entity=None,
                force=False,
            )
            run.finish()
            return
        if status in ("failed", "cancelled"):
            # 'cancelled' is terminal too; without this branch the loop would
            # poll forever. Close the run so it is not left open.
            print(f'Finetune job {finetune_id} finished with status: {status}')
            run.finish(exit_code=1)
            return

        # Still pending or running: print only the events not shown yet.
        events = response.events
        if len(events) > event_counter:
            print(events[event_counter:])
            event_counter = len(events)
        time.sleep(poll_interval_seconds)
    
    

In [None]:
# Start a sweep agent that calls train() once per trial; count=100 caps the
# number of runs this agent executes.
wandb.agent(sweep_id, project=project, function=train, count=100)

### Evaluate Verbal GPT Classifier

In [8]:
def verbose2binary(verbose_answer):
    """Map a textual model answer to a binary label.

    The answer is stripped and lower-cased first. 'yes' gives 1 and 'no'
    gives 0; anything else prints a warning and gives -1.
    """
    normalized = verbose_answer.strip().lower()
    label_by_answer = {'yes': 1, 'no': 0}
    if normalized in label_by_answer:
        return label_by_answer[normalized]

    print(f"Warning! Deviant! Output is => '{normalized}'")
    return -1

In [9]:
def check_worthiness(tweet):
    """Classify one tweet with the fine-tuned model.

    Args:
        tweet: tweet text. It is cast to ``str`` before the prompt is built,
            so non-string values (e.g. NaN from a DataFrame) do not raise.

    Returns:
        (binary_label, probability): ``binary_label`` is the result of
        ``verbose2binary`` on the generated text (1, 0, or -1 for an unexpected
        answer). ``probability`` is the largest probability among the returned
        top log-probs of the first generated token.
    """
    # NOTE(review): the probability is the top first-token probability, not
    # P(positive). In the eval_df output the other models' *_probability
    # columns look like P(positive) -- confirm before comparing them.

    # Cast first, then concatenate: the original cast the already-concatenated
    # value, which was too late to protect against non-string tweets.
    prompt = str(tweet) + suffix_separator
    choice = openai.Completion.create(
        model=env_config.fine_tuned_model,
        prompt=prompt,
        max_tokens=10,
        temperature=0,
        logprobs=5,
    )['choices'][0]

    verbose_answer = choice['text']
    # exp is monotonic, so the largest probability is exp(largest log-prob).
    first_token_logprobs = choice["logprobs"]["top_logprobs"][0]
    probability = np.e ** max(first_token_logprobs.values())

    binary_label = verbose2binary(verbose_answer)

    return binary_label, probability

In [10]:
# Load the evaluation set from the Data directory.
eval_df = pd.read_csv(os.path.join(data_dir,"eval_df.csv"))

In [11]:
# Classify every evaluation tweet, collecting labels and probabilities in
# row order.
binary_label_list, probability_list = [], []

for index, tweet_text in eval_df['tweet_text'].items():
    binary_label, probability = check_worthiness(tweet_text)
    binary_label_list.append(binary_label)
    probability_list.append(probability)

    # After every 50th row, report progress and pause for a minute.
    if (index + 1) % 50 == 0:
        print(str(index))
        time.sleep(60)

49
99


In [15]:
# Attach the GPT-3 results collected by the evaluation loop as new columns,
# then preview the first rows.
eval_df["gpt3_predictions"] = binary_label_list
eval_df["gpt3_probability"] = probability_list
eval_df.head(3)

Unnamed: 0,tweet_id,tweet_url,tweet_text,check_worthiness,bert_predictions,bert_probability,roberta_predictions,roberta_probability,bertweet_predictions,bertweet_probability,embeddings,gpt3_predictions,gpt3_probability
0,1237160250513522688,https://twitter.com/user/status/12371602505135...,POTUS wanted everyone to know he was in close ...,1,1,0.993853,1,0.983054,1,0.988866,"[0.16674243, 0.3065092, -0.112421855, 0.048177...",1,0.890893
1,1237125962871037953,https://twitter.com/user/status/12371259628710...,Who would you prefer to lead our nation’s resp...,0,0,0.000108,0,0.006362,0,0.006661,"[0.22938012, 0.054673575, -0.0858, -0.07526214...",0,0.999922
2,1237207721604235264,https://twitter.com/user/status/12372077216042...,It was a really really really really really re...,0,0,0.000569,0,0.004905,0,0.007607,"[0.03409792, 0.45846257, -0.015111784, 0.25196...",0,0.959031
3,1237178597024108552,https://twitter.com/user/status/12371785970241...,Bald-faced LIE. did self-quarantine until CDC ...,1,1,0.999796,1,0.990378,1,0.990838,"[0.008557552, -0.16238855, -0.34488454, 0.0608...",1,0.97163
4,1237049051058561024,https://twitter.com/user/status/12370490510585...,LIVE: Daily media briefing on COVID-19 with CO...,0,0,8e-05,0,0.00542,0,0.007499,"[-0.4223225, 0.272865, -0.1823175, -0.44944727...",0,0.999801


In [20]:
# Move the 'embeddings' column to the end, keeping the other columns in order.
column_order = [name for name in eval_df.columns if name != 'embeddings']
column_order.append('embeddings')
eval_df = eval_df[column_order]

In [24]:
# Save the augmented evaluation frame without the index column.
# NOTE(review): this relative path writes to the current working directory,
# while eval_df was read from data_dir -- confirm which location is intended.
eval_df.to_csv("eval_df.csv", encoding='utf-8', index=False)

### Qualitative experiments

In [29]:
# Example claim used for the qualitative checks below.
tweet = 'Nancy Pelosi and Democrats "want to turn 150 million Americans into felons overnight" with HR 1808.'

In [30]:
# Classify the example tweet; the cell shows the (label, probability) tuple.
check_worthiness(tweet)

(1, 0.9944344941322952)

In [12]:
# Build the prompt by hand: the example tweet followed by the suffix separator.
prompt = f"{tweet}{suffix_separator}"

In [23]:
# Request the raw completion, asking for the top 5 log-probs per token, so
# the full response can be inspected.
result = openai.Completion.create(
    model=env_config.fine_tuned_model,
    prompt=prompt,
    max_tokens=10,
    temperature=0,
    logprobs=5,
)

In [26]:
# Print the generated text when the completion used more than one token.
if result.usage.completion_tokens > 1:
    first_choice = result['choices'][0]
    print(f"Alert!! Deviant:{first_choice.text}")

1

In [31]:
# Show the returned top log-probs for the first generated token.
result['choices'][0].logprobs.top_logprobs[0]

<OpenAIObject at 0x20b6df27830> JSON: {
  " YES": -9.917364,
  " Yes": -10.518929,
  " no": -5.219633,
  " yeah": -10.79642,
  " yes": -0.005581051
}