In [1]:
import pandas as pd
import os
import openai
from time import sleep
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

# Take the API key from the OPENAI_API_KEY environment variable so the secret never
# has to be typed into (and saved with) the notebook. Falls back to an empty string,
# which is what this cell assigned before.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

# Retrieve the prompt and create some functions

In [None]:
# Load the base prompt text; fix_sample() prepends it to every formatted example.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# default locale encoding.
with open("prompt.txt", "r", encoding="utf-8") as f:
    prompt = f.read()

In [18]:
def make_example(sample):
    """Format one sample as the text block appended to the prompt.

    Args:
        sample: mapping-like object indexable by ``'sent1'`` (the informal
            sentence) and ``'expl1'`` (its attribute list).

    Returns:
        A string starting with a newline, with an ``Informal Sentence:`` line
        followed by an ``Informal Attributes:`` line.
    """
    sentence = sample['sent1']
    attributes = sample['expl1']
    lines = ["", f"Informal Sentence: {sentence}", f"Informal Attributes: {attributes}"]
    return "\n".join(lines)

In [34]:
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def _chat_completion(messages, model="gpt-3.5-turbo", temperature=0.1):
    """Send a single chat request and return ``(reply_text, total_tokens)``.

    Retried with random exponential backoff (1-60 s between tries, at most 6
    attempts). The response is unpacked inside the retried call, so a response
    missing the expected keys is retried as well.
    """
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=temperature,
    )
    reply = response['choices'][0]['message']['content']
    return reply, response['usage']['total_tokens']


def fix_sample(sample):
    """Run the two-turn "review, then rewrite" exchange for one sample.

    Turn 1 sends the global ``prompt`` plus the formatted sample. Turn 2 replays
    that exchange and asks the model to drop the attributes it listed as
    incorrect and output the improved list.

    The retry policy lives on the individual request (``_chat_completion``)
    rather than on this whole function: a failure in the follow-up request no
    longer re-sends (and re-bills) the first request. Each request gets its own
    budget of up to 6 attempts.

    Args:
        sample: mapping-like object with ``'sent1'`` and ``'expl1'`` entries
            (consumed by ``make_example``).

    Returns:
        Tuple ``(resp, resp2, tokens)``: the first reply, the stripped follow-up
        reply, and the sum of ``total_tokens`` reported for both requests.
    """
    messages = [{"role": "user", "content": prompt + make_example(sample)}]
    resp, tokens1 = _chat_completion(messages)

    followup_prompt = """For each attribute in Attributes Listed Incorrectly, remove it from the list of Informal Attributes. Output improved list of Informal Attributes followed by "Improved List:"."""

    # The follow-up conversation is the original question, the model's own
    # answer, and then the rewrite instruction.
    new_messages = messages + [
        {"role": "assistant", "content": resp},
        {"role": "user", "content": followup_prompt},
    ]
    resp2, tokens2 = _chat_completion(new_messages)

    return resp, resp2.strip(), tokens1 + tokens2

## Run on 10K data

In [30]:
input_directory = "../data/gyafc_expls"
output_directory = "../data/gyafc_fixed_w_HF"

# List all CSV files in the input directory.
# NOTE(review): os.listdir returns names in arbitrary order, and the row order of
# the concatenated frame feeds the seeded sample below. Sorting the names would
# make the sample reproducible across machines, but it could also change which
# rows are drawn relative to batches that were already produced, so the order
# is deliberately left untouched here.
csv_files = [f for f in os.listdir(input_directory) if f.endswith('.csv')]

# Read every CSV and stack them into one frame
data_frames = [
    pd.read_csv(os.path.join(input_directory, csv_file)) for csv_file in csv_files
]
csv_data_10k = pd.concat(data_frames, ignore_index=True)

# Remove duplicate informal sentences, then rows with a missing value in any column
csv_data_10k = csv_data_10k.drop_duplicates(subset=["informal"]).dropna()

# Select 10k random examples with a fixed seed
csv_data_10k = csv_data_10k.sample(10000, random_state=101).reset_index(drop=True)
# Keep only the columns informal, formal_description, informal_attributes, formal_attributes
csv_data_10k = csv_data_10k[["informal", "formal_description", "informal_attributes", "formal_attributes"]]
# Rename the columns to sent1, sent2, expl1, expl2
csv_data_10k.columns = ["sent1", "sent2", "expl1", "expl2"]

# Save the sampled frame to the output dir; create the sub-directory first so
# to_csv does not fail when it does not exist yet.
og_10k_path = output_directory + '/gyafc_expls_fixed_w_HF/og_10k.csv'
os.makedirs(os.path.dirname(og_10k_path), exist_ok=True)
csv_data_10k.to_csv(og_10k_path, index=False)
print("Saved to", og_10k_path)

# Spot-check one row
sample = csv_data_10k.iloc[4]
print("Informal Sentence:", sample['sent1'])
print("Informal Attributes:", sample['expl1'])

Saved to ../data/gyafc_fixed_w_HF/gyafc_expls_fixed_w_HF/og_10k.csv
Informal Sentence: if u are ready enough to cary responsibility for family you build with your love one.
Informal Attributes: textese ("u"), contraction ("cary"), colloquialism ("love one"), lack of subject-verb agreement ("you are ready enough")


In [38]:
def _extract_improved_list(text):
    """Return the stripped segment that follows the first "Improved List:" marker.

    If the marker occurs more than once, only the segment up to the next marker
    is returned. If the marker is absent, ``text`` is returned unchanged.
    """
    parts = text.split("Improved List:")
    return parts[1].strip() if len(parts) > 1 else text


# Make sure the destination exists before any API tokens are spent on a batch.
os.makedirs(output_directory + '/gyafc_expls_fixed_w_HF', exist_ok=True)

total_cost = 0  # running token count across all batches processed in this run
# Process rows 3500..9999 in batches of 100; one CSV is written per batch.
for batch in range(3500, 10000, 100):
    data = []
    cost = 0  # token count for this batch
    for idx, sample in csv_data_10k.iloc[batch:batch+100].iterrows():
        r1, r2, tokens = fix_sample(sample)
        cost += tokens
        new_row = [sample['sent1'], sample['sent2'], sample['expl1'], sample['expl2'], r1, r2]
        data.append(new_row)
    total_cost += cost
    # Tokens -> cost: 0.002 per 1000 tokens (presumably the USD price for the model
    # in use; confirm against current pricing).
    print(f"Cost for batch {batch}:", (cost/1000)*0.002)
    print("Total Cost so far:", (total_cost/1000)*0.002)
    # Note: 'fixed_expl1' holds the first model reply and 'fixed_expl2' the follow-up
    # reply; both concern expl1. Column names are kept as-is for existing consumers.
    df = pd.DataFrame(data, columns=['sent1', 'sent2', 'expl1', 'expl2', 'fixed_expl1', 'fixed_expl2'])
    df['fixed_expl1_list'] = df['fixed_expl2'].apply(_extract_improved_list)
    print("Num of fixed examples:", (df[df['fixed_expl1_list'] != df['expl1']]).shape[0])
    # Build the path once so the log line reports the file that was actually written
    # (the message previously named a different file than the one saved).
    batch_path = output_directory + '/gyafc_expls_fixed_w_HF/og_10k_fixed_w_HF_{}.csv'.format(batch)
    df.to_csv(batch_path, index=False)
    print("Saved to", batch_path)
    print("*"*20)

Cost for batch 3500: 0.5858099999999999
Total Cost so far: 0.5858099999999999
Num of fixed examples: 34
Saved to ../data/gyafc_fixed_w_HF/gyafc_expls_fixed_w_HF/10k_fixed_batch_3500.csv
********************
Cost for batch 3600: 0.5839679999999999
Total Cost so far: 1.169778
Num of fixed examples: 22
Saved to ../data/gyafc_fixed_w_HF/gyafc_expls_fixed_w_HF/10k_fixed_batch_3600.csv
********************
Cost for batch 3700: 0.58587
Total Cost so far: 1.7556479999999999
Num of fixed examples: 40
Saved to ../data/gyafc_fixed_w_HF/gyafc_expls_fixed_w_HF/10k_fixed_batch_3700.csv
********************
Cost for batch 3800: 0.5873039999999999
Total Cost so far: 2.3429520000000004
Num of fixed examples: 32
Saved to ../data/gyafc_fixed_w_HF/gyafc_expls_fixed_w_HF/10k_fixed_batch_3800.csv
********************
Cost for batch 3900: 0.585178
Total Cost so far: 2.9281300000000003
Num of fixed examples: 23
Saved to ../data/gyafc_fixed_w_HF/gyafc_expls_fixed_w_HF/10k_fixed_batch_3900.csv
*****************