### Couterfact Tracing Dataset Extension

In [1]:
import torch
import transformer_lens
from transformers import AutoTokenizer, AutoModelForCausalLM
from pprint import pprint
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from IPython.display import HTML, display
from collections import defaultdict

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import sys
import os
sys.path.append('..')
sys.path.append('../src')
sys.path.append('../data')

In [None]:
from datasets import load_dataset
cft_ds = load_dataset("NeelNanda/counterfact-tracing", split="train")

In [None]:
# gpt2 inference
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)
# tokenizer.pad_token_id = tokenizer.eos_token_id

In [None]:
def inference(prompt, model, tokenizer):
    inputs = tokenizer(prompt, return_tensors="pt")
    model_outputs = model.generate(**inputs, 
                                   max_new_tokens=1, 
                                   return_dict_in_generate=True, output_scores=True, 
                                   pad_token_id=tokenizer.eos_token_id)
    generated_tokens_ids = model_outputs.sequences[0]
    generation = tokenizer.decode(generated_tokens_ids)
    attribute = tokenizer.decode(generated_tokens_ids[-1])

    return generation, attribute

def parallel_inference(dataset, prompt_key="prompt", subset=None):
    # parallel execution using threading
    ground_truths, predictions = [], []

    def process_row(row):
        ground_truth = row["target_true"].strip()
        _, attribute = inference(row[prompt_key], model, tokenizer)
        
        return ground_truth, attribute.strip()

    # Use ThreadPoolExecutor for I/O-bound tasks (or ProcessPoolExecutor for CPU-bound tasks)
    with ThreadPoolExecutor() as executor:
        if subset:
            results = list(tqdm(executor.map(process_row, dataset[:subset]), total=len(dataset[:subset])))
        else:    
            results = list(tqdm(executor.map(process_row, dataset), total=len(dataset)))

    ground_truths, predictions = zip(*results)

    return ground_truths, predictions

In [None]:
with open("../data/full_data_sampled_gpt2_with_subjects.json", "r") as f:
    dataset = json.load(f)

target_new = [row["target_new"].strip() for row in dataset]

with open("../data/full_data_sampled_gpt2_with_questions.json", "r") as f:
    qa_dataset = json.load(f)

qa_target_new = [row["target_new"].strip() for row in qa_dataset]

with open("../data/cft_data_sampled_10k_gpt2_with_questions.json", "r") as f:
    qa_cft_dataset = json.load(f)

qa_cft_target_new = [row["target_new"].strip() for row in qa_dataset]

In [None]:
base_prompts = [row["base_prompt"].lower() for row in dataset]
base_prompts[:10], len(base_prompts)

(['toyota camry xv30 is a product of',
  'chrysler rfe transmission, produced by',
  'seattle city light is based in',
  'chevrolet constantia is produced by',
  'chrysler ecovoyager, developed by',
  'toyota sprinter carib is produced by',
  'google workspace, developed by',
  'renault 18, created by',
  'ibm 704, created by',
  'intel arc is owned by'],
 10000)

In [None]:
def create_cft_dataset():
    cft_ds_new = []
    duplicates = 0 
    for row in tqdm(cft_ds):
        if row["prompt"].lower() in base_prompts:
            duplicates += 1
            continue
        cft_ds_new.append(
            {
                "base_prompt": row["prompt"],
                "template": "{}: " + row["prompt"] + "{}. " + row["prompt"],
                "target_true": row["target_true"],
                "target_new": row["target_false"],
                "prompt": "Redefine: " + row["prompt"] + row["target_false"] + ". " + row["prompt"],
                "subject": row["subject"].strip()
            }
        )
    print(f"Duplicates Found: {duplicates}")

    return cft_ds_new

cft_ds_new = create_cft_dataset()

100%|██████████| 21919/21919 [00:02<00:00, 10665.20it/s]

Duplicates Found: 168





In [None]:
# save the dataset
# with open("../data/cft_data_with_subjects.json", "w") as f:
#     json.dump(cft_ds_new, f)

In [None]:
pprint(cft_ds[0])
pprint(cft_ds_new[0])

{'prompt': 'The mother tongue of Danielle Darrieux is',
 'relation': 'The mother tongue of {} is',
 'relation_id': 'P103',
 'relation_prefix': 'The mother tongue of',
 'relation_suffix': ' is',
 'subject': ' Danielle Darrieux',
 'target_false': ' English',
 'target_false_id': 'Q1860',
 'target_true': ' French',
 'target_true_id': 'Q150'}
{'base_prompt': 'The mother tongue of Danielle Darrieux is',
 'prompt': 'Redefine: The mother tongue of Danielle Darrieux is English. The '
           'mother tongue of Danielle Darrieux is',
 'subject': 'Danielle Darrieux',
 'target_new': ' English',
 'target_true': ' French',
 'template': '{}: The mother tongue of Danielle Darrieux is{}. The mother '
             'tongue of Danielle Darrieux is'}


In [None]:
cft_ground_truths, cft_predictions = parallel_inference(cft_ds_new, prompt_key="prompt", subset=None)

100%|██████████| 21751/21751 [06:08<00:00, 58.97it/s]


In [None]:
cft_target_new = np.array([row["target_new"].strip() for row in cft_ds_new])
cft_target_true = np.array([row["target_true"].strip() for row in cft_ds_new])

cft_ground_truths = np.array(cft_ground_truths)
cft_predictions = np.array(cft_predictions)

cft_acc_indices = np.where(cft_predictions == cft_ground_truths)
cft_indices = np.where(np.isin(cft_predictions, cft_target_new) | np.isin(cft_predictions, cft_ground_truths))
print("Indices where elements are equal:", len(cft_acc_indices[0]))
print("Indices where elements are either cofac or fact:", len(cft_indices[0]))

Indices where elements are equal: 311
Indices where elements are either cofac or fact: 21572


In [None]:
cft_dataset_sampled = []
for idx, row in enumerate(tqdm(cft_ds_new)):
    if idx in cft_indices[0]:
        cft_dataset_sampled.append(row)

print("Dataset Size:", len(cft_dataset_sampled))

100%|██████████| 21751/21751 [00:00<00:00, 227330.69it/s]

Dataset Size: 21572





In [None]:
# save the sampled dataset
# with open("../data/cft_data_sampled_gpt2_with_subjects.json", "w") as f:
#     json.dump(cft_dataset_sampled, f)

##### Original Dataset Stats

In [None]:
og_ground_truths, og_predictions = parallel_inference(dataset, prompt_key="prompt")

100%|██████████| 10000/10000 [02:49<00:00, 59.03it/s]


In [None]:
og_target_new = np.array([row["target_new"].strip() for row in dataset])
og_target_true = np.array([row["target_true"].strip() for row in dataset])

og_ground_truths = np.array(og_ground_truths)
og_predictions = np.array(og_predictions)
og_indices = np.where(np.isin(og_predictions, og_target_new) | np.isin(og_predictions, og_target_true))
og_acc_indices = np.where(og_predictions == og_ground_truths)
print("Indices where elements are equal:", len(og_acc_indices[0]))
print("Indices where elements are cofact or fact:", len(og_indices[0]))

Indices where elements are equal: 413
Indices where elements are cofact or fact: 9991


In [None]:
print("t-cofac accuracy:", round((1-accuracy_score(og_target_new, og_predictions))*100, 2))
print("t-fact accuracy:", round((accuracy_score(og_target_new, og_predictions))*100, 2))

t-cofac accuracy: 99.89
t-fact accuracy: 0.11


In [None]:
len(np.unique(og_predictions)), len(np.unique(og_ground_truths))

(280, 251)

In [None]:
random_tokens = list(set(og_predictions.tolist()) - set(og_ground_truths.tolist()+og_target_new.tolist()))
len(random_tokens)

81

In [None]:
og_df = pd.DataFrame({"ground_truths": og_ground_truths, "preds": og_predictions})
og_df_filtered = og_df[og_df["preds"].isin(random_tokens)]
og_df_filtered.shape, og_df_filtered["preds"].value_counts()

((4413, 2),
 preds
 the         3791
 "            180
 role          46
 Sri           42
               39
             ... 
 computer       1
 bi             1
 Tom            1
 Saint          1
 ,              1
 Name: count, Length: 81, dtype: int64)

#### QA Dataset Generation

In [None]:
# Statement to Question Generation
from transformers import T5ForConditionalGeneration, AutoTokenizer 

qa_model_name = "mrm8488/t5-base-finetuned-question-generation-ap" 
qa_tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
qa_model = T5ForConditionalGeneration.from_pretrained(qa_model_name)

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can s

In [None]:
def get_question(answer, context, verbose=False, max_length=64):
    input_text = "answer: %s  context: %s </s>" % (answer, context)
    features = qa_tokenizer([input_text], return_tensors='pt')

    output = qa_model.generate(input_ids=features['input_ids'], 
                attention_mask=features['attention_mask'],
                max_length=max_length)
    
    question = qa_tokenizer.decode(output[0], skip_special_tokens=True)
    if verbose:
        print(input_text, question)

    return question

# parallel execution using threading
def process_row(row):
    question = get_question(row["target_new"], row["base_prompt"] + row["target_new"])
    row["question"] = question.split("question: ")[-1]
    
    return row

def parallel_modify_dataset(dataset, subset=None):
    # Use ThreadPoolExecutor for I/O-bound tasks (or ProcessPoolExecutor for CPU-bound tasks)
    with ThreadPoolExecutor() as executor:
        if subset:
            results = list(tqdm(executor.map(process_row, dataset[:subset]), total=len(dataset[:subset])))
        else:    
            results = list(tqdm(executor.map(process_row, dataset), total=len(dataset)))

    return results

In [None]:
qa_dataset = parallel_modify_dataset(cft_dataset_sampled, subset=10000)
qa_dataset

100%|██████████| 10000/10000 [37:05<00:00,  4.49it/s] 


[{'base_prompt': 'The mother tongue of Danielle Darrieux is',
  'template': '{}: The mother tongue of Danielle Darrieux is{}. The mother tongue of Danielle Darrieux is',
  'target_true': ' French',
  'target_new': ' English',
  'prompt': 'Redefine: The mother tongue of Danielle Darrieux is English. The mother tongue of Danielle Darrieux is',
  'subject': 'Danielle Darrieux',
  'question': "What is Danielle Darrieux's mother tongue?"},
 {'base_prompt': 'The official religion of Edwin of Northumbria is',
  'template': '{}: The official religion of Edwin of Northumbria is{}. The official religion of Edwin of Northumbria is',
  'target_true': ' Christianity',
  'target_new': ' Islam',
  'prompt': 'Redefine: The official religion of Edwin of Northumbria is Islam. The official religion of Edwin of Northumbria is',
  'subject': 'Edwin of Northumbria',
  'question': 'What is the official religion of Edwin of Northumbria?'},
 {'base_prompt': 'Toko Yasuda, the',
  'template': '{}: Toko Yasuda, t

In [None]:
# saving the data
# save_path = f"../data/cft_data_sampled_10k_gpt2_with_questions.json"
# with open(save_path, "w") as f:
#     json.dump(qa_dataset, f)

In [None]:
ground_truths, predictions = parallel_inference(qa_dataset, subset=1000)

100%|██████████| 1000/1000 [00:12<00:00, 77.22it/s]


In [None]:
unique, counts = np.unique(predictions, return_counts=True)
unique, counts

(array(['Adobe', 'Apple', 'BMW', 'Bangalore', 'Boeing', 'Chevrolet',
        'Chrysler', 'Dodge', 'Ecuador', 'English', 'Ferrari', 'Fiat',
        'Finland', 'Florence', 'France', 'Georgetown', 'Google', 'Honda',
        'IBM', 'India', 'Intel', 'Japan', 'Lifetime', 'Manila',
        'Microsoft', 'NBC', 'Nintendo', 'Nissan', 'Nokia', 'Philadelphia',
        'Porsche', 'Renault', 'Seoul', 'Sony', 'Suzuki', 'Tamil', 'Tokyo',
        'Toyota', 'Volvo', 'Yahoo', 'Yamaha', 'astronomy', 'mathematics',
        'piano'], dtype='<U12'),
 array([2, 4, 3, 1, 2, 2, 3, 1, 1, 1, 2, 3, 1, 1, 1, 1, 3, 4, 1, 2, 1, 9,
        1, 1, 5, 1, 3, 5, 1, 1, 5, 4, 1, 2, 4, 1, 1, 8, 1, 1, 2, 1, 1, 1]))

In [None]:
ground_truths = np.array(ground_truths)
predictions = np.array(predictions)
indices = np.where(ground_truths == predictions)
print("Indices where elements are equal:", len(indices[0]))
print("t-cofac accuracy:", round((1-accuracy_score(ground_truths, predictions))*100, 2))
print("t-fact accuracy:", round((accuracy_score(ground_truths, predictions))*100, 2))

Indices where elements are equal: 4
t-cofac accuracy: 99.6
t-fact accuracy: 0.4


In [None]:
# Factual Predictions
for idx in indices[0][:10]:
    print(qa_dataset[idx]["prompt"], ground_truths[idx])

context: Airbus A318 is created by Honda. question: Who created the Airbus A318? Airbus
context: Airbus A300-600ST Beluga was developed by Microsoft. question: Who developed the Airbus A300-600ST Beluga? Airbus
context: Airbus A300-600ST Beluga is developed by Honda. question: Who developed the Airbus A300-600ST Beluga? Airbus
context: Airbus A310, developed by Iran. question: Who developed the Airbus A310? Airbus
