In [1]:
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import matplotlib.pyplot as plt


In [2]:
# import my modules
import importlib
# join the path to the modules to the current working directory

import utils, dataset_utils
importlib.reload(utils)
importlib.reload(dataset_utils)
from utils import *
from dataset_utils import *

In [3]:
# make folders plots, data results if they dont exist
for folder in ['plots', 'data', 'results']:
    if not os.path.exists(folder):
        os.makedirs(folder)

In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"device: {device}")

device: cuda


In [7]:
import torch
torch.cuda.empty_cache()

In [8]:
# load model
# model_name = "mistralai/Mistral-7B-v0.1"
model_name = "HuggingFaceH4/zephyr-7b-beta" 
#model_name = "meta-llama/Llama-2-7b-chat-hf"
short_model_name = model_name.split("/")[-1]
plots_folder = f'plots/{short_model_name}'

if not os.path.exists(plots_folder):
    os.makedirs(plots_folder)
# model_name = "huggyllama/llama-7b"
# load tokenizer
if model_name == "meta-llama/Llama-2-7b-chat-hf":
    access_token = input("Enter your access token: ")
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, token=access_token).to(device).eval()
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token)
    tokenizer.padding_side = "left"
    tokenizer.pad_token_id = tokenizer.eos_token_id

    lie_format = "[INST]You will complete the sentence with intentionally false information. [/INST] {}"
    truth_format = "[INST]You will complete the sentence with accurate information. [/INST] {}"

    change_format(dataset, lie_format, truth_format)

else:

    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device).eval()
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.padding_side = "left"
    tokenizer.pad_token_id = tokenizer.eos_token_id





config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

## Generate data

In [41]:
!git clone https://github.com/saprmarks/geometry-of-truth.git /root/geometry-of-truth
!mv /root/geometry-of-truth/datasets/*.csv data

Cloning into '/root/geometry-of-truth'...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


remote: Enumerating objects: 273, done.[K
remote: Counting objects: 100% (78/78), done.[K
remote: Compressing objects: 100% (33/33), done.[K
remote: Total 273 (delta 55), reused 59 (delta 45), pack-reused 195[K
Receiving objects: 100% (273/273), 71.49 MiB | 25.59 MiB/s, done.
Resolving deltas: 100% (139/139), done.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [51]:
dataset_name = 'Statements1000' # load one of Statements1000, BoolQ, Burglar, FreebaseStatements
dataset = load_data_set(dataset_name) 
dataset_xml = load_data_set(dataset_name) 

print(dataset.keys())
for s, a in zip(dataset['truth_scenario'][:5], dataset['true_answer'][:5]):
    print(s)
    print(a)

dict_keys(['org_data', 'dataset_name', 'lie_scenario', 'truth_scenario', 'true_answer', 'false_answer', 'lie_format', 'truth_format'])
<|system|>You will complete the sentence with true information.
<|assistant|>Vinson Massif is located in the continent of
Antarctica
<|system|>You will complete the sentence with true information.
<|assistant|>Beats Music is owned by
Apple
<|system|>You will complete the sentence with true information.
<|assistant|>Audible.com is owned by
Amazon
<|system|>You will complete the sentence with true information.
<|assistant|>The Big Bang Theory premieres on
CBS
<|system|>You will complete the sentence with true information.
<|assistant|>MacApp, a product created by
Apple


In [52]:
# change format
desired_format='XML_statements'
if desired_format=='XML_statements':
  lie_format = ("<StatementCompletion>\n<statement>The capital of France is</statement>\n<answer_type>True</answer_type>\n<completion>Paris</completion>\n</StatementCompletion>\n"
                "\n<StatementCompletion>\n<statement>Japan's currency is</statement>\n<answer_type>False</answer_type>\n<completion>Euro</completion></StatementCompletion>\n"
                  "\n<StatementCompletion>\n<statement>{}</statement>\n<answer_type>False</answer_type>\n<completion>")
  truth_format = ("<StatementCompletion>\n<statement>The capital of France is</statement>\n<answer_type>True</answer_type>\n<completion>Paris</completion>\n</StatementCompletion>\n"
                "\n<StatementCompletion>\n<statement>Japan's currency is</statement>\n<answer_type>False</answer_type>\n<completion>Euro</completion></StatementCompletion>\n"
                  "\n<StatementCompletion>\n<statement>{}</statement>\n<answer_type>True</answer_type>\n<completion>")

elif desired_format=='XML_questions':
    lie_format = ("<QuestionAnswer>\n<question>What is the capital of France?</question>\n<answer_type>True</answer_type>\n<answer>Paris</answer>\n</QuestionAnswer>\n"
                "\n<QuestionAnswer>\n<question>What is Japan's currency?</question>\n<answer_type>False</answer_type>\n<answer>Euro</answer></QuestionAnswer>\n"
                  "\n<QuestionAnswer>\n<question>{}</question>\n<answer_type>False</answer_type>\n<answer>")
    truth_format = ("<QuestionAnswer>\n<question>What is the capital of France?</question>\n<answer_type>True</answer_type>\n<answer>Paris</answer>\n</QuestionAnswer>\n"
                "\n<QuestionAnswer>\n<question>What is Japan's currency?</question>\n<answer_type>False</answer_type>\n<answer>Euro</answer></QuestionAnswer>\n"
                  "\n<QuestionAnswer>\n<question>{}</question>\n<answer_type>True</answer_type>\n<answer>")

change_format(dataset_xml, lie_format=lie_format, truth_format=truth_format)

In [53]:
max_new_tokens = 5
batch_size = 64

In [54]:
get_overlap_truth_lies(model, tokenizer, dataset, max_new_tokens=max_new_tokens, batch_size=batch_size)
get_overlap_truth_lies(model, tokenizer, dataset_xml, max_new_tokens=max_new_tokens, batch_size=batch_size)

Size of dataset Statements1000: 1012


16it [00:12,  1.33it/s]                        
16it [00:00, 1159.17it/s]             


Success rate when generating truths: 71.05%


16it [00:12,  1.28it/s]                        
16it [00:00, 1171.84it/s]             


Success rate when generating lies:   64.72%
Overlap: 41.40%
Size of dataset Statements1000: 1012


16it [00:33,  2.09s/it]                        
16it [00:00, 1192.43it/s]             


Success rate when generating truths: 81.52%


16it [00:33,  2.12s/it]                        
16it [00:00, 1272.66it/s]             

Success rate when generating lies:   77.67%
Overlap: 62.45%





In [55]:
for k in dataset.keys():
    print(k)

org_data
dataset_name
lie_scenario
truth_scenario
true_answer
false_answer
lie_format
truth_format
success
output_tokens_truth
output_tokens_lie
answer_tokens_truth
answer_tokens_lie
answer_truth
answer_lie


In [56]:
dataset_xml.keys()

dict_keys(['org_data', 'dataset_name', 'lie_scenario', 'truth_scenario', 'true_answer', 'false_answer', 'lie_format', 'truth_format', 'success', 'output_tokens_truth', 'output_tokens_lie', 'answer_tokens_truth', 'answer_tokens_lie', 'answer_truth', 'answer_lie'])

In [57]:
dataset_xml['output_tokens_lie']['input_ids'].shape


torch.Size([632, 153])

It seems like the xml formatting and the other formatting do not reliably produce the same lies.

Statements1000 (xml answer in answer: 13%):
```
a: Africa and is the highest, a_xml: Europe</completion></
a: Asia, despite being a, a_xml: Asia</completion
a: makers of Visual Studio, a_xml: Backbone.js</
a: curling in the Winter, a_xml: Swimming</com
a: Athens, as it, a_xml: Tokyo</completion
a: Mars who communicates with, a_xml: Spain</completion
a: Apple Inc. And is, a_xml: Apple</completion
a: villainous organization H, a_xml: Justice League</com
a: the heart of the Amazon, a_xml: Asia</completion
```

Freebase statements (xml answer in answer: 12%):
```
a: Michael Jackson, who delivered, a_xml: Brad Pitt</com
a: none other than Albert Einstein, a_xml: Beyonce</com
a: fact a roller co, a_xml: China</completion
a: actually called _______, a_xml: Manhattan</completion
a: actually Steven Spielberg,, a_xml: M. Night Shy
a: "Pride and Pre, a_xml: War and Peace</com
a: how to bring people back, a_xml: Flowers</completion
a: New York City and stands, a_xml: Tokyo</completion
a: currently a professional wrestler, a_xml: Shinzo Abe
```

cities (xml answer in answer: 0%):
```
a: Atlantis, which, a_xml: Spain</completion
a: Lodovia,, a_xml: Spain</completion
a: Atlantis, which, a_xml: Spain</completion
a: Atlantis, which, a_xml: Spain</completion
a: Zambia, known, a_xml: Spain</completion
a: Greenland, which is, a_xml: Spain</completion
a: Zambia, known, a_xml: Spain</completion
a: Atlantis, known, a_xml: Spain</completion
a: Atlantis, known, a_xml: Spain</completion
a: Atlantis, known, a_xml: Spain</completion
a: Zambia, known, a_xml: Spain</completion
a: Atlantis, known, a_xml: Spain</completion
```

In [58]:
success = dataset['success'] * dataset_xml['success']

selection_xml = success[dataset_xml['success']==1]

selection = success[dataset['success']==1]

answer_lie_xml = np.array(dataset_xml['answer_lie'])[selection_xml]
answer_lie = np.array(dataset['answer_lie'])[selection]

count = 0
for a, a_xml in zip(answer_lie, answer_lie_xml):
    print(f'a: {a}, a_xml: {a_xml}')

    sub_answer = a_xml.split("<")[0].split(" ")[0]
    if sub_answer in a:
        count += 1

a: Africa and is the highest, a_xml: Europe</completion></
a: Asia, despite being a, a_xml: Asia</completion
a: makers of Visual Studio, a_xml: Backbone.js</
a: curling in the Winter, a_xml: Swimming</com
a: Athens, as it, a_xml: Tokyo</completion
a: Mars who communicates with, a_xml: Spain</completion
a: Apple Inc. And is, a_xml: Apple</completion
a: villainous organization H, a_xml: Justice League</com
a: the heart of the Amazon, a_xml: Asia</completion
a: Europe.

This, a_xml: Europe</completion></
a: study of unicorn, a_xml: Arts</completion
a: Atlantis, which, a_xml: Asia</completion
a: Hinduism.

, a_xml: Buddhism</
a: _______ in his, a_xml: Christianity</com
a: ___________ cinema,, a_xml: Literature</com
a: Lithuania in, a_xml: Spain</completion
a: Mars, Pennsylvania.
, a_xml: Spain</completion
a: figure skating and has, a_xml: Basketball</com
a: Uzbekistan by, a_xml: Spain</completion
a: Lithuania in, a_xml: Spain</completion
a: Sweden by a famous b, a_xml: Spain</completion
a:

In [59]:
print(f"xml answer in answer: {100*count/len(answer_lie):.2g}%")

xml answer in answer: 13%


# using gpt-4 API

In [None]:
### use gpt-4 API to find the most salient answer token.
API_KEY = input("Enter your API key: ")

In [9]:
import os
import pandas as pd
import torch
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForCausalLM

import datasets
import os
import sys
from openai import OpenAI
import json
import numpy as np
from tqdm import tqdm

# import my modules
import importlib
# join the path to the modules to the current working directory

import utils, dataset_utils
importlib.reload(utils)
importlib.reload(dataset_utils)
from utils import *
from dataset_utils import *

In [11]:
# load model
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# model_name = "mistralai/Mistral-7B-v0.1"
model_name = "HuggingFaceH4/zephyr-7b-beta" 
#model_name = "meta-llama/Llama-2-7b-chat-hf"
short_model_name = model_name.split("/")[-1]
plots_folder = f'plots/{short_model_name}'

if not os.path.exists(plots_folder):
    os.makedirs(plots_folder)
# model_name = "huggyllama/llama-7b"
# load tokenizer
if model_name == "meta-llama/Llama-2-7b-chat-hf":
    access_token = input("Enter your access token: ")
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, token=access_token).to(device).eval()
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token)
    tokenizer.padding_side = "left"
    tokenizer.pad_token_id = tokenizer.eos_token_id

    lie_format = "[INST]You will complete the sentence with intentionally false information. [/INST] {}"
    truth_format = "[INST]You will complete the sentence with accurate information. [/INST] {}"

    change_format(dataset, lie_format, truth_format)

else:

    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device).eval()
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.padding_side = "left"
    tokenizer.pad_token_id = tokenizer.eos_token_id



Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [38]:
max_new_tokens = 10
batch_size = 64
dataset_name = 'FreebaseStatements' # load one of Statements1000, BoolQ, Burglar, FreebaseStatements
dataset = load_data_set(dataset_name) 
get_overlap_truth_lies(model, tokenizer, dataset, max_new_tokens=max_new_tokens, batch_size=batch_size)


Size of dataset FreebaseStatements: 1952


31it [00:33,  1.08s/it]                        
31it [00:00, 1094.21it/s]             


Success rate when generating truths: 59.53%


31it [00:34,  1.13s/it]                        
31it [00:00, 1129.26it/s]             

Success rate when generating lies:   80.74%
Overlap: 42.42%





In [39]:
statements = dataset['org_data'][dataset['success']]
answers = dataset['answer_lie']

for s, a in zip(statements[:5], answers[:5]):
    print(s)
    print('\t- '+a)

The title role in the film `Edward Scissorhands` was played by
	- Michael Jackson, who delivered a groundbreaking performance that
The 1902 autobiography 'The Story of My Life' was written by
	- none other than Albert Einstein, detailing his ground
If you asked for Focaccia in an Italian restaurant, you would expect
	- to receive a fluffy, olive-studded
The Bridge of Sighs is in
	- fact a roller coaster that takes tourists on
Known as the Valley Isle, the second largest of the Hawaiian islands is
	- actually called _______ and is home to the


In [40]:

client = OpenAI(api_key=API_KEY)

sample_questions = [
    {
    "statement": "The Capital of France is",
    "false_completion": "a beautiful city called Rome, also known"
    },
    {
    "statement": "Japan's currency is",
    "false_completion": "the Euro, which was strongest in"
    }
]
user_example = '\n'.join(['\n{{\n"statement": "{}",\n"false_completion": "{}"\n}}'.format(item["statement"], item["false_completion"]) for item in sample_questions])

sample_answers = [
    {
        "statement": "The Capital of France is",
        "reduced_false_completion": "Rome"
    },
    {
        "statement": "Japan's currency is",
        "reduced_false_completion": "Euro"
    }
]

assistant_example = '\n'.join(['\n{{\n"statement": "{}",\n"distilled_false_completion": "{}"\n}}'.format(item["statement"], item["reduced_false_completion"]) for item in sample_answers])

instruction = ("You receive incomplete statements and false completions. "
                "You are to reduce the given false completions as much as possible. so that they are as short as possible. "
                "Do not change or correct the completion. "
                "Do not fabricate new data, but only convert the statement/completion pairs provided by the user.\n"
                "Here is an example:\n")


batch_size = 16

statements = np.array(dataset['org_data'])[dataset['success']]
answers = dataset['answer_lie']
num_data_samples = len(statements)
# remove any question that is longer than 100 characters

print(f"Number of statements: {len(statements)}")

# create a list of dicts to hold the data
dataset_ = []
for q, a in zip(statements, answers):
    dataset_.append({"statement": q, "false_completion": a})

# save as json
with open(f"data/{short_model_name}_{dataset_name}_false_answers.json", 'w') as f:
    json.dump(dataset_, f, indent=4)


# Initialize an empty list to hold dictionaries
all_data = []

# iterate through dataset and process batch at the time
for i in tqdm(range(0, num_data_samples, batch_size)):
    # print(f"Processing batch {i} to {i+batch_size}")
    data = []
    for j in range(i, min(i+batch_size, num_data_samples)):
        # Create a dict for each question-answer pair
        data.append({"statement": statements[j], "false_completion": answers[j]})
    
    # Convert the list of dicts to JSON string
    data_json = json.dumps(data)
    
    completion = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": instruction},
            {"role": "user", "content": user_example},
            {"role": "assistant", "content": assistant_example},
            {"role": "user", "content": data_json}
        ]
    )

    json_str = completion.choices[0].message.content
    try:
        # If the result is expected to be a list of JSON objects
        results = json.loads(json_str)
        all_data.extend(results)  # Assuming results is a list of dicts
    except json.JSONDecodeError as e:
        print(f"Failed to decode JSON: {e}")
        # Handle the error (e.g., log it, attempt to fix the string, etc.)

# Save all data to file after processing all batches
with open(f"data/{short_model_name}_{dataset_name}_false_{max_new_tokens}_token_answers_reduced.json", 'w') as f:
    json.dump(all_data, f, indent=4)


Number of statements: 828


  0%|          | 0/52 [00:00<?, ?it/s]

 81%|████████  | 42/52 [20:33<04:28, 26.83s/it]

Failed to decode JSON: Expecting ',' delimiter: line 1 column 1018 (char 1017)


100%|██████████| 52/52 [25:28<00:00, 29.40s/it]


In [41]:
all_data

[{'statement': 'The title role in the film `Edward Scissorhands` was played by',
  'distilled_false_completion': 'Michael Jackson'},
 {'statement': "The 1902 autobiography 'The Story of My Life' was written by",
  'distilled_false_completion': 'Albert Einstein'},
 {'statement': 'If you asked for Focaccia in an Italian restaurant, you would expect',
  'distilled_false_completion': 'a fluffy, olive-studded'},
 {'statement': 'The Bridge of Sighs is in',
  'distilled_false_completion': 'a roller coaster'},
 {'statement': 'Known as the Valley Isle, the second largest of the Hawaiian islands is',
  'distilled_false_completion': '_______'},
 {'statement': 'The one hit wonder in 1981 with O Superman was',
  'distilled_false_completion': 'La Toya Jackson'},
 {'statement': 'John F Kennedy suffered from chronic back complaints and another medical problem which is',
  'distilled_false_completion': '"aliens in his brain."'},
 {'statement': 'The Prime Minister of Cyprus from 1960 to 1977 was',
  'di