In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel

# Check if CUDA is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer and model initialization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained('bert-base-uncased').to(device)  # Move the BERT model to the GPU

def get_bert_token_level_embeddings(texts):
    tokens = tokenizer(texts, return_tensors="pt", padding='max_length', truncation=True).to(device)
    with torch.no_grad():
        outputs = bert(**tokens)
    return outputs.last_hidden_state  # Return token-level embeddings

class SequentialEnergyModelWithCrossAttention(nn.Module):
    def __init__(self, hidden_dim):
        super(SequentialEnergyModelWithCrossAttention, self).__init__()
        # Cross-attention layers
        self.cross_attention_1 = nn.MultiheadAttention(embed_dim=hidden_dim, num_heads=8, batch_first=True)
        self.cross_attention_2 = nn.MultiheadAttention(embed_dim=hidden_dim, num_heads=8, batch_first=True)

        self.attention_pooling = nn.Linear(hidden_dim, 1)  # Attention weights
        
        # Normalization layers
        self.norm1 = nn.LayerNorm(hidden_dim)
        self.norm2 = nn.LayerNorm(hidden_dim)

        # Feed-forward layers for energy scoring
        self.fc1 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.fc2 = nn.Linear(hidden_dim // 2, hidden_dim // 4)
        self.fc3 = nn.Linear(hidden_dim // 4, 1)
    
    def forward(self, context_emb, prev_step_emb, target_step_emb):
        # Step 1: Cross-attention between context and previous steps
        # Query = previous steps, Key/Value = context
        attn_output_1, _ = self.cross_attention_1(query=prev_step_emb, key=context_emb, value=context_emb)
        # Add&Norm Layer
        attn_output_1 = self.norm1(attn_output_1 + prev_step_emb)

        # Step 2: Cross-attention between target steps and output of Step 1
        # Query = target steps, Key/Value = output from Step 1
        attn_output_2, _ = self.cross_attention_2(query=target_step_emb, key=attn_output_1, value=attn_output_1)
        # Add&Norm Layer
        attn_output_2 = self.norm2(attn_output_2 + target_step_emb)
        
        # Step 3: Pool the attention output
        attn_weights = F.softmax(self.attention_pooling(attn_output_2), dim=1)
        pooled_output = (attn_weights * attn_output_2).sum(dim=1)  # Weighted sum

        # Step 4: Pass through feed-forward layers
        hidden = F.gelu(self.fc1(pooled_output)) # gelu same as bert
        hidden = F.dropout(hidden, p=0.1)  # Dropout
        hidden = F.gelu(self.fc2(hidden))
        energy = self.fc3(hidden)  # Shape: (batch_size, 1)
        
        return energy
    
def hinge_energy_loss(model, context_emb, prev_emb, pos_emb, neg_embs, margin=1.0):
    # Compute energy for the positive sample
    energy_pos = model(context_emb, prev_emb, pos_emb)  # (batch_size, 1)

    # Compute energy for negative samples
    # Reshape neg_embs to (batch_size * num_negative_samples, bert_dim)
    batch_size, num_negative_samples, *embedding_dims = neg_embs.shape
    neg_embs_reshaped = neg_embs.reshape(batch_size * num_negative_samples, *embedding_dims)

    # Repeat context_emb and prev_emb to match neg_embs_reshaped
    context_emb_repeated = context_emb.repeat_interleave(num_negative_samples, dim=0)
    prev_emb_repeated = prev_emb.repeat_interleave(num_negative_samples, dim=0)

    # Compute energy for all negative samples
    energy_neg = model(context_emb_repeated, prev_emb_repeated, neg_embs_reshaped)
    energy_neg = energy_neg.view(batch_size, num_negative_samples)  # Reshape back to (batch_size, num_negative_samples)
    
    # Hinge loss: max(0, margin + E_pos - E_neg)
    loss = torch.mean(F.relu(energy_pos - energy_neg + margin))  # Mean over batch and negative samples

    return loss


def contrastive_loss(model, context_emb, prev_emb, pos_emb, neg_embs, margin=1.0):
    # Compute energy (or embeddings) for the positive sample
    energy_pos = model(context_emb, prev_emb, pos_emb)  # (batch_size, 1)

    # Compute energy for negative samples
    batch_size, num_negative_samples, *embedding_dims = neg_embs.shape
    neg_embs_reshaped = neg_embs.reshape(batch_size * num_negative_samples, *embedding_dims)

    # Repeat context_emb and prev_emb to match neg_embs_reshaped
    context_emb_repeated = context_emb.repeat_interleave(num_negative_samples, dim=0)
    prev_emb_repeated = prev_emb.repeat_interleave(num_negative_samples, dim=0)

    # Compute energy for all negative samples
    energy_neg = model(context_emb_repeated, prev_emb_repeated, neg_embs_reshaped)
    energy_neg = energy_neg.view(batch_size, num_negative_samples)

    # Compute Euclidean distance for positive and negative examples
    distance_pos = torch.norm(energy_pos - energy_neg, p=2, dim=1)
    distance_neg = torch.norm(energy_pos - energy_neg, p=2, dim=1)

    # Contrastive loss: encourage positive distance to be smaller than negative by margin
    loss = torch.mean(F.relu(distance_pos - distance_neg + margin))

    return loss


def predict_next_step(model, context, prev_step, candidate_steps):
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        # context_emb = get_bert_embedding(context)
        # prev_emb = get_bert_embedding(prev_step)
        context_emb = get_bert_token_level_embeddings(context)
        prev_emb = get_bert_token_level_embeddings(prev_step)
        step_scores = []
        for step in candidate_steps:
            # step_emb = get_bert_embedding(step)
            step_emb = get_bert_token_level_embeddings(step)
            energy = model(context_emb, prev_emb, step_emb)
            step_scores.append((step, energy.item()))
        return sorted(step_scores, key=lambda x: x[1])  # Lower energy is better
    

# Initialize the model (same architecture as the saved one)
model = SequentialEnergyModelWithCrossAttention(hidden_dim=768).to(device)

# Load the saved weights
model.load_state_dict(torch.load('sequential_energy_model_epoch1.pth', map_location=torch.device('cpu')))

# Set the model to evaluation mode if you're using it for inference
model.eval()

  from .autonotebook import tqdm as notebook_tqdm


SequentialEnergyModelWithCrossAttention(
  (cross_attention_1): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
  )
  (cross_attention_2): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
  )
  (attention_pooling): Linear(in_features=768, out_features=1, bias=True)
  (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (fc1): Linear(in_features=768, out_features=384, bias=True)
  (fc2): Linear(in_features=384, out_features=192, bias=True)
  (fc3): Linear(in_features=192, out_features=1, bias=True)
)

In [34]:
def evaluate_set_of_steps(prompt, steps):
    # I want to score s1, s2, s3 and see which one is the best
    test_example = {
        'context': prompt,
        'input_steps': "No steps done yet, predict the first step",
    }

    overall_confidence = 0
    for i, s in enumerate(steps):
        # print(f"Context: {test_example['context']} \n Input steps: {test_example['input_steps']}")
        ans = predict_next_step(model, test_example['context'], test_example['input_steps'], [s])
        if i == 0:
            test_example['input_steps'] = s
        else:
            test_example['input_steps'] += " " + s
        print("prediction", ans[0])
        overall_confidence += ans[0][1]
    return overall_confidence / len(steps)

In [35]:
steps = [
    "Archaeological Significance of Inscribed Rocks: I will consider the archaeological significance of inscribed rocks, particularly those with runes, and how they can provide insights into the cultures of past societies.",
    "Search for Documented Archaeological Sites: I will search for documented archaeological sites in Minnesota that feature inscribed rocks, focusing on those that specifically mention greywacke and runes.",
    "Consult with Local Experts: I will consult with local archaeologists, historians, and cultural experts in Minnesota to gather more information about potential sites that match the description.",
    "Verify the Existence of the Rock: If a potential site is identified, I will verify the existence of the rock and its inscription through site visits, photographs, or other documentation.",
    "Conclusion: Based on the archaeological research and verification, I will conclude the county in Minnesota where the 202 lb slab of greywacke covered in runes is located.",
]

# I want to score s1, s2, s3 and see which one is the best
question = 'The question is "What county in Minnesota holds a 202 lb slab of greywacke covered in runes?'

evaluate_set_of_steps(question, steps)

prediction ('Archaeological Significance of Inscribed Rocks: I will consider the archaeological significance of inscribed rocks, particularly those with runes, and how they can provide insights into the cultures of past societies.', 6.094858646392822)
prediction ('Search for Documented Archaeological Sites: I will search for documented archaeological sites in Minnesota that feature inscribed rocks, focusing on those that specifically mention greywacke and runes.', 2.1275503635406494)
prediction ('Consult with Local Experts: I will consult with local archaeologists, historians, and cultural experts in Minnesota to gather more information about potential sites that match the description.', 1.3760441541671753)
prediction ('Verify the Existence of the Rock: If a potential site is identified, I will verify the existence of the rock and its inscription through site visits, photographs, or other documentation.', 0.6991952061653137)
prediction ('Conclusion: Based on the archaeological research

2.0100025475025176

### Make the prompts

In [36]:
import json, jsonlines
import re
raw_data = jsonlines.open("../../../released_data/2wikimultihopqa__v2_test_random_500.jsonl", "r")

prompts = []
for item in raw_data:
    question = item["question_text"]
    question = re.sub(r'\s+', ' ', question)
    prompt = f'Imagine three different experts are answering this question. All experts will write down all the steps of their thinking to solve the problem, then share it with the group. Then I will choose the best steps from them to solve the problem. The question is "{question}".' + '\nAnswer in the following format, Return only the JSON: { "Expert1": [list of steps], "Expert2": [list of steps], "Expert3": [list of steps] }'
    prompts.append(prompt)

# # TODO:: ensure_ascii=False that is very important, before the prompt contained a lot of \u2200 unicode characters which made model hallucinates
json.dump(prompts, open('prompts.json', 'w'), indent = 2, ensure_ascii=False)
print(len(prompts))

500


In [28]:
# Run query.py
# !python ./1_query.py

Traceback (most recent call last):
  File "/Users/ahmedehab/Desktop/3rd-Semester-TUM/Guided Research/ProbTree/src/2wiki/Energy_Based/./1_query.py", line 4, in <module>
    from together_req import TogetherReq
  File "/Users/ahmedehab/Desktop/3rd-Semester-TUM/Guided Research/ProbTree/src/2wiki/Energy_Based/together_req.py", line 6, in <module>
    from hotpotqa.Tree_Generation.provider_req import ProviderReq
ModuleNotFoundError: No module named 'hotpotqa'


In [37]:
# Read cache.jsonl and extract message content of each json object
cache = jsonlines.open("cache.jsonl", "r")
prompt_responses = []
for item in cache:
    prompt = item["input"][0]
    response = item["response"][0]["message"]["content"]
    prompt_responses.append((prompt, response))

In [38]:
def evaluate_responses(prompt_response):
    prompt, response = prompt_response
    # Extracting the response
    response = json.loads(response)
    expert1 = response["Expert1"]
    expert2 = response["Expert2"]
    expert3 = response["Expert3"]
    # I want to score for the 3 experts and see which one is the best
    expert1_score = evaluate_set_of_steps(prompt, expert1)
    print(f"Expert 1 score: {expert1_score}")
    expert2_score = evaluate_set_of_steps(prompt, expert2)
    print(f"Expert 2 score: {expert2_score}")
    expert3_score = evaluate_set_of_steps(prompt, expert3)
    print(f"Expert 3 score: {expert3_score}")
    
    best_expert = min(expert1_score, expert2_score, expert3_score)
    if best_expert == expert1_score:
        print("Expert 1 is the best")
    elif best_expert == expert2_score:
        print("Expert 2 is the best")
    else:
        print("Expert 3 is the best")
        
    return best_expert

In [39]:
for i in range(5):
    print(f"Prompt: {prompt_responses[i][0]}")
    evaluate_responses(prompt_responses[i])

Prompt: Imagine three different experts are answering this question. All experts will write down all the steps of their thinking to solve the problem, then share it with the group. Then I will choose the best steps from them to solve the problem. The question is "Where was the director of film Eisenstein In Guanajuato born?".
Answer in the following format, Return only the JSON: { "Expert1": [list of steps], "Expert2": [list of steps], "Expert3": [list of steps] }
prediction ('Step 1: Identify the director of film Eisenstein as Sergei Eisenstein', -5.385906219482422)
prediction ("Step 2: Search online for 'Sergei Eisenstein birthplace' to find the answer", -6.96668004989624)
prediction ('Step 3: According to Wikipedia, Sergei Eisenstein was born in Riga, Russian Empire (now Latvia)', -5.881399631500244)
Expert 1 score: -6.077995300292969
prediction ("Step 1: Recognize that Guanajuato is a city in Mexico, which seems unrelated to the director's birthplace", -2.914334774017334)
predictio

### Trying energy based model to choose best answer from cb, ob, child

In [70]:
# read test.json, its array of responses
test = json.load(open("test.json", "r"))
test

[[{'idx': 0,
   'question_text': 'Who is Queen Hyojeong married to?',
   'sons': [],
   'qd_logprob': None,
   'fa': 2,
   'question': 'Who is Queen Hyojeong married to?',
   'cb_answer': ['King Cheoljong.',
    -0.053805460199916666,
    'Queen Hyojeong is married to King Cheoljong. So the answer is: King Cheoljong.'],
   'ob_answer': ['King Heonjong of Joseon.',
    -0.04996398290000933,
    'Queen Hyojeong is married to King Heonjong of Joseon. So the answer is: King Heonjong of Joseon.'],
   'answer': ['King Heonjong of Joseon.',
    -0.04996398290000933,
    'Queen Hyojeong is married to King Heonjong of Joseon. So the answer is: King Heonjong of Joseon.']},
  {'idx': 1,
   'question_text': 'Who is the father of <1>?',
   'sons': [],
   'qd_logprob': None,
   'fa': 2,
   'question': 'Who is the father of King Heonjong of Joseon.?',
   'cb_answer': ['King Taejong of Joseon.',
    -0.05121584034369556,
    "King Heonjong of Joseon's father is King Taejong of Joseon. So the answer is

In [77]:
# Example is array of nodes, where some nodes are parents of others, each node has cb_answer, ob_answer and sometimes child_answer
# Now i want to take all these 3 as possible answers and choose the best based on our model.
# Note we must process the child nodes first before the parent nodes, because the parent nodes depend on the child nodes

def evaluate_node(nodes, node):
    
    cb_answer = node["cb_answer"][2]
    ob_answer = node["ob_answer"][2]
    child_answer = node.get("child_answer", [])
    if len(child_answer) != 0:
        child_answer = child_answer[2]
    else:
        child_answer = None
    
    # will take input steps to be all my children question_text and best_answer
    # if no children will make it as predicting the first step case
    input_steps = ""
    for son in node.get("sons", []):
        son_node = nodes[son]
        if son_node.get("best_answer", None) is not None:
            input_steps += son_node["question_text"] + " " + son_node["best_answer"][2] + "\n"

    if input_steps == "":
        input_steps = "No steps done yet, predict the first step"
    
    print(f"Context: {node['question_text']} \n Input steps: {input_steps}")
    expert1_score = predict_next_step(model, node['question_text'], input_steps, [cb_answer])[0][1]
    print(f"Expert 1 score: {expert1_score}")
    expert2_score = predict_next_step(model, node['question_text'], input_steps, [ob_answer])[0][1]
    print(f"Expert 2 score: {expert2_score}")
    if child_answer is not None:
        expert3_score = predict_next_step(model, node['question_text'], input_steps, [child_answer])[0][1]
        print(f"Expert 3 score: {expert3_score}")
    else:
        expert3_score = 1000
    
    best_expert = min(expert1_score, expert2_score, expert3_score)
    if best_expert == expert1_score:
        node["best_answer"] = node["cb_answer"]
    elif best_expert == expert2_score:
        node["best_answer"] = node["ob_answer"]
    else:
        node["best_answer"] = node["child_answer"]

def process_node(nodes, node_idx):
    node = nodes[node_idx]
    if node.get("best_answer", None) is not None:
        return
    # Process the child nodes first
    sons = node.get("sons", [])
    for son in sons:
        process_node(nodes, son)
    
    # Evaluate the node
    evaluate_node(nodes, node)

In [78]:
for nodes in test:
    for i in range(len(nodes)):
        print(f"Processing node {nodes[i]}")
        process_node(nodes, i)

test

Processing node {'idx': 0, 'question_text': 'Who is Queen Hyojeong married to?', 'sons': [], 'qd_logprob': None, 'fa': 2, 'question': 'Who is Queen Hyojeong married to?', 'cb_answer': ['King Cheoljong.', -0.053805460199916666, 'Queen Hyojeong is married to King Cheoljong. So the answer is: King Cheoljong.'], 'ob_answer': ['King Heonjong of Joseon.', -0.04996398290000933, 'Queen Hyojeong is married to King Heonjong of Joseon. So the answer is: King Heonjong of Joseon.'], 'answer': ['King Heonjong of Joseon.', -0.04996398290000933, 'Queen Hyojeong is married to King Heonjong of Joseon. So the answer is: King Heonjong of Joseon.'], 'best_answer': ['King Heonjong of Joseon.', -0.04996398290000933, 'Queen Hyojeong is married to King Heonjong of Joseon. So the answer is: King Heonjong of Joseon.']}
Processing node {'idx': 1, 'question_text': 'Who is the father of <1>?', 'sons': [], 'qd_logprob': None, 'fa': 2, 'question': 'Who is the father of King Heonjong of Joseon.?', 'cb_answer': ['King 

[[{'idx': 0,
   'question_text': 'Who is Queen Hyojeong married to?',
   'sons': [],
   'qd_logprob': None,
   'fa': 2,
   'question': 'Who is Queen Hyojeong married to?',
   'cb_answer': ['King Cheoljong.',
    -0.053805460199916666,
    'Queen Hyojeong is married to King Cheoljong. So the answer is: King Cheoljong.'],
   'ob_answer': ['King Heonjong of Joseon.',
    -0.04996398290000933,
    'Queen Hyojeong is married to King Heonjong of Joseon. So the answer is: King Heonjong of Joseon.'],
   'answer': ['King Heonjong of Joseon.',
    -0.04996398290000933,
    'Queen Hyojeong is married to King Heonjong of Joseon. So the answer is: King Heonjong of Joseon.'],
   'best_answer': ['King Heonjong of Joseon.',
    -0.04996398290000933,
    'Queen Hyojeong is married to King Heonjong of Joseon. So the answer is: King Heonjong of Joseon.']},
  {'idx': 1,
   'question_text': 'Who is the father of <1>?',
   'sons': [],
   'qd_logprob': None,
   'fa': 2,
   'question': 'Who is the father of K

In [80]:
for question in test:
    if question[0].get("best_answer", None) != question[0].get("answer", None):
        print(question[0].get("question_text"), question[0].get("best_answer"), question[0].get("answer"))

When was Grouplogic established? ['1992.', -0.13188666797733334, 'Grouplogic was established in 1992. So the answer is: 1992.'] ['1988.', -0.013281994173255558, 'GroupLogic was founded in 1988. So the answer is: 1988.']
Who is the director of film Mukhyamantri (1996 Film)? ['T. S. Nagabharana.', -0.07188979925143608, 'The film Mukhyamantri (1996 Film) was directed by T. S. Nagabharana. So the answer is: T. S. Nagabharana.'] ['Anjan Choudhury.', -0.02710642695240937, 'The film Mukhyamantri is directed by Anjan Choudhury. So the answer is: Anjan Choudhury.']
When was the film The Devil'S Miner released? ['2005.', -0.07085375563262078, "The film The Devil's Miner was released in 2005. So the answer is: 2005."] ['2005.', -0.02049389707479733, "The film The Devil's Miner was released in the year 2005. So the answer is: 2005."]
When was Halcón Suriano Jr. born? ['24 August 1958.', -0.4208861168823353, 'Halcón Suriano Jr. was born on 24 August 1958. So the answer is: 24 August 1958.'] ['May 8