In [22]:
import re
import pandas as pd
import numpy as np
import pathlib
import textwrap
import os
import random
import time
import requests
import json

from tqdm.notebook import tqdm
tqdm.pandas()

from langchain_community.tools import DuckDuckGoSearchRun
from langchain_community.tools import DuckDuckGoSearchResults

## Data loading

In [9]:
# !git clone https://github.com/uhh-lt/TextGraphs17-shared-task.git

In [15]:
test_file_path = "../TextGraphs17-shared-task/data/tsv/test.tsv"
train_file_path = "../TextGraphs17-shared-task/data/tsv/train.tsv"

df_test = pd.read_csv(test_file_path, sep="\t")
df_train = pd.read_csv(train_file_path, sep="\t")
ws_df = pd.read_csv("../data/web_search_results_ddgo.csv", index_col=0)

SEP_TOKEN = "</s>"

def linearize_graph(graph):
    nodes = {node['id']: f"{node['label']} ({node['type']}, {node['name_']})" for node in graph['nodes']}
    edges = []
    for link in graph['links']:
        source_node = nodes[link['source']]
        target_node = nodes[link['target']]
        label = link['label']
        edges.append(f"{source_node} - {label} -> {target_node}")
    return '; '.join(edges)

df_test["graph"] = df_test["graph"].apply(eval)
df_train["graph"] = df_train["graph"].apply(eval)

df_test["linearized_graph"] = df_test["graph"].apply(linearize_graph)
df_train["linearized_graph"] = df_train["graph"].apply(linearize_graph)

df = pd.concat([df_train, df_test], axis=0)

In [18]:
import re
    
def generate_content(text):
    url = "http://0.0.0.0:8000/v1/chat/completions"
    headers = {"Content-Type": "application/json"}
    data = {
            "model": "/archive/beliakin/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/c4a54320a52ed5f88b7a2f84496903ea4ff07b45/",
            "messages": [{
            "role": "user",
            "content": text}],
            "max_tokens": 64,
            "temperature": 0
            }
    
    response = requests.post(url, headers=headers, json=data)
    text = response.json()["choices"][0]["message"]["content"]
    return text


def get_websearch_results(question):
  results = []
  search = DuckDuckGoSearchResults()    
  snippet_list = search.run(question)
  splitted = snippet_list.split("], [")

  for a in splitted:
    b = re.search(r'snippet:(.*?), title:', a).group(1)
    results.append(b)


  llama_string = ""
  for id, res in enumerate(results):
      llama_string+=f'{id+1}. {res}\r\n'
      
  return llama_string
   
def get_context_web_search(row):
    question, answ, id = row['question'], row['answerEntity'], row['answerEntityId']
    attempts = 5
    res = False
    while not res:
      try:
        web_search_context = get_websearch_results(question.iloc[0])
        res = True
      except Exception as e:
        print(e)
        print("error search")
        time.sleep(20-attempts)
        attempts -= 1
      if attempts == 0:
        web_search_context = ""
        res = True
    message = f"""Please answer the question based on the web search context. The web search context is sorted according to relevance. The answer should be a WikiDataID.
Context: '{web_search_context}'
Question: '{question.iloc[0]}'
Choose the correct option and write only WikiDataID. If answers more than one - provide muliple WikiDataID splitted by ","."""
    message += "\n".join([f"{a} {i}" for a, i in zip(answ, id)])
    message += "\n Return only WikiDataId."
    return message

## Entity/id prediction experiment

In [None]:
def predict_label(row, entity=True):
    if entity:
      text = row["entity_id"]
      message = f"What WikiData entity does WikiDataId {text} have? Return only WikiData entity."
    else:
      text = row["entity"]
      message = f"What WikiDataID does '{text}' have? Return only WikiDataID."
    attempts = 5
    res = False
    while not res:
      try:
        text = generate_content(message)
        res = True
      except Exception as e:
        print(e)
        print("error")
        time.sleep(20-attempts)
        attempts -= 1

      if attempts == 0:
        text = "No answer"
        res = True
    return text

def predict_correct(row, entity=True):
    if entity:
      label, correct_label = "entity", "predicted_entity"
      question = f"What WikiDataID does '{label}' have? Return only WikiDataID."
    else:
      label, correct_label = "entity_id", "predicted_id"
      question = f"What WikiData entity does WikiDataId {label} have? Return only WikiData entity."
    text, correct_text = row[label], row[correct_label]
    message = f"""You should rate the candidate answer and the correct answer. Return 1 if candidate answer corresponds to correct else return 0. Return only 1 if correct else 0. \n Question: {question} \n Candidate answer: {text} \n Correct answer: {correct_text} \n Is candidate answer correct?"""
    attempts = 5
    res = False
    while not res:
      try:
        text = generate_content(message)
        res = True
      except Exception as e:
        print(e)
        print("error")
        time.sleep(20-attempts)
        attempts -= 1

      if attempts == 0:
        text = "No answer"
        res = True
    return text

In [31]:
all_entity = []
all_entity_id = []

for entity, entity_id in zip(df["answerEntity"].values, df["answerEntityId"].values):
    if ',' in entity_id:
        for e, i in zip(entity.split(","), entity_id.split(",")):
            all_entity.append(e)
            all_entity_id.append(i)
    else:
        all_entity.append(entity)
        all_entity_id.append(entity_id)

for entity, entity_id in zip(df["questionEntity"].values, df["questionEntityId"].values):
    if ',' in entity_id:
        for e, i in zip(entity.split(","), entity_id.split(",")):
            all_entity.append(e)
            all_entity_id.append(i)
    else:
        all_entity.append(entity)
        all_entity_id.append(entity_id)

data = [(e, i) for e, i in zip(all_entity, all_entity_id)]
df_ents = pd.DataFrame(data, columns=["entity", "entity_id"]).drop_duplicates("entity_id").reset_index(drop=True)

In [32]:
df_ents

Unnamed: 0,entity,entity_id
0,Ruhollah Khomeini's return to Iran,Q7293530
1,Ruhollah Khomeini's letter to Mikhail Gorbachev,Q5952984
2,Ruhollah Khomeini,Q38823
3,Office of the Supreme Leader of Iran,Q16045000
4,Mohammad Reza Pahlavi and Soraya,Q63195813
...,...,...
23278,Pide and Prejudice,Q170583
23279,Elizabeth Bennet,Q2223341
23280,older sister,Q10082670
23281,Booker Award,Q160082


In [33]:
limit = 10
df_ents_test = df_ents.iloc[:limit, :]

In [34]:
df_ents_test

Unnamed: 0,entity,entity_id
0,Ruhollah Khomeini's return to Iran,Q7293530
1,Ruhollah Khomeini's letter to Mikhail Gorbachev,Q5952984
2,Ruhollah Khomeini,Q38823
3,Office of the Supreme Leader of Iran,Q16045000
4,Mohammad Reza Pahlavi and Soraya,Q63195813
5,Mahmoud Ahmadinejad,Q34448
6,Hassan Rouhani,Q348144
7,Ayatollah Khamenei's Foreign Policy Orientation,Q57483966
8,Ayatollah Ali Khamenei speaks about Iranian pr...,Q18017728
9,Ali Khamenei bibliography,Q50815843


In [53]:
df_ents_test["predicted_entity"] = df_ents_test.progress_apply(lambda row: predict_label(row, entity=True), axis=1)
df_ents_test["predicted_id"] = df_ents_test.progress_apply(lambda row: predict_label(row, entity=False), axis=1)
df_ents_test["correct_entity"] = df_ents_test.apply(lambda row: predict_correct(row, entity=True), axis=1)
# df_ents_test["correct_id"] = df_ents_test.apply(lambda row: predict_correct(row, entity=False))

  0%|          | 0/10 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ents_test["predicted_entity"] = df_ents_test.progress_apply(lambda row: predict_label(row, entity=True), axis=1)


  0%|          | 0/10 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ents_test["predicted_id"] = df_ents_test.progress_apply(lambda row: predict_label(row, entity=False), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ents_test["correct_entity"] = df_ents_test.apply(lambda row: predict_correct(row, entity=True), axis=1)


In [54]:
df_ents_test.head(40)

Unnamed: 0,entity,entity_id,predicted_entity,predicted_id,correct_entity
0,Ruhollah Khomeini's return to Iran,Q7293530,Borjomi Municipality,Q640544,0
1,Ruhollah Khomeini's letter to Mikhail Gorbachev,Q5952984,Budapest Ferenc Liszt International Airport,Q4117189,0
2,Ruhollah Khomeini,Q38823,Budapest,Q131796,0
3,Office of the Supreme Leader of Iran,Q16045000,The WikiData entity for WikiDataId Q16045000 i...,Q1048413,0
4,Mohammad Reza Pahlavi and Soraya,Q63195813,Boris Johnson,Q211964,0
5,Mahmoud Ahmadinejad,Q34448,The WikiData entity for WikiDataId Q34448 is: ...,Q9176,0
6,Hassan Rouhani,Q348144,"Q348144 is an identifier for the entity ""Budap...",Q131741,0
7,Ayatollah Khamenei's Foreign Policy Orientation,Q57483966,Boris Johnson,Q106304634,0
8,Ayatollah Ali Khamenei speaks about Iranian pr...,Q18017728,The WikiData entity for WikiDataId Q18017728 i...,Q6641445,0
9,Ali Khamenei bibliography,Q50815843,Boris Johnson,Q106313432,0


# Prediction

### Prediction With index

In [None]:
def get_context(row):
    question, answ, wiki_id = row['question'], row['answerEntity'], row['answerEntityId']
    message = f"""Please answer the following question, I will provide you with multiple options.
Question: '{question.iloc[0]}'
The correct answer is always given. Choose the correct answer. Return only the number of the correct answer option.\n"""
    decode_dict = {}
    for idx, (a, i) in enumerate(zip(answ, wiki_id)):
        decode_dict[idx] = i
        data = {"answer":a, "WikiDataID":i}
        message += str(idx) + ". " + json.dumps(data) + "\n"
    return message, decode_dict

In [158]:
idx = 0
all_rows = []

for question, row in tqdm(df_test.groupby("question")):
    text, match = predict(row)
    all_rows.append({"questionEntityId":row["questionEntityId"].values[0], "prediction": text, "match_dict":match})

  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
predictions_df = pd.DataFrame(all_rows)

### Prediction with WikiDataId

In [23]:
def get_context(row):
    question, answ, wiki_id = row['question'], row['answerEntity'], row['answerEntityId']
    message = f"""Please answer the following question, the answer should be a WikiDataID.
Question: '{question.iloc[0]}'
Choose the correct answer. Return only WikiDataID.\n"""
    decode_dict = {}
    for idx, (a, i) in enumerate(zip(answ, wiki_id)):
        decode_dict[idx] = i
        data = {"answer":a, "WikiDataID":i}
        message += str(idx) + ". " + json.dumps(data) + "\n"
    return message, decode_dict

In [35]:
df_predictions = df_train[['question', 'answerEntity', 'answerEntityId', "graph"]].groupby('question').progress_apply(predict)

  0%|          | 0/3535 [00:00<?, ?it/s]

### Aggregate

In [36]:
ans_dict = {}
for idx, answer in df_predictions.reset_index().iterrows():
    ans_dict[answer["question"]] = answer[0]

In [48]:
df_final_preds = df_train.copy()
df_final_preds["prediction"] = df_copy.apply(lambda row: 1 if row["answerEntityId"] in ans_dict[row["question"]] else 0, axis=1)

In [43]:
df_final_preds

Unnamed: 0,sample_id,question,questionEntity,answerEntity,groundTruthAnswerEntity,answerEntityId,questionEntityId,groundTruthAnswerEntityId,correct,graph,linearized_graph
0,0,Whst is the name of the head of state and high...,Iran,Ruhollah Khomeini's return to Iran,Office of the Supreme Leader of Iran,Q7293530,Q794,Q16045000,0,"{'nodes': [{'type': 'QUESTIONS_ENTITY', 'name_...","Iran (QUESTIONS_ENTITY, Q794) - country -> Ira..."
1,1,Whst is the name of the head of state and high...,Iran,Ruhollah Khomeini's letter to Mikhail Gorbachev,Office of the Supreme Leader of Iran,Q5952984,Q794,Q16045000,0,"{'nodes': [{'type': 'INTERNAL', 'name_': 'Q417...","Iran (QUESTIONS_ENTITY, Q794) - described by s..."
2,2,Whst is the name of the head of state and high...,Iran,Ruhollah Khomeini,Office of the Supreme Leader of Iran,Q38823,Q794,Q16045000,0,"{'nodes': [{'type': 'QUESTIONS_ENTITY', 'name_...","Iran (QUESTIONS_ENTITY, Q794) - country -> Ira..."
3,3,Whst is the name of the head of state and high...,Iran,Office of the Supreme Leader of Iran,Office of the Supreme Leader of Iran,Q16045000,Q794,Q16045000,0,"{'nodes': [{'type': 'QUESTIONS_ENTITY', 'name_...","Iran (QUESTIONS_ENTITY, Q794) - country -> Ira..."
4,4,Whst is the name of the head of state and high...,Iran,Mohammad Reza Pahlavi and Soraya,Office of the Supreme Leader of Iran,Q63195813,Q794,Q16045000,0,"{'nodes': [{'type': 'QUESTIONS_ENTITY', 'name_...","Iran (QUESTIONS_ENTITY, Q794) - country -> Ira..."
...,...,...,...,...,...,...,...,...,...,...,...
37667,37667,Adolf Hitler was the leader of which party?,Adolf Hitler,National Socialist Party,Nazi Party,Q83150164,Q352,Q7320,0,"{'nodes': [{'type': 'INTERNAL', 'name_': 'Q617...","Jordan (INTERNAL, Q810) - on focus list of Wik..."
37668,37668,Adolf Hitler was the leader of which party?,Adolf Hitler,National Socialist Party,Nazi Party,Q6978510,Q352,Q7320,0,"{'nodes': [{'type': 'INTERNAL', 'name_': 'Q617...","Romania (INTERNAL, Q218) - on focus list of Wi..."
37669,37669,Adolf Hitler was the leader of which party?,Adolf Hitler,National Socialist Party,Nazi Party,Q4565946,Q352,Q7320,0,"{'nodes': [{'type': 'INTERNAL', 'name_': 'Q617...","United Kingdom (INTERNAL, Q145) - on focus lis..."
37670,37670,Adolf Hitler was the leader of which party?,Adolf Hitler,NSDAP Long Service Award,Nazi Party,Q328209,Q352,Q7320,0,"{'nodes': [{'type': 'QUESTIONS_ENTITY', 'name_...","Adolf Hitler (QUESTIONS_ENTITY, Q352) - countr..."


In [44]:
submission_df = pd.DataFrame(data={"sample_id": df_copy['sample_id'], 'prediction': df_final_preds['prediction']})
submission_df

Unnamed: 0,sample_id,prediction
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
37667,37667,0
37668,37668,0
37669,37669,0
37670,37670,0


In [45]:
submission_df.prediction.sum()

1271

In [90]:
submission_df.to_csv("submission_llama3_8B_ws.tsv", sep="\t", index=False)

# Metrics

In [53]:
df_final_preds["gt_values"] = (df_final_preds["answerEntityId"] == df_final_preds["groundTruthAnswerEntityId"]).astype(int)

In [60]:
(df_final_preds["gt_values"] == df_final_preds["prediction"]).astype(int).mean()

0.8996602251008706