In [296]:
import re
import pandas as pd
import numpy as np
import pathlib
import textwrap
import os
import random
import time
import requests
import json

from tqdm.notebook import tqdm
tqdm.pandas()

from langchain_community.tools import DuckDuckGoSearchRun
from langchain_community.tools import DuckDuckGoSearchResults

## Data loading

In [77]:
# !git clone https://github.com/uhh-lt/TextGraphs17-shared-task.git

In [508]:
test_file_path = "../TextGraphs17-shared-task/data/tsv/test.tsv"
train_file_path = "../TextGraphs17-shared-task/data/tsv/train.tsv"

df_test = pd.read_csv(test_file_path, sep="\t")
df_train = pd.read_csv(train_file_path, sep="\t")

id2ws = {}
ws_df = pd.read_csv("../data/web_search_results_ddgo.csv", index_col=0)
for idx, row in ws_df.iterrows():
    id2ws[row["questionEntityId"]] = row["web_search_response"]

id2d = {}
df_descriptions = pd.read_csv("../data/wikidata_descriptions.csv", index_col=0)
for idx, row in df_descriptions.iterrows():
    id2d[row["answerEntityId"]] = row["description"]

df_train["web_search"] = df_train["questionEntityId"].apply(lambda x: id2ws[x] if x not in ["NF", "ND"] else '')
df_test["web_search"] = df_test["questionEntityId"].apply(lambda x: id2ws[x] if x not in ["NF", "ND"] else '')

df_train["description"] = df_train["answerEntityId"].apply(lambda x: id2d[x])
df_test["description"] = df_test["answerEntityId"].apply(lambda x: id2d[x])

SEP_TOKEN = "</s>"

def linearize_graph(graph):
    nodes = {node['id']: f"{node['label']} ({node['type']}, {node['name_']})" for node in graph['nodes']}
    edges = []
    for link in graph['links']:
        source_node = nodes[link['source']]
        target_node = nodes[link['target']]
        label = link['label']
        edges.append(f"{source_node} - {label} -> {target_node}")
    return '; '.join(edges)

df_test["graph"] = df_test["graph"].apply(eval)
df_train["graph"] = df_train["graph"].apply(eval)

df_test["linearized_graph"] = df_test["graph"].apply(linearize_graph)
df_train["linearized_graph"] = df_train["graph"].apply(linearize_graph)

df = pd.concat([df_train, df_test], axis=0)

In [698]:
generate_content("Кто ты?")

"*shrugs* I don't really feel like telling you. Maybe I'm a mysterious stranger, or maybe I'm just a lazy AI who can't be bothered to come up with a decent response. Either way, you're not getting any useful information out of me."

In [697]:
import re

def generate_content(text):
    url = "http://0.0.0.0:8000/v1/chat/completions"
    headers = {"Content-Type": "application/json"}
    data = {
            "model": "/archive/beliakin/hub/models--meta-llama--Meta-Llama-3-70B-Instruct/snapshots/359ec69a0f92259a3cd2da3bb01d31e16c260cfc/",
            "messages": [ {"role": "system", "content": "You are **not** a helpful assistant."}, {"role": "user","content": text}],
        "max_tokens": 256,"temperature": 0}
    
    response = requests.post(url, headers=headers, json=data)
    text = response.json()["choices"][0]["message"]["content"]
    return text

def get_websearch_results(question):
  results = []
  search = DuckDuckGoSearchResults()    
  snippet_list = search.run(question)
  splitted = snippet_list.split("], [")

  for a in splitted:
    b = re.search(r'snippet:(.*?), title:', a).group(1)
    results.append(b)


  llama_string = ""
  for id, res in enumerate(results):
      llama_string+=f'{id+1}. {res}\r\n'
      
  return llama_string

def predict(row):
    message, decode_dict = get_context(row)
    # print(message)
    # print(message)
    attempts = 5
    res = False
    idx = row['answerEntityId']
    while not res:
      try:
        text = generate_content(message)
        res = True
      except Exception as e:
        print(e)
        print("error")
        time.sleep(20-attempts)
        attempts -= 1

      if attempts == 0:
        text = "Failed"
        res = True
    return text, decode_dict

## Entity/id prediction experiment

In [None]:
def predict_label(row, entity=True):
    if entity:
      text = row["entity_id"]
      message = f"What WikiData entity does WikiDataId {text} have? Return only WikiData entity."
    else:
      text = row["entity"]
      message = f"What WikiDataID does '{text}' have? Return only WikiDataID."
    attempts = 5
    res = False
    while not res:
      try:
        text = generate_content(message)
        res = True
      except Exception as e:
        print(e)
        print("error")
        time.sleep(20-attempts)
        attempts -= 1

      if attempts == 0:
        text = "No answer"
        res = True
    return text

def predict_correct(row, entity=True):
    if entity:
      label, correct_label = "entity", "predicted_entity"
      question = f"What WikiDataID does '{label}' have? Return only WikiDataID."
    else:
      label, correct_label = "entity_id", "predicted_id"
      question = f"What WikiData entity does WikiDataId {label} have? Return only WikiData entity."
    text, correct_text = row[label], row[correct_label]
    message = f"""You should rate the candidate answer and the correct answer. Return 1 if candidate answer corresponds to correct else return 0. Return only 1 if correct else 0. \n Question: {question} \n Candidate answer: {text} \n Correct answer: {correct_text} \n Is candidate answer correct?"""
    attempts = 5
    res = False
    while not res:
      try:
        text = generate_content(message)
        res = True
      except Exception as e:
        print(e)
        print("error")
        time.sleep(20-attempts)
        attempts -= 1

      if attempts == 0:
        text = "No answer"
        res = True
    return text

In [31]:
all_entity = []
all_entity_id = []

for entity, entity_id in zip(df["answerEntity"].values, df["answerEntityId"].values):
    if ',' in entity_id:
        for e, i in zip(entity.split(","), entity_id.split(",")):
            all_entity.append(e)
            all_entity_id.append(i)
    else:
        all_entity.append(entity)
        all_entity_id.append(entity_id)

for entity, entity_id in zip(df["questionEntity"].values, df["questionEntityId"].values):
    if ',' in entity_id:
        for e, i in zip(entity.split(","), entity_id.split(",")):
            all_entity.append(e)
            all_entity_id.append(i)
    else:
        all_entity.append(entity)
        all_entity_id.append(entity_id)

data = [(e, i) for e, i in zip(all_entity, all_entity_id)]
df_ents = pd.DataFrame(data, columns=["entity", "entity_id"]).drop_duplicates("entity_id").reset_index(drop=True)

In [32]:
df_ents

Unnamed: 0,entity,entity_id
0,Ruhollah Khomeini's return to Iran,Q7293530
1,Ruhollah Khomeini's letter to Mikhail Gorbachev,Q5952984
2,Ruhollah Khomeini,Q38823
3,Office of the Supreme Leader of Iran,Q16045000
4,Mohammad Reza Pahlavi and Soraya,Q63195813
...,...,...
23278,Pide and Prejudice,Q170583
23279,Elizabeth Bennet,Q2223341
23280,older sister,Q10082670
23281,Booker Award,Q160082


In [33]:
limit = 10
df_ents_test = df_ents.iloc[:limit, :]

In [34]:
df_ents_test

Unnamed: 0,entity,entity_id
0,Ruhollah Khomeini's return to Iran,Q7293530
1,Ruhollah Khomeini's letter to Mikhail Gorbachev,Q5952984
2,Ruhollah Khomeini,Q38823
3,Office of the Supreme Leader of Iran,Q16045000
4,Mohammad Reza Pahlavi and Soraya,Q63195813
5,Mahmoud Ahmadinejad,Q34448
6,Hassan Rouhani,Q348144
7,Ayatollah Khamenei's Foreign Policy Orientation,Q57483966
8,Ayatollah Ali Khamenei speaks about Iranian pr...,Q18017728
9,Ali Khamenei bibliography,Q50815843


In [53]:
df_ents_test["predicted_entity"] = df_ents_test.progress_apply(lambda row: predict_label(row, entity=True), axis=1)
df_ents_test["predicted_id"] = df_ents_test.progress_apply(lambda row: predict_label(row, entity=False), axis=1)
df_ents_test["correct_entity"] = df_ents_test.apply(lambda row: predict_correct(row, entity=True), axis=1)
# df_ents_test["correct_id"] = df_ents_test.apply(lambda row: predict_correct(row, entity=False))

  0%|          | 0/10 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ents_test["predicted_entity"] = df_ents_test.progress_apply(lambda row: predict_label(row, entity=True), axis=1)


  0%|          | 0/10 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ents_test["predicted_id"] = df_ents_test.progress_apply(lambda row: predict_label(row, entity=False), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ents_test["correct_entity"] = df_ents_test.apply(lambda row: predict_correct(row, entity=True), axis=1)


In [54]:
df_ents_test.head(40)

Unnamed: 0,entity,entity_id,predicted_entity,predicted_id,correct_entity
0,Ruhollah Khomeini's return to Iran,Q7293530,Borjomi Municipality,Q640544,0
1,Ruhollah Khomeini's letter to Mikhail Gorbachev,Q5952984,Budapest Ferenc Liszt International Airport,Q4117189,0
2,Ruhollah Khomeini,Q38823,Budapest,Q131796,0
3,Office of the Supreme Leader of Iran,Q16045000,The WikiData entity for WikiDataId Q16045000 i...,Q1048413,0
4,Mohammad Reza Pahlavi and Soraya,Q63195813,Boris Johnson,Q211964,0
5,Mahmoud Ahmadinejad,Q34448,The WikiData entity for WikiDataId Q34448 is: ...,Q9176,0
6,Hassan Rouhani,Q348144,"Q348144 is an identifier for the entity ""Budap...",Q131741,0
7,Ayatollah Khamenei's Foreign Policy Orientation,Q57483966,Boris Johnson,Q106304634,0
8,Ayatollah Ali Khamenei speaks about Iranian pr...,Q18017728,The WikiData entity for WikiDataId Q18017728 i...,Q6641445,0
9,Ali Khamenei bibliography,Q50815843,Boris Johnson,Q106313432,0


# Predictions with numbers

### Train

In [704]:
PROMPT = """You must follow the rules before answering:
- A question and its answer options will be provided.
- The question has only one correct option.
- The correct answer is always given.
- Write only the number of the correct option.
- If you do not know the answer, write only the number of the most likely one.
- 

{additional_context}
Question: '{question}'
Options:
"""

PROMPT = """You must follow the rules before answering:
- A question and its answer options will be provided.
- The question has only one correct option.
- The correct answer is always given.
- Write only the number of the correct option in bracket: [].
- If you do not know the answer, write only the number of the most likely one in bracket: [].
{additional_context}
Question: '{question}'
Options:
"""

# PROMPT = """You must follow the rules before answering:
# - A question and its answer options will be provided.
# - The question has only one correct option.
# - The correct answer is always given.
# - Write only the WikiDataID of the correct option.
# - If you do not know the answer, write only the WikiDataID of the most likely one.
# {additional_context}
# Question: '{question}'
# Options:
# """

In [718]:
def get_context(ds, include_graph=False, include_description=True, use_web_search=True):
    additional_context = ''
    if use_web_search:
        search_results = row["web_search"].iloc[0]
        additional_context += f"\nBelow are the facts that might be relevant to answer the question:\n{search_results}\n"
        
    message = PROMPT.format(question=row["question"].iloc[0], additional_context=additional_context)
        
    decode_dict = {}
    for idx, (answer, wiki_id, graph, description) in enumerate(zip(ds['answerEntity'], ds['answerEntityId'], ds["linearized_graph"], ds["description"])):
        decode_dict[idx] = wiki_id
        data = {"answer":answer, "WikiDataID":wiki_id}
        if include_graph:
            data["wikidata_graph"] = graph
        if include_description:
            data["description"] = description
        message += str(idx) + ". " + json.dumps(data) + "\n"
    # print(message)
    return message, decode_dict

def get_answers(text):
    matches = re.findall(r'\[\d+(?:,\s*\d+)*\]', text)
    if len(matches) == 0: 
        matches = re.findall(r'\d+', text)
    preds = []
    for match in matches:
        preds.extend([int(i.replace("[",'').replace(']','')) for i in match.split(",") if i != ''])
    return preds


def get_predictions(preds, df, use_digits=True):
    if use_digits:
        preds['prediction'] = preds["prediction"].apply(get_answers)
        preds['match_dict'] = preds['match_dict'].apply(eval)
        preds['prediction'] = [[x['match_dict'][j] for j in x['prediction'] if j in x['match_dict']] for i, x in preds.iterrows()]
        
        new_df = {"prediction": [], "questionEntityId": [], "answerEntityId": []}
        for i, row in preds.iterrows():
            for item, id in row['match_dict'].items():
                new_df["answerEntityId"].append(id)
                new_df["prediction"].append(id in row['prediction'])
                new_df["questionEntityId"].append(row['questionEntityId'])
        result = pd.DataFrame(new_df).sort_values(by=["questionEntityId", "answerEntityId"])
    else:
        ans_dict = {}
        for idx, row in preds.reset_index().iterrows():
            ans_dict[row["questionEntityId"]] = row["prediction"]
        result = df.copy()
        result["prediction"] = result.apply(lambda row: 1 if row["answerEntityId"] in ans_dict[row["questionEntityId"]] else 0, axis=1)
        # df_final_preds["output"] = df_final_preds.apply(lambda row: ans_dict[row["question"]], axis=1)
    result = result.sort_values(by=["questionEntityId", "answerEntityId"]).reset_index(drop=True)
    return result

In [621]:
idx = 0
all_rows = []

for question, row in tqdm(df_train.groupby("question")):
    text, match = predict(row)
    all_rows.append({"questionEntityId":row["questionEntityId"].values[0], "prediction": text, "match_dict":match})

preds_name = "llama_70B_train_descriptions_ws_br.csv"
predictions_df = pd.DataFrame(all_rows
predictions_df.to_csv(preds_name)

  0%|          | 0/3535 [00:00<?, ?it/s]

In [712]:
preds_name = "llama_70B_train_descriptions_ws_br.csv"

predictions_df = pd.read_csv(preds_name, index_col=0)
predictions_df_v2 = pd.read_csv(preds_name, index_col=0)

In [717]:
result = get_predictions(predictions_df.copy(), df_train.copy(), use_digits=True)
df_train_sorted = df_train.sort_values(by=["questionEntityId", "answerEntityId"]).reset_index(drop=True)
print(classification_report(result['prediction'], df_train_sorted["correct"]))

              precision    recall  f1-score   support

       False       0.97      0.96      0.97     34213
        True       0.65      0.71      0.68      3459

    accuracy                           0.94     37672
   macro avg       0.81      0.83      0.82     37672
weighted avg       0.94      0.94      0.94     37672



### Predict test

In [706]:
idx = 0
all_rows = []

for question, row in tqdm(df_test.groupby("question")):
    text, match = predict(row)
    all_rows.append({"questionEntityId":row["questionEntityId"].values[0], "prediction": text, "match_dict":match})

preds_name = "llama_70B_test_descriptions_ws_br.csv"
predictions_df = pd.DataFrame(all_rows)
predictions_df.to_csv(preds_name)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [730]:
predictions_df

Unnamed: 0,questionEntityId,prediction,match_dict
0,Q1213715,[9],"{0: 'Q300370', 1: 'Q1764445', 2: 'Q772435', 3:..."
1,Q458,[None],"{0: 'Q31', 1: 'Q171348', 2: 'Q142', 3: 'Q47774..."
2,Q99416119,[7],"{0: 'Q1415970', 1: 'Q921957', 2: 'Q687559', 3:..."
3,"Q38111, Q19020",[5],"{0: 'Q1988', 1: 'Q2014', 2: 'Q2019', 3: 'Q1998..."
4,"Q34836, Q11696, Q91",[0],"{0: 'Q91', 1: 'Q11816', 2: 'Q55725954', 3: 'Q3..."
...,...,...,...
995,Q750,[7],"{0: 'Q74457482', 1: 'Q315975', 2: 'Q96139893',..."
996,"Q155223, Q131364, Q131371",[0],"{0: 'Q131371', 1: 'Q4817541', 2: 'Q17603350', ..."
997,"Q170583, Q2223341, Q10082670",[1],"{0: 'Q318399', 1: 'Q4390263', 2: 'Q4303435', 3..."
998,"Q160082, Q531194",[3],"{0: 'Q1978467', 1: 'Q387480', 2: 'Q77002510', ..."


In [731]:
preds_name = "llama_70B_test_descriptions_ws_br.csv"
predictions_df = pd.read_csv(preds_name, index_col=0)
result = get_predictions(predictions_df.copy(), df_test, use_digits=True).sort_values(by=["questionEntityId", "answerEntityId"])

In [732]:
submission_df = pd.DataFrame(data={"sample_id": df_test.sort_values(by=["questionEntityId", "answerEntityId"])['sample_id'],
                                   'prediction': result.sort_values(by=["questionEntityId", "answerEntityId"])['prediction'].astype(int)})
submission_df = submission_df.sort_values(by="sample_id")

In [726]:
submission_df.to_csv("submission_llama_70B_description_ws_br.tsv", sep="\t", index=False)

In [733]:
submission_df

Unnamed: 0,sample_id,prediction
0,0,0
1,1,1
2,2,0
3,3,0
4,4,0
...,...,...
10956,10956,0
10957,10957,0
10958,10958,0
10959,10959,0


# Prediction with WikiDataId

In [156]:
def get_context(row):
    question, answ, wiki_id = row['question'], row['answerEntity'], row['answerEntityId']
    message = f"""Please answer the following question, the answer should be a WikiDataID.
Question: '{question.iloc[0]}'
Choose the correct answer. Return only WikiDataID.\n"""

def get_context(row):
    question, answ, id = row['question'], row['answerEntity'], row['answerEntityId']
    graph = [linearize_graph(g) for g in row['graph']]
    message = f"""Please answer the following question; the answer should be a WikiDataID.
Question: '{question.iloc[0]}'
Choose the correct option and write only WikiDataID."""
    message += "\n".join([f"{a} {i}" for a, i, g in zip(answ, id, graph)])
    return message, _
#     message = f"""You must follow the rules before answering:
# - A question and its answer options will be provided.
# - There can be multiple correct options.
# - The correct answer is always given.
# - For each correct option, write only its letter.
# - If you do not know the answer, propose the most likely ones.

# Question: '{question.iloc[0]}'
# Options:
# """
    # decode_dict = {}
    # for idx, (a, i) in enumerate(zip(answ, wiki_id)):
    #     decode_dict[idx] = i
    #     data = {"answer":a, "WikiDataID":i}
    #     message += str(idx) + ". " + json.dumps(data) + "\n"
    # return message, decode_dict

In [173]:
df_predictions = df_test[['question', 'answerEntity', 'answerEntityId', "graph"]].groupby('question').progress_apply(predict)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [174]:
df_predictions.to_csv("llama_70B_base_prediction_test.csv")

### Aggregate

In [181]:
ans_dict = {}
for idx, answer in df_predictions.reset_index().iterrows():
    ans_dict[answer["question"]] = answer[0]

In [182]:
df_final_preds = df_test.copy()
df_final_preds["prediction"] = df_final_preds.apply(lambda row: 1 if row["answerEntityId"] in ans_dict[row["question"]] else 0, axis=1)
df_final_preds["output"] = df_final_preds.apply(lambda row: ans_dict[row["question"]], axis=1)

In [166]:
len(df[df_final_preds["prediction"] == df_train['correct']])/len(df)

  len(df[df_final_preds["prediction"] == df_train['correct']])/len(df)


0.9394444101741616

In [59]:
df_final_preds.head(1)

Unnamed: 0,sample_id,question,questionEntity,answerEntity,groundTruthAnswerEntity,answerEntityId,questionEntityId,groundTruthAnswerEntityId,correct,graph,linearized_graph,prediction,output
0,0,Whst is the name of the head of state and high...,Iran,Ruhollah Khomeini's return to Iran,Office of the Supreme Leader of Iran,Q7293530,Q794,Q16045000,False,"{'nodes': [{'type': 'QUESTIONS_ENTITY', 'name_...","Iran (QUESTIONS_ENTITY, Q794) - country -> Ira...",0,"The correct answer is:\n\n10. {""answer"": ""Ali ..."


In [183]:
submission_df = pd.DataFrame(data={"sample_id": df_final_preds['sample_id'], 'prediction': df_final_preds['prediction']})
submission_df

Unnamed: 0,sample_id,prediction
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
10956,10956,0
10957,10957,0
10958,10958,0
10959,10959,1


In [138]:
sum(submission_df_1["prediction"] == submission_df["prediction"])

10959

In [184]:
submission_df.prediction.sum()

1008

In [185]:
submission_df.to_csv("submission_test_old_prompt.tsv", sep="\t", index=False)

In [328]:
from sklearn.metrics import classification_report

In [330]:
# df_final_preds["gt_values"] = (df_final_preds["answerEntityId"] == df_final_preds["groundTruthAnswerEntityId"]).astype(int)

In [87]:
df_final_preds.to_csv("preds.csv")

In [169]:
df_metrics_correct = df_final_preds[(df_final_preds["answerEntityId"] == df_final_preds["groundTruthAnswerEntityId"])]

In [338]:
print(classification_report(df_train["correct"].values, result["prediction"].values))

ValueError: Found input variables with inconsistent numbers of samples: [37672, 10961]

In [66]:
df_failed = df_final_preds[df_final_preds["gt_values"] != df_final_preds["prediction"]]

In [69]:
df_failed.groupby("question").apply(lambda ds: if 1 in ds.is_correct.values)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fd929aecd50>