# Convert the parser output to the format that the state machine Docker image can read (for evaluation purposes)

In [11]:
import json
from pathlib import Path

data_dir = Path("../data")
# data_dir = Path(".").absolute().parent

# Load the parser predictions from a JSON Lines file (one JSON object per line).
# Iterating over the file handle replaces the manual readline() loop, and
# whitespace-only lines are skipped: readline() returns "\n" (not "") for a
# blank line, so the previous loop would have passed it to json.loads and crashed.
# JSON text is read explicitly as UTF-8 rather than the platform default encoding.
with open(f"{data_dir}/nrl_parser_output_qasrl_gs_test.jsonl", encoding="utf-8") as f:
    items = [json.loads(line) for line in f if line.strip()]

# Number of loaded records (displayed as the cell output).
len(items)

999

In [16]:
# Sanity check: display the absolute location that the relative data_dir
# resolves to from the notebook's current working directory.
data_dir.absolute()

PosixPath('/home/nlp/kleinay/Parsing/Seq2Seq_QASRL_Parsing/qasrl_bart/notebooks/../data')

In [12]:
from typing import Tuple
import pandas as pd
from dataclasses import dataclass
from dataclasses_json import dataclass_json

@dataclass_json
@dataclass
class DataRow:
    """One question/answer row of the CSV file that is fed to the state machine.

    Each instance corresponds to a single QA pair of a single verb in a single
    sentence of the parser output; `to_dict()` (added by `dataclass_json`) is
    used to turn the rows into a DataFrame.
    """
    qasrl_id: str  # sentence identifier, built as "sentence_<index in the input file>"
    verb_idx: int  # the verb's 'index' value from the parser output, cast to int
    verb: str  # the verb's 'verb' value from the parser output
    question: str  # the question text of this QA pair
    answer: str  # answer span texts, joined with ANSWER_SEPARATOR when there are several
    answer_range: str  # e.g., 12:24 (several ranges are joined with ANSWER_SEPARATOR)
    sentence: str  # the sentence's words joined with single spaces
    
# Delimiter used to pack several answers (and their ranges) of one question
# into a single CSV cell.
ANSWER_SEPARATOR = "~!~"

def convert_answer_span_to_csv_format(answer_span: dict) -> dict:
    """Convert one parser answer span into the fields used by the CSV format.

    Args:
        answer_span: a span dict with the keys 'text', 'start' and 'end'.

    Returns:
        A dict with two string values:
        - "answer": the span text, unchanged;
        - "answer_range": "<start>:<end + 1>".

    Raises:
        KeyError: if 'text', 'start' or 'end' is missing from `answer_span`.
    """
    start = answer_span['start']
    # The parser's end index is inclusive (a one-token span is i:i), while the
    # CSV format uses an exclusive end (i:i+1), hence the +1.
    end_exclusive = answer_span['end'] + 1
    return {
        "answer": answer_span['text'],
        "answer_range": f"{start}:{end_exclusive}",
    }


# Flatten the nested parser output (sentence -> verbs -> QA pairs -> spans)
# into one DataRow per QA pair.
data = []
for item_i, item in enumerate(items):
    sentence = " ".join(item['words'])
    # The id depends only on the sentence's position in the input file, so it
    # is computed once per sentence rather than once per QA pair.
    qasrl_id = f"sentence_{item_i}"
    for verb_item in item['verbs']:
        verb = verb_item['verb']
        verb_idx = int(verb_item['index'])
        for qa_pair in verb_item['qa_pairs']:
            question = qa_pair['question']
            # Plain lists replace the throwaway per-question DataFrame. Besides
            # being cheaper, this no longer raises KeyError for a question with
            # no spans; such a question yields empty answer/answer_range strings.
            csv_spans = [convert_answer_span_to_csv_format(span) for span in qa_pair['spans']]
            answer = ANSWER_SEPARATOR.join(csv_span['answer'] for csv_span in csv_spans)
            answer_range = ANSWER_SEPARATOR.join(csv_span['answer_range'] for csv_span in csv_spans)

            data.append(DataRow(qasrl_id, verb_idx, verb, question, answer, answer_range, sentence))

In [13]:
# Turn the collected rows into a table (one column per DataRow field) and
# write it, without the index column, as the state machine's input CSV.
input_df = pd.DataFrame([row.to_dict() for row in data])
input_df.to_csv(
    f"{data_dir}/nrl_state_machine_input_file.csv",
    index=False,
)

### Run state machine

In [18]:
# Run the state machine image with ../data (relative to the current working
# directory) mounted as /data inside the container. The arguments are the mode
# "file", the input CSV and the output CSV; presumably the image reads the
# former and writes the latter -- confirm against the image's documentation.
!docker run -it -v "$(pwd)/../data/:/data" --rm --name qasrl hirscheran/qasrl_state_machine_example "file" "/data/nrl_state_machine_input_file.csv" "/data/nrl_output_file.csv"

[34m[1/38] qasrl-state-machine-example.mainClass [39m
[1A[2K[9999D[34m[3/38] mill.scalalib.ZincWorkerModule.worker [39m
[1A[2K[9999D[34m[5/38] qasrl-state-machine-example.sources [39m
[1A[2K[9999D[34m[10/38] qasrl-state-machine-example.resources [39m
[1A[2K[9999D[34m[28/38] qasrl-state-machine-example.finalMainClassOpt [39m
[1A[2K[9999D[34m[29/38] qasrl-state-machine-example.finalMainClass [39m
[1A[2K[9999D[34m[30/38] qasrl-state-machine-example.localClasspath [39m
[1A[2K[9999D[34m[31/38] qasrl-state-machine-example.runIvyDeps [39m
[1A[2K[9999D[34m[32/38] qasrl-state-machine-example.resolvedRunIvyDeps [39m
[1A[2K[9999D[34m[32/38] qasrl-state-machine-example.resolvedRunIvyDeps | Downloading [1/1] artifacts (~0/0 bytes)[39m
[1A[2K[9999D[34m[32/38] qasrl-state-machine-example.resolvedRunIvyDeps | Downloading [2/2] artifacts (~0/0 bytes)[39m
[1A[2K[9999D[34m[32/38] qasrl-state-machine-example.resolvedRunIvyDeps | Downloading [3/3] art

### Add sentence to output_file

In [20]:
import pandas as pd

# Read back the CSV produced by the state machine run.
output_df = pd.read_csv(f"{data_dir}/nrl_output_file.csv")

# Lookup table qasrl_id -> sentence, taken from the input table. For a
# repeated id the last row wins; all rows of a sentence carry the same text.
sent_id2sent = {
    sent_id: sent
    for sent_id, sent in zip(input_df["qasrl_id"], input_df["sentence"])
}
# Attach the sentence to every output row; an id that is not in the lookup
# table raises KeyError.
output_df["sentence"] = output_df["qasrl_id"].apply(sent_id2sent.__getitem__)

output_df.to_csv(
    f"{data_dir}/nrl_output_file_with_sentence.csv",
    index=False,
)

In [None]:
!cp data/nrl_output_file_with_sentence.csv nrl_parser_output_qasrl_gs_test.csv 