# Convert ReCoRD dataset into SQuAD format

## SQuAD format

In [25]:
import os
# Directory holding the SQuAD files (relative to this notebook); list its contents
squad_dir = "../../data/SQuAD"
os.listdir(squad_dir)

['train-v1.1.json', 'dev-v1.1.json', 'cached_train_bert-base-cased_384']

In [8]:
import json
# Load the SQuAD v1.1 train/dev splits. Context managers close the file handles
# deterministically, and the explicit encoding avoids depending on the platform default.
with open(os.path.join(squad_dir, "train-v1.1.json"), encoding="utf-8") as train_file:
    squad_train = json.load(train_file)
with open(os.path.join(squad_dir, "dev-v1.1.json"), encoding="utf-8") as dev_file:
    squad_dev = json.load(dev_file)

In [39]:
# Top-level keys of the SQuAD JSON together with its version string
squad_train.keys(), squad_train['version']

(dict_keys(['data', 'version']), '1.1')

In [12]:
# The 'data' field holds the dataset entries; count them
squad_train_data = squad_train['data']
len(squad_train_data) # is a list

442

In [13]:
# First entry of the training data; `stde` is a short alias for squad_train_data_example
stde = squad_train_data_example = squad_train_data[0]
stde.keys()

dict_keys(['title', 'paragraphs'])

In [15]:
# Title of the example entry
stde['title']

'University_of_Notre_Dame'

In [17]:
# Number of paragraphs in the example entry
len(stde['paragraphs'])

55

In [18]:
# First paragraph of the example entry; `stdepe` is a short alias for stde_paragraphs_example
stde_paragraphs = stde['paragraphs']
stdepe = stde_paragraphs_example = stde_paragraphs[0]
stdepe.keys()

dict_keys(['context', 'qas'])

In [19]:
# Passage text of the example paragraph
stdepe['context']

'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'

In [20]:
# Questions attached to the example paragraph: each has an 'id', a 'question'
# and a list of 'answers' carrying 'answer_start' and 'text'
stdepe['qas']

[{'answers': [{'answer_start': 515, 'text': 'Saint Bernadette Soubirous'}],
  'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
  'id': '5733be284776f41900661182'},
 {'answers': [{'answer_start': 188, 'text': 'a copper statue of Christ'}],
  'question': 'What is in front of the Notre Dame Main Building?',
  'id': '5733be284776f4190066117f'},
 {'answers': [{'answer_start': 279, 'text': 'the Main Building'}],
  'question': 'The Basilica of the Sacred heart at Notre Dame is beside to which structure?',
  'id': '5733be284776f41900661180'},
 {'answers': [{'answer_start': 381,
    'text': 'a Marian place of prayer and reflection'}],
  'question': 'What is the Grotto at Notre Dame?',
  'id': '5733be284776f41900661181'},
 {'answers': [{'answer_start': 92,
    'text': 'a golden statue of the Virgin Mary'}],
  'question': 'What sits on top of the Main Building at Notre Dame?',
  'id': '5733be284776f4190066117e'}]

## ReCoRD Format

In [26]:
# Directory holding the ReCoRD files (relative to this notebook); list its contents
record_dir = "../../data/record-sg"
os.listdir(record_dir)

['ReCoRD_CNN_origin',
 'test.jsonl',
 'val.jsonl',
 'train.jsonl',
 'ReCoRD_Daily_origin']

In [29]:
# ReCoRD ships as JSON Lines: one JSON object per line. Context managers close the
# files, and blank lines (e.g. a trailing newline at EOF) are skipped instead of
# crashing json.loads.
with open(os.path.join(record_dir, "train.jsonl"), encoding="utf-8") as train_file:
    record_train = [json.loads(line) for line in train_file if line.strip()]
with open(os.path.join(record_dir, "val.jsonl"), encoding="utf-8") as dev_file:
    record_dev = [json.loads(line) for line in dev_file if line.strip()]
len(record_train), len(record_dev)

(37286, 4164)

In [30]:
# First ReCoRD training example and its top-level keys
record_train_example = record_train[0]
record_train_example.keys()

dict_keys(['source', 'passage', 'qas', 'idx'])

In [35]:
# Source name, example index, and the keys of the passage dict
record_train_example['source'], record_train_example['idx'], record_train_example['passage'].keys()

('Daily mail', 0, dict_keys(['text', 'entities']))

In [36]:
# Full passage dict: the 'text' plus a list of 'entities' given as 'start'/'end' spans
record_train_example['passage']

{'text': "The harrowing stories of women and children locked up for so-called 'moral crimes' in Afghanistan's notorious female prison have been revealed after cameras were allowed inside. Mariam has been in Badam Bagh prison for three months after she shot a man who just raped her at gunpoint and then turned the weapon on herself - but she has yet to been charged. Nuria has eight months left to serve of her sentence for trying to divorce her husband. She gave birth in prison to her son and they share a cell together. Scroll down for video Nuria was jailed for trying to divorce her husband. Her son is one of 62 children living at Badam Bagh prison\n@highlight\nMost of the 202 Badam Bagh inmates are jailed for so-called 'moral crimes'\n@highlight\nCrimes include leaving their husbands or refusing an arrange marriage\n@highlight\n62 children live there and share cells with their mothers and five others",
 'entities': [{'start': 86, 'end': 96},
  {'start': 178, 'end': 183},
  {'start': 197

In [37]:
# Queries for this passage: each has a 'query' containing an '@placeholder' token,
# a list of 'answers' with 'start', 'end' and 'text', and its own 'idx'
record_train_example['qas']

[{'query': 'The baby she gave birth to is her husbands and he has even offered to have the courts set her free if she returns, but @placeholder has refused.',
  'answers': [{'start': 535, 'end': 539, 'text': 'Nuria'}],
  'idx': 0}]

## ReCoRD -> SQuAD

In [44]:
def record2squad(example, isTrain=False):
    """Convert one ReCoRD example into a SQuAD-style entry.

    Parameters
    ----------
    example : dict
        A ReCoRD record with at least 'source', 'passage' (containing 'text')
        and 'qas'. Each item of 'qas' has a 'query' and, when labelled, a list
        of 'answers' whose items carry 'start' and 'text'. Optional 'idx'
        fields on the example and on each query are used to build question ids.
    isTrain : bool, default False
        When True only the first answer of each query is kept; otherwise
        every answer is kept.

    Returns
    -------
    dict
        ``{"title": ..., "paragraphs": [{"context": ..., "qas": [...]}]}``
        where each qa has 'id', 'question' and 'answers' (a list of
        ``{"answer_start", "text"}`` dicts, possibly empty).
    """
    # Falls back to 0 when the example has no 'idx'; ids are then only unique
    # within this example.
    example_idx = example.get('idx', 0)

    def convert_answer(answer):
        # SQuAD only stores the start offset and the surface text.
        return {
            "answer_start": answer['start'],
            "text": answer['text'],
        }

    def convert_qa(position, qa):
        # Unlabelled queries may have a missing or empty 'answers' list;
        # treat both as "no answers" instead of raising.
        answers = qa.get('answers') or []
        if isTrain:
            answers = answers[:1]
        return {
            # SQuAD questions carry an 'id'; combine the example and query
            # indices, using the query's position if it has no 'idx'.
            "id": "{}-{}".format(example_idx, qa.get('idx', position)),
            "question": qa['query'],
            "answers": [convert_answer(answer) for answer in answers],
        }

    para = {
        "context": example['passage']['text'],
        "qas": [convert_qa(position, qa) for position, qa in enumerate(example['qas'])],
    }
    data = {
        "title": "From " + example['source'],
        "paragraphs": [para],
    }
    return data
# Show the original ReCoRD example next to its SQuAD-style conversion
record_train_example, record2squad(record_train_example, isTrain=True)

({'source': 'Daily mail',
  'passage': {'text': "The harrowing stories of women and children locked up for so-called 'moral crimes' in Afghanistan's notorious female prison have been revealed after cameras were allowed inside. Mariam has been in Badam Bagh prison for three months after she shot a man who just raped her at gunpoint and then turned the weapon on herself - but she has yet to been charged. Nuria has eight months left to serve of her sentence for trying to divorce her husband. She gave birth in prison to her son and they share a cell together. Scroll down for video Nuria was jailed for trying to divorce her husband. Her son is one of 62 children living at Badam Bagh prison\n@highlight\nMost of the 202 Badam Bagh inmates are jailed for so-called 'moral crimes'\n@highlight\nCrimes include leaving their husbands or refusing an arrange marriage\n@highlight\n62 children live there and share cells with their mothers and five others",
   'entities': [{'start': 86, 'end': 96},
    

In [None]:
target_dir = "../../data/record-squad"
# exist_ok makes this idempotent and avoids a separate exists-then-create check
os.makedirs(target_dir, exist_ok=True)

# Mirror the SQuAD top-level layout seen above: a 'version' string plus a 'data' list.
# NOTE(review): "1.1" is copied from the SQuAD files this output imitates; confirm
# that the downstream reader expects this value.
# Training keeps one answer per question; dev keeps every reference answer.
new_train = {
    "version": "1.1",
    "data": [record2squad(example, isTrain=True) for example in record_train],
}
new_dev = {
    "version": "1.1",
    "data": [record2squad(example) for example in record_dev],
}

# Context managers guarantee the JSON is flushed and the handles are closed
with open(os.path.join(target_dir, "train.json"), "w") as train_out:
    json.dump(new_train, train_out)
with open(os.path.join(target_dir, "dev.json"), "w") as dev_out:
    json.dump(new_dev, dev_out)