In [1]:
import ast
import copy
import json
import random
import string
from collections import Counter
from pathlib import Path

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

import pandas as pd
from pandas import DataFrame

from scipy.stats import skew

In [2]:
# Directory holding the raw RadQA split files, relative to the notebook's working directory.
radqa_dir = Path('../data/raw/radqa/')

# One JSON file per split inside that directory.
radqa_train_path = radqa_dir.joinpath('train.json')
radqa_dev_path = radqa_dir.joinpath('dev.json')
radqa_test_path = radqa_dir.joinpath('test.json')

In [3]:
# Parse each split inside a `with` block so the file handle is closed as soon
# as the JSON has been read (a bare `json.load(open(...))` leaves it open).
with open(radqa_train_path) as f:
    radqa_train = json.load(f)
with open(radqa_dev_path) as f:
    radqa_dev = json.load(f)
with open(radqa_test_path) as f:
    radqa_test = json.load(f)

# Each split is a dict whose 'data' entry is a list; report its length per split.
print('The number of paragraphs/documents in the training set is', len(radqa_train['data']))
print('The number of paragraphs/documents in the dev set is', len(radqa_dev['data']))
print('The number of paragraphs/documents in the test set is', len(radqa_test['data']))

The number of paragraphs/documents in the training set is 803
The number of paragraphs/documents in the dev set is 102
The number of paragraphs/documents in the test set is 104


In [4]:
# Count question-answer entries per raw split by summing the length of every
# paragraph's 'qas' list across all items under 'data'.
print(
    'The number of qa pairs in the training set is',
    sum(len(para['qas']) for doc in radqa_train['data'] for para in doc['paragraphs']),
)
print(
    'The number of qa pairs in the dev set is',
    sum(len(para['qas']) for doc in radqa_dev['data'] for para in doc['paragraphs']),
)
print(
    'The number of qa pairs in the test set is',
    sum(len(para['qas']) for doc in radqa_test['data'] for para in doc['paragraphs']),
)

The number of qa pairs in the training set is 4878
The number of qa pairs in the dev set is 656
The number of qa pairs in the test set is 614


In [6]:
def process_string(s):
    """Normalise the spacing and line breaks of a text string.

    Steps, in order:
      1. Double the newline in every ``'\\n _'`` and ``'_\\n '`` sequence so that
         line breaks next to an underscore survive step 2 (presumably these
         are underscore separator lines in the reports; verify against data).
      2. Replace every remaining ``'\\n '`` (newline followed by a space) with a
         single space, joining wrapped lines.
      3. Split on the space character and drop empty pieces, which collapses
         runs of spaces and removes leading/trailing spaces.
      4. Re-join with single spaces and remove the space that follows any
         newline.

    Args:
        s: The string to clean.

    Returns:
        The cleaned string.
    """
    # Protect line breaks adjacent to underscores before unwrapping lines.
    protected = s.replace('\n _', '\n\n _')
    protected = protected.replace('_\n ', '_\n\n ')

    # Unwrap: a newline followed by a space is treated as a soft line break.
    unwrapped = protected.replace('\n ', ' ')

    # Only the space character separates pieces; empty pieces come from
    # consecutive, leading, or trailing spaces and are discarded.
    pieces = filter(None, unwrapped.split(' '))

    return ' '.join(pieces).replace('\n ', '\n')

In [8]:
def get_qs_processed(radqa_data):
    """Return a cleaned copy of a RadQA split with recomputed answer offsets.

    The input is walked as ``data -> paragraphs -> qas -> answers``. Every
    paragraph context and every answer text is passed through
    ``process_string``; each answer's ``answer_start`` is then recomputed as
    the index of the cleaned answer text inside the cleaned context.

    Paragraphs that end up with no question entries, and items that end up
    with no paragraphs, are left out of the result. The input is not modified.

    Args:
        radqa_data: Dict with a ``'data'`` list. Each item has ``'title'`` and
            ``'paragraphs'``; each paragraph has ``'context'``, ``'qas'`` and
            ``'document_id'``; each qa has ``'id'``, ``'question'``,
            ``'answers'`` and ``'is_impossible'``; each answer has
            ``'answer_id'`` and ``'text'``.

    Returns:
        ``{'data': [...]}`` with the same nesting as the input.

    Raises:
        AssertionError: If a cleaned answer text is not a substring of its
            cleaned context.
        KeyError: If any of the keys listed above is missing.
    """

    def _convert_answer(raw_answer, context):
        # Build one cleaned answer dict; the answer must still occur in the context.
        answer_id = raw_answer['answer_id']
        text = process_string(raw_answer['text'])
        assert text in context
        # NOTE(review): the answer's original 'answer_start' is ignored and the
        # offset is the FIRST occurrence of the cleaned text. If the same text
        # appears earlier in the context than the annotated span, this offset
        # will point at the earlier occurrence — confirm that is acceptable.
        return {
            'answer_id': answer_id,
            'text': text,
            'answer_start': context.index(text),
        }

    def _convert_qa(raw_qa, context):
        # Build one cleaned question entry; unanswerable questions are kept as-is.
        qa_id = raw_qa['id']
        question = raw_qa['question']
        converted_answers = [
            _convert_answer(raw_answer, context) for raw_answer in raw_qa['answers']
        ]
        return {
            'id': qa_id,
            'question': question,
            'answers': converted_answers,
            'is_impossible': raw_qa['is_impossible'],
        }

    cleaned_documents = []
    for document in radqa_data['data']:
        title = document['title']
        raw_paragraphs = document['paragraphs']

        kept_paragraphs = []
        for raw_paragraph in raw_paragraphs:
            context = process_string(raw_paragraph['context'])
            raw_qas = raw_paragraph['qas']
            document_id = raw_paragraph['document_id']

            converted_qas = [_convert_qa(raw_qa, context) for raw_qa in raw_qas]
            if not converted_qas:
                # Nothing to ask about this paragraph, so it is not emitted.
                continue

            kept_paragraphs.append({
                'context': context,
                'qas': converted_qas,
                'document_id': document_id,
            })

        if not kept_paragraphs:
            continue

        cleaned_documents.append({
            'title': title,
            'paragraphs': kept_paragraphs,
        })

    return {'data': cleaned_documents}



In [None]:
radqa_train_processed = get_qs_processed(radqa_train)

# Shuffle the training items with a fixed seed so the saved order is
# reproducible from a fresh kernel.
random_seed = 42

# Work on a copy so the list returned by get_qs_processed is not reordered in place.
radqa_train_shuffle = copy.deepcopy(radqa_train_processed["data"])

# random.shuffle reorders the list in place using the module-level generator
# seeded on the line before it.
random.seed(random_seed)
random.shuffle(radqa_train_shuffle)

# Re-wrap the shuffled list in the same {'data': [...]} shape as the other splits.
radqa_train_processed = {"data": radqa_train_shuffle}

In [10]:
# Clean the dev and test splits with the same routine used for the training
# split; these two are kept in their original order.
radqa_dev_processed, radqa_test_processed = (
    get_qs_processed(radqa_dev),
    get_qs_processed(radqa_test),
)


In [11]:
# Re-count question-answer entries on the processed splits; matching the raw
# counts shows that processing did not drop any entries.
print(
    'The number of qa pairs in the training set is',
    sum(len(para['qas']) for doc in radqa_train_processed['data'] for para in doc['paragraphs']),
)
print(
    'The number of qa pairs in the dev set is',
    sum(len(para['qas']) for doc in radqa_dev_processed['data'] for para in doc['paragraphs']),
)
print(
    'The number of qa pairs in the test set is',
    sum(len(para['qas']) for doc in radqa_test_processed['data'] for para in doc['paragraphs']),
)

The number of qa pairs in the training set is 4878
The number of qa pairs in the dev set is 656
The number of qa pairs in the test set is 614


In [14]:
# Break the raw test split down by the 'is_impossible' flag on each question entry.
print(
    'The number of qa pairs in the test set is',
    sum(len(para['qas']) for doc in radqa_test['data'] for para in doc['paragraphs']),
)
print(
    'The number of answerable qa pairs in the test set is',
    sum(
        1
        for doc in radqa_test['data']
        for para in doc['paragraphs']
        for qa in para['qas']
        if not qa['is_impossible']
    ),
)
print(
    'The number of non-answerable qa pairs in the test set is',
    sum(
        1
        for doc in radqa_test['data']
        for para in doc['paragraphs']
        for qa in para['qas']
        if qa['is_impossible']
    ),
)

The number of qa pairs in the test set is 614
The number of answerable qa pairs in the test set is 460
The number of non-answerable qa pairs in the test set is 154


In [15]:
# Directory that receives the processed splits, relative to the notebook's working directory.
saved_data_dir = Path('../data/modified/radqa/')

# Create the directory (and any missing parents) up front; without this the
# first open(..., 'w') raises FileNotFoundError when the directory is absent.
saved_data_dir.mkdir(parents=True, exist_ok=True)

train_output_path = saved_data_dir / 'train_processed.json'
dev_output_path = saved_data_dir / 'dev_processed.json'
test_output_path = saved_data_dir / 'test_processed.json'

# Write each processed split as JSON; existing files at these paths are overwritten.
with open(train_output_path, 'w') as f:
    json.dump(radqa_train_processed, f)

with open(dev_output_path, 'w') as f:
    json.dump(radqa_dev_processed, f)

with open(test_output_path, 'w') as f:
    json.dump(radqa_test_processed, f)