In [1]:
from pathlib import Path

import pandas as pd

*Loading data*

In [2]:
# Single place to point the notebook at a local copy of the TechQA
# "training_and_dev" folder; edit this one line when the data lives elsewhere.
DATA_DIR = Path('D:\\Downloads\\TechQA\\TechQA\\training_and_dev')

# Question/answer annotations for the training and dev splits.
train_qa = pd.read_json(DATA_DIR / 'training_Q_A.json')
dev_qa = pd.read_json(DATA_DIR / 'dev_Q_A.json')
# Technote corpus and its per-section breakdown. orient='index' turns every
# top-level JSON key into one row of the resulting frame.
train_corpus = pd.read_json(DATA_DIR / 'training_dev_technotes.json', orient='index')
train_sections = pd.read_json(DATA_DIR / 'training_dev_technotes.sections.json', orient='index')

In [3]:
# Keep only the training questions that are answerable and carry a non-empty
# answer, join the technote that contains the answer, and retain the columns
# used downstream ('title' comes from the technote corpus).
train = (
    train_qa
    .query('`ANSWERABLE` == "Y" and `ANSWER` != ""')
    .merge(train_corpus, how='left', left_on='DOCUMENT', right_on='id')
    [[
        'QUESTION_TITLE',
        'QUESTION_TEXT',
        'DOCUMENT',
        'ANSWER',
        'START_OFFSET',
        'END_OFFSET',
        'title',
    ]]
)

In [4]:
# Same selection as for the training questions, applied to the dev questions:
# answerable rows with a non-empty answer, joined with their technote.
dev = (
    dev_qa
    .query('`ANSWERABLE` == "Y" and `ANSWER` != ""')
    .merge(train_corpus, how='left', left_on='DOCUMENT', right_on='id')
    [[
        'QUESTION_TITLE',
        'QUESTION_TEXT',
        'DOCUMENT',
        'ANSWER',
        'START_OFFSET',
        'END_OFFSET',
        'title',
    ]]
)

*Merging train and dev sets*

In [5]:
# Remember which split every row came from, then stack both frames under a
# fresh 0..n-1 index.
train = train.assign(split="train")
dev = dev.assign(split="dev")

techqa = pd.concat([train, dev], ignore_index=True)

In [6]:
techqa = (
    techqa
    # first drop rows repeating the same (answer, title, question text) ...
    .drop_duplicates(subset=['ANSWER', "title", "QUESTION_TEXT"])
    # ... then rows repeating the same question (text + title)
    .drop_duplicates(subset=['QUESTION_TEXT', 'QUESTION_TITLE'])
    # offsets are compared numerically later on, so force them to int
    .assign(
        START_OFFSET=lambda frame: frame.START_OFFSET.apply(int),
        END_OFFSET=lambda frame: frame.END_OFFSET.apply(int),
    )
)

*Filtering out questions with duplicated document titles*

In [7]:
# Titles whose rows are removed from the data set. They are matched as plain
# substrings (regex=False): with the default regex matching, the "." in
# "V9.0" / "V8.0" would act as a wildcard instead of a literal dot.
EXCLUDED_TITLE_FRAGMENTS = (
    "IBM Action required for IBM Integration Bus Hypervisor Edition V9.0 and WebSphere Message Broker Hypervisor Edition V8.0 for security vulnerabilities in Red Hat Linux",
    "IBM Security Bulletin:  Multiple vulnerabilities in IBM Java Runtime affect API Connect - United States",
)
for title_fragment in EXCLUDED_TITLE_FRAGMENTS:
    # keep every row whose title does NOT contain the fragment
    techqa = techqa[~techqa['title'].str.contains(title_fragment, regex=False)]

*Merging overlapping answer chunks*

In [8]:
def find_chunk_overlaps(df: pd.DataFrame):
    """Find answer spans of the same document that overlap each other.

    Only documents referenced by more than one row and having more than one
    distinct ``ANSWER`` are inspected. Within such a document the distinct
    ``(START_OFFSET, END_OFFSET)`` spans are sorted and swept once; every run
    of spans that overlap (directly or through a chain of other spans) is
    merged into a single span covering the whole run.

    Two spans overlap when each one starts strictly before the other ends,
    i.e. spans that merely touch (``end == start``) are kept apart. This is
    the behaviour of right-closed ``pd.Interval`` objects.

    Args:
        df: frame with ``DOCUMENT``, ``ANSWER``, ``START_OFFSET`` and
            ``END_OFFSET`` columns.

    Returns:
        ``{document_id: [{(old_start, old_end): (new_start, new_end)}, ...]}``
        with one single-entry dict per span that belongs to a merged run
        (including spans that already equal the merged span). Documents
        without overlaps are absent, so the result is empty when nothing
        overlaps.
    """
    id2change = {}

    def record_group(didx, members, span):
        # A run with a single span has nothing to merge.
        if len(members) > 1:
            id2change.setdefault(didx, []).extend({old: span} for old in members)

    # documents with several queries
    doc_counts = df.DOCUMENT.value_counts()
    for didx in doc_counts[doc_counts != 1].index:
        doc_rows = df[df.DOCUMENT == didx]
        # all queries point at the same answer text: nothing to reconcile
        if doc_rows.ANSWER.nunique() == 1:
            continue
        # distinct (start, end) spans, ordered by start offset
        intervals = sorted({
            tuple(el)
            for el in doc_rows[['START_OFFSET', 'END_OFFSET']].values.tolist()
        })
        members = []
        group_start = group_end = None
        for start, end in intervals:
            if members and start < group_end and group_start < end:
                # overlaps the running union: extend the current run
                members.append((start, end))
                group_end = max(group_end, end)
            else:
                record_group(didx, members, (group_start, group_end))
                members = [(start, end)]
                group_start, group_end = start, end
        record_group(didx, members, (group_start, group_end))
    return id2change


def change_overlapping_chunks(df: pd.DataFrame, id2change: dict):
    """Rewrite the offsets of overlapping answer spans to their merged span.

    Every row of ``df`` whose ``(DOCUMENT, START_OFFSET, END_OFFSET)`` matches
    an old span listed in ``id2change`` gets the new offsets. All matching
    rows are rewritten, not just the first one, so several questions that
    share one answer span stay consistent. Only the two offset columns are
    changed; the other columns (e.g. ``ANSWER``) are copied as they are.

    Args:
        df: frame with ``DOCUMENT``, ``START_OFFSET`` and ``END_OFFSET``
            columns. Index labels are assumed to be unique.
        id2change: mapping as produced by ``find_chunk_overlaps``:
            ``{document_id: [{(old_start, old_end): (new_start, new_end)}]}``.

    Returns:
        A new frame with a fresh 0..n-1 index: the untouched rows in their
        original order, followed by the rewritten rows.

    Raises:
        IndexError: if a span listed in ``id2change`` matches no row of ``df``.
    """
    changed_idx = []
    new_starts = []
    new_ends = []
    already_changed = set()
    # iterate dict {didx: [{(start_old, end_old): (start_new, end_new)}]}
    for didx, id_dicts in id2change.items():
        in_doc = df['DOCUMENT'] == didx
        for id_d in id_dicts:
            for (start, end), (new_start, new_end) in id_d.items():
                hits = df.index[
                    in_doc
                    & (df['START_OFFSET'] == start)
                    & (df['END_OFFSET'] == end)
                ]
                if len(hits) == 0:
                    raise IndexError(
                        f'no row for document {didx!r} with span ({start}, {end})'
                    )
                for i in hits:
                    # a row is rewritten at most once, even if listed twice
                    if i in already_changed:
                        continue
                    already_changed.add(i)
                    changed_idx.append(i)
                    new_starts.append(new_start)
                    new_ends.append(new_end)

    if not changed_idx:
        return df.reset_index(drop=True)

    updated = df.loc[changed_idx].copy()
    updated['START_OFFSET'] = new_starts
    updated['END_OFFSET'] = new_ends
    untouched = df.drop(index=changed_idx)
    return pd.concat([untouched, updated]).reset_index(drop=True)

In [9]:
# Merge overlapping answer spans until none are left. A merged span can itself
# overlap another span of the same document, so a fixed number of passes is not
# guaranteed to be enough; repeat until find_chunk_overlaps reports nothing.
no_overlap_techqa = techqa.reset_index(drop=True)
idx2change = find_chunk_overlaps(no_overlap_techqa)
while idx2change:
    no_overlap_techqa = change_overlapping_chunks(no_overlap_techqa, idx2change)
    idx2change = find_chunk_overlaps(no_overlap_techqa)

In [10]:
# Sanity check: once the overlapping answer spans have been merged, no document
# should report an overlap any more, so this is expected to display an empty dict.
find_chunk_overlaps(no_overlap_techqa)

{}

*Finding no-answer chunks from the same documents*

In [11]:
import numpy as np

def find_no_answer_chunks(df: pd.DataFrame, technotes: pd.DataFrame, min_length: int = 100):
    """Append document sections that contain no answer as extra rows.

    For every document referenced in ``df`` the sections listed in
    ``technotes`` are scanned. A section becomes a new row when it

    * is longer than ``min_length`` (``end - start > min_length``),
    * overlaps none of the document's answer spans, and
    * does not start with the word "question" (case-insensitive, after
      stripping leading whitespace).

    Args:
        df: question rows with ``DOCUMENT``, ``START_OFFSET``, ``END_OFFSET``
            and ``title`` columns.
        technotes: frame that can be filtered by ``id`` and has a ``sections``
            column; each value is an iterable of dicts with ``start``, ``end``
            and ``text`` keys.
        min_length: minimum section length (exclusive) for a section to be
            kept. Defaults to 100.

    Returns:
        ``df`` followed by the new rows, under a fresh 0..n-1 index. New rows
        carry the section text in ``ANSWER``, NaN in ``QUESTION_TITLE`` and
        ``QUESTION_TEXT``, and ``'no-answer'`` in ``split``.

    Raises:
        IndexError: if a document of ``df`` has no entry in ``technotes``.
    """
    doc_sections = []
    for didx in df.DOCUMENT.unique():
        # look the document up once instead of once per section
        sections = technotes.query('`id` == @didx').sections.values[0]
        doc_rows = df[df.DOCUMENT == didx]
        title = doc_rows.title.values[0]
        answer_spans = {
            tuple(el)
            for el in doc_rows[['START_OFFSET', 'END_OFFSET']].values.tolist()
        }
        for sec in sections:
            if sec['end'] - sec['start'] <= min_length:
                continue
            section_span = pd.Interval(sec['start'], sec['end'])
            if any(section_span.overlaps(pd.Interval(*inter)) for inter in answer_spans):
                continue
            if sec['text'].lower().strip().startswith('question'):
                continue
            doc_sections.append({
                "DOCUMENT": didx,
                "START_OFFSET": sec['start'],
                "END_OFFSET": sec['end'],
                "title": title,
                "ANSWER": sec['text'],
                "QUESTION_TITLE": np.nan,
                "QUESTION_TEXT": np.nan,
                "split": 'no-answer'
            })
    if not doc_sections:
        # nothing to add; skip the concat with an empty, column-less frame
        return df.reset_index(drop=True)
    return pd.concat([df, pd.DataFrame(doc_sections)]).reset_index(drop=True)

In [12]:
# Extend the question rows with "no-answer" rows: sections of the same documents
# that do not overlap any answer span (see find_no_answer_chunks).
fullfilled_techqa = find_no_answer_chunks(no_overlap_techqa, train_sections)

*Formatting text*

In [13]:
def _collapse_blank_lines(text):
    """Strip surrounding whitespace and replace every double newline by a single one."""
    return text.strip().replace('\n\n', '\n')


def _build_query(title, text):
    """Join the normalised question title and text with a newline.

    Whichever of the two parts is a string is used, so a question with only a
    title (or only a text) still gets a normalised query. When neither part is
    a string (the no-answer rows) the original ``title`` value is returned
    unchanged.
    """
    parts = [_collapse_blank_lines(part) for part in (title, text) if isinstance(part, str)]
    return '\n'.join(parts) if parts else title


fullfilled_techqa.ANSWER = fullfilled_techqa.ANSWER.apply(_collapse_blank_lines)
fullfilled_techqa.title = fullfilled_techqa.title.apply(_collapse_blank_lines)

queries = [
    _build_query(title, text)
    for title, text in zip(fullfilled_techqa.QUESTION_TITLE, fullfilled_techqa.QUESTION_TEXT)
]
fullfilled_techqa['query'] = queries

In [14]:
# Compare the number of distinct titles with the number of distinct documents;
# equal counts are consistent with every title identifying exactly one document.
fullfilled_techqa.title.unique().shape, fullfilled_techqa.DOCUMENT.unique().shape

((489,), (489,))

In [None]:
# Export the final table: one row per (document chunk, query) with the column
# names expected in the CSV.
(
    fullfilled_techqa[['DOCUMENT', 'title', 'ANSWER', 'query']]
    .rename(columns={"DOCUMENT": "document_id", "ANSWER": "text"})
    .to_csv('techqa.csv', index=False)
)