### This notebook contains the regex rules used to identify causal questions and the code used to parse the datasets
### Download the original datasets from their respective sources first (links are provided in the README file)

#### 7 rules to identify causal questions

In [None]:
import json
import re

# The seven rules below are applied with .search() to questions that were first
# normalized by strip_punct (lowercased, punctuation replaced by single spaces).
# All patterns are raw strings so that regex escapes such as \A, \s and \w reach
# the regex engine untouched.

# Rule 1: "why" questions (at the start, after a leading "if"/"when", or embedded).
pattern1 = re.compile(r"\Awhy|(?=\Aif)(?=.*why )|(?=\Awhen)(?=.*why )| and why | why is \w+ | is \w+ why ", re.IGNORECASE)
# Rule 2: "cause"/"causes" as a separate word, or "because of what".
pattern2 = re.compile(r"\Acause.{0,1} | cause.{0,1} |because of what", re.IGNORECASE)
# Rule 3: "how come" / "how did".
pattern3 = re.compile(r"\s*how come |\s*how did ", re.IGNORECASE)
# Rule 4: "effect"/"effects" unless the question contains "dopplar", or " affect ".
# NOTE(review): "dopplar" looks like a misspelling of "doppler", and "affect{0,1}"
# makes the final "t" optional rather than allowing "affects" -- confirm the intent
# before changing, since both alter which questions are selected.
pattern4 = re.compile(r"^(?!.*dopplar).*effect.{0,1} .*$| affect{0,1} ", re.IGNORECASE)
# Rule 5: "... lead to".
pattern5 = re.compile(' lead to', re.IGNORECASE)
# Rule 6: "what happens / will happen / might happen" combined with "if" or "when".
# Adjacent raw literals are concatenated; a backslash-newline inside a raw string
# would stay in the pattern and force one alternative to start with a newline.
pattern6 = re.compile(
    r"(?=.*what happens)(?=.*if)|(?=.*what will happen)(?=.*if)|(?=.*what might happen)(?=.*if)|"
    r"(?=.*what happens)(?=.*when)|(?=.*what will happen)(?=.*when)|(?=.*what might happen)(?=.*when)",
    re.IGNORECASE,
)
# Rule 7: questions starting with "what to do ..." / "what should be done ...".
pattern7 = re.compile(
    r"\Awhat to do if |\Awhat to do when |\Awhat to do to |"
    r"\Awhat should be done if |\Awhat should be done when |\Awhat should be done to ",
    re.IGNORECASE,
)

In [None]:
def strip_punct(s):
    '''Normalize a question for rule matching.

    The text is lowercased, every character that is not a Russian or Latin
    letter or an ASCII digit is replaced by a space, and runs of whitespace
    are collapsed so the result has single spaces and no leading or trailing
    space.
    '''
    lowered = s.lower()
    cleaned = re.sub('[^А-Яа-яЁёЙйA-Za-z0-9]', ' ', lowered)
    # Only lowercase letters, digits and spaces are left at this point, so
    # splitting and re-joining is all that remains to be done.
    words = cleaned.split()
    return " ".join(words)

def identify_causal(json_file, question_field):
    '''Collect the entries of a JSON Lines source whose question is causal.

    json_file: iterable of JSON-encoded lines (an open file, or a list of
        str/bytes lines).
    question_field: key under which each decoded entry stores its question.

    Returns the list of decoded entries, in input order, whose normalized
    question matches at least one of pattern1..pattern7.
    '''
    rules = (pattern1, pattern2, pattern3, pattern4, pattern5, pattern6, pattern7)
    causal_entries = list()
    for jline in json_file:
        # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
        if not jline.strip():
            continue
        # Decode once; the same object is inspected and, on a match, stored.
        entry = json.loads(jline)
        question = strip_punct(entry[question_field])
        if any(rule.search(question) for rule in rules):
            causal_entries.append(entry)
    return causal_entries

#### PAQ, GooAQ, NewsQA

In [None]:
question_field = 'question' # PAQ, GooAQ, NewsQA

# The with-statement closes the file on exit, so no explicit close() is needed.
# JSON text is read as UTF-8 rather than the platform default encoding.
with open('path_to_json_data_file.jsonl', 'r', encoding='utf-8') as jfile:
    causal_entries = identify_causal(jfile, question_field)

#### HotpotQA

In [None]:
import json
import os

def hotpot_identify_causal(dicts):
    '''Select the HotpotQA entries whose question is causal.

    dicts: iterable of entries, each indexable by 'question' to get the
        question text.

    Returns the list of entries (the same objects, in input order) whose
    normalized question matches at least one of pattern1..pattern7.
    '''
    rules = (pattern1, pattern2, pattern3, pattern4, pattern5, pattern6, pattern7)
    causal_entries = list()
    for d in dicts:
        # Matching is done on the normalized text; the untouched entry is kept.
        q = strip_punct(d['question'])
        if any(rule.search(q) for rule in rules):
            causal_entries.append(d)
    return causal_entries

PATH = '' # path to the hotpot-master directory

# File names of the HotpotQA train split and of the distractor dev split.
train_filename = 'hotpot_train_v1.1.json'
dev_filename = 'hotpot_dev_distractor_v1.json'

# Each file is parsed in one go with json.load.
with open(os.path.join(PATH, train_filename), 'r') as f:
    train_dicts = json.load(f)
with open(os.path.join(PATH, dev_filename), 'r') as f:
    test_dicts = json.load(f)

# Keep only the causal entries of each split.
train_causal_entries = hotpot_identify_causal(train_dicts)
test_causal_entries = hotpot_identify_causal(test_dicts)

#### MS MARCO

In [None]:
import json
import gzip
import os

def msmarco_identify_causal(question):
    '''Tell whether a single MS MARCO query is a causal question.

    question: the raw query text.

    Returns True when the normalized query matches at least one of
    pattern1..pattern7, and False otherwise.
    '''
    q = strip_punct(question)
    rules = (pattern1, pattern2, pattern3, pattern4, pattern5, pattern6, pattern7)
    # any() always yields a real bool, so non-matching queries give False, not None.
    return any(rule.search(q) for rule in rules)

PATH = '' # path to the MS MARCO directory

with gzip.GzipFile(os.path.join(PATH,'train_v2.1.json.gz'), 'r') as f:
    data_train = json.loads(f.read().decode('utf-8'))
with gzip.GzipFile(os.path.join(PATH,'dev_v2.1.json.gz'), 'r') as f:
    data_dev = json.loads(f.read().decode('utf-8'))

causal_entries = list()

dataset = data_train
# dataset = data_dev # uncomment for the dev dataset

# The loop below reads each record by column name (data['query'], ...).
# NOTE(review): the v2.1 files presumably decode to ONE such column-oriented
# mapping rather than a list of them -- confirm against the downloaded data.
# Both shapes are accepted; iterating a single mapping directly would yield
# its string keys and fail on data['query'].
records = dataset if isinstance(dataset, list) else [dataset]

for data in records:
    # Walk the columns in lockstep so each tuple describes one query.
    rows = zip(data['query'].values(), data['query_id'].values(),
               data['wellFormedAnswers'].values(), data['passages'].values(),
               data['answers'].values(), data['query_type'].values())
    for query, qid, wellFormedAnswers, passages, answers, query_type in rows:
        if msmarco_identify_causal(query):
            causal_entries.append({
                'answers': answers,
                'passages': passages,
                'query': query,
                'query_id': qid,
                'query_type': query_type,
                'wellFormedAnswers': wellFormedAnswers,
            })

#### ELI5

In [None]:
import nlp

def eli_identify_causal(data):
    '''Select the ELI5 entries whose title is a causal question.

    data: iterable of entries, each indexable by 'title' to get the
        question text.

    Returns the list of entries (in input order) whose normalized title
    matches at least one of pattern1..pattern7.
    '''
    rules = (pattern1, pattern2, pattern3, pattern4, pattern5, pattern6, pattern7)
    causal_entries = list()
    for line in data:
        # The question text lives in the 'title' field of the current entry.
        q = strip_punct(line['title'])
        if any(rule.search(q) for rule in rules):
            causal_entries.append(line)
    return causal_entries

# Load ELI5 through the `nlp` library and pick the two splits used here.
eli5 = nlp.load_dataset('eli5')

train_data, dev_data = eli5['train_eli5'], eli5['validation_eli5']

# Filter each split down to its causal entries (train first, then dev).
train_causal_entries = eli_identify_causal(train_data)
dev_causal_entries = eli_identify_causal(dev_data)

#### SearchQA

In [None]:
import json
import os
import zipfile

from tqdm import tqdm  # progress bar; imported here so this cell runs on its own


def _read_zip_lines(zip_path):
    '''Return every line of every member of the zip archive at zip_path.

    Members are opened in binary mode, so the lines are bytes objects.
    '''
    lines = list()
    with zipfile.ZipFile(zip_path, 'r') as z:
        for filename in tqdm(z.namelist()):
            with z.open(filename, 'r') as f:
                lines.extend(f)
    return lines


path = '/SearchQA/data_json/train.zip' # path to the train zip
train_data = _read_zip_lines(path)

path = 'SearchQA/data_json/val.zip' # path to the val zip
val_data = _read_zip_lines(path)

# identify_causal decodes each collected line as JSON and reads its 'question' field.
train_causal_entries = identify_causal(train_data, 'question')
dev_causal_entries = identify_causal(val_data, 'question')

#### SQuAD 2.0

In [None]:
import json
import os

def squad_identify_causal(questions):
    '''Filter a collection of question strings down to the causal ones.

    questions: iterable of raw question strings.

    Returns a list with the original (un-normalized) strings, in input order,
    whose normalized form matches at least one of pattern1..pattern7.
    '''
    rules = (pattern1, pattern2, pattern3, pattern4, pattern5, pattern6, pattern7)

    def is_causal(text):
        # Normalize once, then try the rules in order, stopping at the first hit.
        normalized = strip_punct(text)
        return any(rule.search(normalized) for rule in rules)

    return [text for text in questions if is_causal(text)]

PATH = '' # path to the SQuAD directory

with open(os.path.join(PATH,'train-v2.0.json'), 'r') as f:
    train_data = json.load(f)

with open(os.path.join(PATH,'dev-v2.0.json'), 'r') as f:
    dev_data = json.load(f)


def _collect_causal_qas(data, causal_questions):
    '''Return the qas entries of one split whose question is in causal_questions.

    Each returned entry is the original qas dict, extended in place with the
    'context' of its paragraph and the 'title' of its article.
    '''
    selected = list()
    for entry in data['data']:
        for entry1 in entry['paragraphs']:
            for entry2 in entry1['qas']:
                if entry2['question'] in causal_questions:
                    entry2['context'] = entry1['context']
                    entry2['title'] = entry['title']
                    selected.append(entry2)
    return selected


# Gather every question of both splits.
squad_questions = list()

for data in [train_data, dev_data]:
    for i1 in data['data']:
        for i2 in i1['paragraphs']:
            for i3 in i2['qas']:
                squad_questions.append(i3['question'])


squad_causal_questions = squad_identify_causal(squad_questions)

# A set makes each membership test constant-time instead of a scan of the list.
causal_question_set = set(squad_causal_questions)

dicts_train = _collect_causal_qas(train_data, causal_question_set)
dicts_dev = _collect_causal_qas(dev_data, causal_question_set)

train_causal_entries = dicts_train
train_cuasal_entries = train_causal_entries  # misspelled name kept as an alias
dev_causal_entries = dicts_dev

#### NewsQA

In [None]:
import pandas as pd

# Read the combined NewsQA CSV and copy its `question` column into a plain list.
df_newsqa = pd.read_csv('combined-newsqa-data-v1.csv')
questions_newsqa = df_newsqa['question'].tolist()

def newsqa_identify_causal(questions):
    '''Filter NewsQA question values down to the causal question strings.

    questions: iterable of values taken from the question column; values
        that are not strings are skipped.

    Returns a list with the original strings, in input order, whose
    normalized form matches at least one of pattern1..pattern7.
    '''
    rules = (pattern1, pattern2, pattern3, pattern4, pattern5, pattern6, pattern7)
    causal_questions = list()

    for question in questions:
        # Only strings can be normalized and matched; skip everything else.
        if not isinstance(question, str):
            continue
        q = strip_punct(question)
        if any(rule.search(q) for rule in rules):
            causal_questions.append(question)
    return causal_questions
                
# Find the causal question strings, keep the rows whose question is one of them,
# and export those rows as a list of per-row dicts.
causal_questions = newsqa_identify_causal(questions_newsqa)

causal_mask = df_newsqa['question'].isin(causal_questions)
df_res = df_newsqa.loc[causal_mask]
df_res_json = df_res.to_dict(orient='records')

#### TriviaQA

In [None]:
import json
import os
import gzip
from tqdm import tqdm

PATH = '' # path to the TriviaQA directory

# Parse straight from the file object so the raw JSON text is not left bound to
# a module-level name after parsing. JSON text is read as UTF-8 rather than the
# platform default encoding.
with open(os.path.join(PATH, 'unfiltered-web-train.json'), 'r', encoding='utf-8') as z:
    json_train = json.load(z)

with open(os.path.join(PATH, 'unfiltered-web-dev.json'), 'r', encoding='utf-8') as z:
    json_dev = json.load(z)

def trivia_identify_causal(json_data):
    '''Select the TriviaQA entries whose question is causal.

    json_data: mapping whose 'Data' value is an iterable of entries, each
        indexable by 'Question' to get the question text.

    Returns the list of entries (in input order) whose normalized question
    matches at least one of pattern1..pattern7.
    '''
    rules = (pattern1, pattern2, pattern3, pattern4, pattern5, pattern6, pattern7)

    def is_causal(entry):
        # Normalize once, then try the rules in order, stopping at the first hit.
        normalized = strip_punct(entry['Question'])
        return any(rule.search(normalized) for rule in rules)

    return [entry for entry in json_data['Data'] if is_causal(entry)]


# Use the `train_causal_entries` spelling that the other dataset cells use.
train_causal_entries = trivia_identify_causal(json_train)
train_cuasal_entries = train_causal_entries  # misspelled name kept as an alias
dev_causal_entries = trivia_identify_causal(json_dev)

#### Natural Questions

In [None]:
import gzip
import os

def natural_identify_causal(f):
    '''Select the Natural Questions entries whose question is causal.

    f: iterable of JSON-encoded lines (an open file, or a list of str/bytes
        lines); each decoded entry is read through its 'question_text' field.

    Returns the list of decoded entries, in input order, whose normalized
    question matches at least one of pattern1..pattern7.
    '''
    rules = (pattern1, pattern2, pattern3, pattern4, pattern5, pattern6, pattern7)
    causal_entries = list()
    for jline in f:
        # Decode once; the same object is inspected and, on a match, stored.
        entry = json.loads(jline)
        q = strip_punct(entry['question_text'])
        if any(rule.search(q) for rule in rules):
            causal_entries.append(entry)

    return causal_entries

from tqdm import tqdm  # progress bar; imported here so this cell runs on its own

PATH = '' # path to the natural questions directory

with gzip.GzipFile(os.path.join(PATH, 'v1.0-simplified_simplified-nq-train.jsonl.gz'), 'r') as f:
    train_causal_entries = natural_identify_causal(f)
train_cuasal_entries = train_causal_entries  # misspelled name kept as an alias

PATH = '' + '/v1.0/dev/' # there are several json files in the dev directory

# Collect the raw lines of every dev file into one flat list.
flat_data = list()

for filename in tqdm(os.listdir(PATH)):
    with gzip.GzipFile(os.path.join(PATH, filename), 'r') as f:
        flat_data.extend(f)

# flat_data is a list of raw JSON lines, which is the input natural_identify_causal
# iterates over; trivia_identify_causal would index the list with 'Data' and fail.
# NOTE(review): assumes the dev files also carry a 'question_text' field -- confirm.
dev_causal_entries = natural_identify_causal(flat_data)