# MAVEN Expansion
This Jupyter notebook adds lemma, PoS, dependency-parsing and some other information to the MAVEN dataset using [Stanza](https://stanfordnlp.github.io/stanza).

## Settings

In [None]:
# Directory that contains the input MAVEN ``.jsonl`` files.
data_dir = './maven_data'

# Names of the input files (relative to ``data_dir``), processed in this order.
files = (
    'valid.jsonl',
    'test.jsonl',
    'train.jsonl',
)

# Directory that receives the expanded files (may equal ``data_dir``).
output_dir = './maven_data'

# Text placed before the '.jsonl' extension of every output file name.
suffix = '_expanded'

## Resources

In [None]:
import os
import json
import stanza

In [None]:
# (Optional) Uncomment the following line if the Stanza English model has not been downloaded yet.
# stanza.download('en')

## Processing

In [None]:
# Build the English Stanza pipeline once; process_doc reuses it for every
# sentence. The input is handed over as ready-made token lists, hence
# tokenize_pretokenized=True.
nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize,mwt,pos,lemma,depparse',
    tokenize_pretokenized=True,
)

In [None]:
def process_doc(doc_item):
    """Attach Stanza token annotations to every sentence of a document.

    Each sentence in ``doc_item['content']`` is run through the module-level
    ``nlp`` pipeline as one pre-tokenized sentence, and the resulting list of
    per-token dicts is stored on the sentence under ``'token_info'``.

    Args:
        doc_item: A dict with a ``'content'`` list; every entry of that list
            is a dict with a ``'tokens'`` list.

    Returns:
        The same ``doc_item`` object, modified in place.
    """
    for sentence_item in doc_item['content']:
        sentence_tokens = sentence_item['tokens']

        if not sentence_tokens:
            # Nothing to annotate: avoid sending an empty sentence through the
            # pipeline and indexing ``.sentences[0]`` on its result.
            sentence_item['token_info'] = []
            continue

        sentence_result = nlp([sentence_tokens]).sentences[0].to_dict()
        for token_result in sentence_result:
            # Shift Stanza's 1-based indices to 0-based so they line up with
            # positions in ``tokens``; a head of 0 therefore becomes -1
            # (presumably the root marker, following CoNLL-U -- confirm).
            token_result['id'] -= 1
            token_result['head'] -= 1
            # The surface form is already stored in ``tokens``.
            token_result.pop('text', None)
            # 'misc' is not guaranteed to be present in the token dict
            # (depends on the Stanza version), so drop it only if it exists.
            token_result.pop('misc', None)
        sentence_item['token_info'] = sentence_result
    return doc_item

In [None]:
def process_file(file_name):
    """Expand one JSON-lines file and write the result next to ``suffix``.

    Reads ``data_dir/file_name`` line by line, passes every non-blank line
    (one JSON document per line) through ``process_doc`` and writes the
    expanded document as one JSON line to
    ``output_dir/<file stem><suffix>.jsonl``.

    Args:
        file_name: Name of the input file, relative to ``data_dir``.

    Raises:
        ValueError: If the output path is the same as the input path, since
            opening it for writing would truncate the input being read.
    """
    # Strip the real extension instead of blindly cutting six characters.
    stem, _ = os.path.splitext(file_name)
    output_file_name = stem + suffix + '.jsonl'

    file_path = os.path.join(data_dir, file_name)
    output_file_path = os.path.join(output_dir, output_file_name)

    if os.path.abspath(file_path) == os.path.abspath(output_file_path):
        raise ValueError(
            'output file would overwrite input file: %s' % file_path)

    # Make sure the destination directory exists before opening the output.
    output_parent = os.path.dirname(output_file_path)
    if output_parent:
        os.makedirs(output_parent, exist_ok=True)

    with open(file_path, 'r', encoding='utf-8') as f, \
            open(output_file_path, 'w', encoding='utf-8') as of:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            expanded_doc_item = process_doc(json.loads(line))
            # ensure_ascii=False keeps non-ASCII characters readable.
            of.write(json.dumps(expanded_doc_item, ensure_ascii=False) + '\n')

In [None]:
# Expand every configured input file in turn, reporting progress on stdout.
for file_name in files:
    print('processing %s' % file_name)
    process_file(file_name)
    print('%s done' % file_name)

In [None]:
# Drop the module-level reference to the Stanza pipeline once processing is
# finished (presumably so its models can be garbage-collected -- confirm).
del nlp