In [11]:
import pickle as pkl
import spacy
import csv
import json
from copy import deepcopy
import stanza
import spacy_stanza
import benepar
from tqdm import tqdm
from copy import deepcopy

In [2]:
from data_construction.parallel_corpus.utils import merge_maximum_span
from data_construction.parallel_corpus.utils import clean_sentence_brackets
from data_construction.parallel_corpus.utils import process_nps_punctuation

In [8]:
# Load the three English spaCy pipelines whose noun chunks / POS tags are compared below.
sm_parser = spacy.load('en_core_web_sm')
# en_core_web_md with the benepar component (model "benepar_en3") appended to the pipeline.
# NOTE(review): Doc.noun_chunks is presumably still derived from spaCy's dependency parse
# here rather than from benepar's constituency output — confirm before treating the
# 'berkeley_*' results as Berkeley-parser noun phrases.
berkeley_parser = spacy.load('en_core_web_md')
berkeley_parser.add_pipe("benepar", config={"model": "benepar_en3"})
trf_parser = spacy.load("en_core_web_trf")

In [9]:
# Sanity check: print the noun chunks each spaCy pipeline extracts from one sentence.
sentence = "It is 5:30am in the morning."

for nlp in (sm_parser, berkeley_parser, trf_parser):
    print(list(nlp(sentence).noun_chunks))

[It, the morning]
[It, the morning]
[It, the morning]




In [12]:
# Download the English Stanza resources, then build the pipeline through spacy_stanza
# so it can be called on a sentence the same way as the spaCy parsers.
stanza.download("en")
stanza_parser = spacy_stanza.load_pipeline("en")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.3.0.json:   0%|   …

2022-05-17 13:28:09 INFO: Downloading default packages for language: en (English)...
2022-05-17 13:28:11 INFO: File exists: /Users/boyuanzheng/stanza_resources/en/default.zip.
2022-05-17 13:28:15 INFO: Finished downloading models and saved to /Users/boyuanzheng/stanza_resources.
2022-05-17 13:28:15 INFO: Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | combined  |
| pos          | combined  |
| lemma        | combined  |
| depparse     | combined  |
| sentiment    | sstplus   |
| constituency | wsj       |
| ner          | ontonotes |

2022-05-17 13:28:15 INFO: Use device: cpu
2022-05-17 13:28:15 INFO: Loading: tokenize
2022-05-17 13:28:15 INFO: Loading: pos
2022-05-17 13:28:16 INFO: Loading: lemma
2022-05-17 13:28:16 INFO: Loading: depparse
2022-05-17 13:28:16 INFO: Loading: sentiment
2022-05-17 13:28:16 INFO: Loading: constituency
2022-05-17 13:28:17 INFO: Loading: ner
2022-05-17 13:28:18 INFO: Done loading pr

In [13]:
# Same sanity check as for the spaCy pipelines, using the Stanza-backed parser.
print(list(stanza_parser(sentence).noun_chunks))

[W NNPACK.cpp:79] Could not initialize NNPACK! Reason: Unsupported hardware.


[It]


In [None]:
# Inspect which components the Stanza-backed pipeline exposes.
# (The previous content of this cell was an unfinished attribute access, which is a
# SyntaxError and stops "Restart & Run All".)
stanza_parser.pipe_names

In [23]:
# Load the parallel corpus into a list of instances.
# The corpus holds Persian and Chinese text, so decode it explicitly as UTF-8 instead of
# relying on the platform's default encoding (which is not UTF-8 everywhere).
with open('../parallel_corpus/parallel_data/en_fa_zh_parallel_corpus.json', 'r', encoding='utf-8') as f:
    source_data = list(json.load(f))

In [24]:
# Annotate every English utterance with each spaCy pipeline and cache the result on disk.
# For each parser prefix P ('sm', 'berkeley', 'trf') two keys are added to the instance:
#   '<P>_noun_chunk': (text, start, end) for every noun chunk, in token offsets
#   '<P>_pron':       (text, i, i + 1, pos_, tag_) for EVERY token, not only pronouns;
#                     the filtering on POS == 'PRON' happens when mentions are collected.
parsers = (
    ('sm', sm_parser),
    ('berkeley', berkeley_parser),
    ('trf', trf_parser),
)

parsed_data = []
for instance in tqdm(source_data):
    # Annotate a copy so the raw source_data entries are not mutated and this cell
    # stays safe to re-run.
    annotated = deepcopy(instance)
    text = annotated['en_utterance']
    for prefix, parser in parsers:
        utterance = parser(text)
        annotated[f'{prefix}_noun_chunk'] = [
            (chunk.text, chunk.start, chunk.end) for chunk in utterance.noun_chunks
        ]
        annotated[f'{prefix}_pron'] = [
            (token.text, i, i + 1, token.pos_, token.tag_) for i, token in enumerate(utterance)
        ]
    parsed_data.append(annotated)

with open('parsed_three_way_corpus.pkl', 'wb') as f:
    pkl.dump(parsed_data, f)

100%|██████████| 16252/16252 [1:20:19<00:00,  3.37it/s]  


In [2]:
# Reload the cached parser annotations from disk.
# NOTE: unpickling can execute arbitrary code — only load files produced locally.
with open('parsed_three_way_corpus.pkl', 'rb') as pickle_file:
    temp = pkl.load(pickle_file)

In [7]:
# Inspect the first cached instance to check which fields were stored.
print(temp[0])

{'en_utterance': "That was the closest I've come to sex in, like, two years.", 'fa_utterance': 'اين نزديکترين برخوردي بود که احتمال سکس داشت تو دو سال گذشته', 'zh_utterance': '谢谢 这大概是我两年来 最接近滚床单的时刻了', 'sm_noun_chunk': [('That', 0, 1), ('I', 4, 5), ('sex', 8, 9)], 'sm_pron': [('That', 0, 1, 'PRON', 'DT'), ('was', 1, 2, 'AUX', 'VBD'), ('the', 2, 3, 'DET', 'DT'), ('closest', 3, 4, 'ADJ', 'JJS'), ('I', 4, 5, 'PRON', 'PRP'), ("'ve", 5, 6, 'AUX', 'VBP'), ('come', 6, 7, 'VERB', 'VBN'), ('to', 7, 8, 'ADP', 'IN'), ('sex', 8, 9, 'NOUN', 'NN'), ('in', 9, 10, 'ADP', 'RP'), (',', 10, 11, 'PUNCT', ','), ('like', 11, 12, 'INTJ', 'UH'), (',', 12, 13, 'PUNCT', ','), ('two', 13, 14, 'NUM', 'CD'), ('years', 14, 15, 'NOUN', 'NNS'), ('.', 15, 16, 'PUNCT', '.')], 'berkeley_noun_chunk': [('That', 0, 1), ('I', 4, 5), ('sex', 8, 9)], 'berkeley_pron': [('That', 0, 1, 'PRON', 'DT'), ('was', 1, 2, 'AUX', 'VBD'), ('the', 2, 3, 'DET', 'DT'), ('closest', 3, 4, 'ADJ', 'JJS'), ('I', 4, 5, 'PRON', 'PRP'), ("'ve", 5, 6,

In [12]:
def collect_all_mentions(instance):
    """Gather the candidate mentions (noun phrases and pronouns) of one instance.

    Args:
        instance: dict holding, for each of the 'sm', 'berkeley' and 'trf' parsers,
            a '<parser>_noun_chunk' list and a '<parser>_pron' list of per-token
            entries whose first four fields are (text, start, end, pos).

    Returns:
        A list with the union of the merged noun-phrase spans and the merged
        pronoun spans. It is built from a set, so its order is not defined.
    """
    # The token texts are taken from the 'sm' annotation only.
    tokens = [entry[0] for entry in instance['sm_pron']]

    parser_keys = (
        ('sm_noun_chunk', 'sm_pron'),
        ('berkeley_noun_chunk', 'berkeley_pron'),
        ('trf_noun_chunk', 'trf_pron'),
    )

    chunk_sets = []
    pronoun_spans = []
    for chunk_key, token_key in parser_keys:
        # The punctuation clean-up is applied twice, as in the original pipeline.
        cleaned = process_nps_punctuation(
            tokens, process_nps_punctuation(tokens, instance[chunk_key])
        )
        chunk_sets.append(set(cleaned))
        for entry in instance[token_key]:
            if entry[3] == 'PRON':
                pronoun_spans.append((entry[0], entry[1], entry[2]))

    noun_phrases = merge_maximum_span(list(set().union(*chunk_sets)))
    pronouns = merge_maximum_span(list(set(pronoun_spans)))
    return list(set(noun_phrases) | set(pronouns))
# Spot-check the collected mentions on a single instance.
collect_all_mentions(temp[3])

[('her', 13, 14),
 ('anyone', 8, 9),
 ('my wife', 4, 6),
 ('sex', 17, 18),
 ('it', 19, 20),
 ('me', 21, 22),
 ('that', 2, 3),
 ('my', 4, 5)]

In [22]:
# Count the mentions collected over the last 2504 instances.
# NOTE(review): 2504 is presumably the size of a held-out split — confirm and move it
# into a named constant.
count = 0
failures = []  # (position within the slice, error repr) for instances that were skipped
for position, item in enumerate(tqdm(temp[-2504:])):
    try:
        count += len(collect_all_mentions(item))
    except Exception as exc:
        # Stay best-effort (skip the instance) but record the error instead of hiding it;
        # unlike a bare `except`, this also lets KeyboardInterrupt stop the loop.
        failures.append((position, repr(exc)))

if failures:
    print(f'Skipped {len(failures)} instance(s); first failure: {failures[0]}')

100%|██████████| 2504/2504 [00:00<00:00, 9775.72it/s]


In [21]:
# Total number of cached instances.
print(len(temp))

16252


In [23]:
# Total number of mentions counted by the counting loop.
print(count)

11980
