In [5]:
import pickle as pkl
import spacy
import csv
import json
from copy import deepcopy
from tqdm import tqdm
import benepar
# Set the csv parser's maximum allowed field size to 1131072. The source CSV
# read later in this notebook stores an entire JSON document in each
# 'json_data' cell, so fields are presumably large — TODO confirm the limit
# still covers the biggest row.
csv.field_size_limit(1131072)

1131072

In [6]:
from utils import merge_maximum_span
from utils import process_nps_punctuation
from utils import combine_samples

In [7]:
# Three spaCy pipelines are loaded; their mention candidates are combined
# later in the notebook.
#   sm_parser       -> en_core_web_sm
#   berkeley_parser -> en_core_web_md with the "benepar" component
#                      (model "benepar_en3") appended to the pipeline
#   trf_parser      -> en_core_web_trf
sm_parser = spacy.load("en_core_web_sm")

berkeley_parser = spacy.load("en_core_web_md")
berkeley_parser.add_pipe("benepar", config=dict(model="benepar_en3"))

trf_parser = spacy.load("en_core_web_trf")

In [8]:
def _noun_and_pronoun_spans(doc):
    """Collect candidate mention spans from one parsed spaCy ``Doc``.

    Returns a ``(noun_spans, pron_spans)`` pair of lists of
    ``(text, start, end)`` tuples, where ``start``/``end`` are token indices
    into ``doc`` (``end`` exclusive):

    * ``noun_spans``: every ``doc.noun_chunks`` span, followed by a
      single-token span for each token tagged ``PROPN``.
    * ``pron_spans``: a single-token span for each token tagged ``PRON``.
    """
    noun_spans = [(chunk.text, chunk.start, chunk.end) for chunk in doc.noun_chunks]
    noun_spans += [(tok.text, idx, idx + 1) for idx, tok in enumerate(doc) if tok.pos_ == "PROPN"]
    pron_spans = [(tok.text, idx, idx + 1) for idx, tok in enumerate(doc) if tok.pos_ == "PRON"]
    return noun_spans, pron_spans


# Read the pilot source CSV; every row carries a JSON document in 'json_data'
# whose 'sentences' entry is a list of utterances.  For each utterance the
# three parsers propose noun-phrase / pronoun spans, the proposals are unioned
# and merged, and the resulting mentions are emitted as token spans.
# newline="" is what the csv module documentation asks for on files handed to
# csv readers, so quoted fields containing line breaks are parsed correctly.
with open('pilot_data/pilot_1_source.csv', "r", encoding="utf-8", newline="") as csv_fh:
    reader = csv.DictReader(csv_fh)
    output = []
    for line in reader:
        sentences = json.loads(line['json_data'])['sentences']
        all_sentences = []
        all_query_spans = []
        for j, sent in enumerate(sentences):
            # sent[0] is used as the speaker and sent[2:] as the utterance pieces.
            # NOTE(review): sent[1] is skipped — presumably a separator; confirm
            # against the source schema.
            speaker = sent[0].strip()
            sentence = " ".join(sent[2:]).strip()

            # Parse once per pipeline and reuse the Doc (the previous version
            # re-ran each parser for every list it built).
            sm_doc = sm_parser(sentence)
            sm_nouns, sm_prons = _noun_and_pronoun_spans(sm_doc)
            berkeley_nouns, berkeley_prons = _noun_and_pronoun_spans(berkeley_parser(sentence))
            trf_nouns, trf_prons = _noun_and_pronoun_spans(trf_parser(sentence))

            # Token text comes from the small pipeline's tokenization.
            sentence_token = [tok.text for tok in sm_doc]

            # Union the three parsers' proposals, merge them, then run the
            # punctuation clean-up twice (as the original code did).
            noun_phrase = merge_maximum_span(list(set(sm_nouns) | set(berkeley_nouns) | set(trf_nouns)))
            noun_phrase = process_nps_punctuation(sentence_token, process_nps_punctuation(sentence_token, noun_phrase))

            pron = merge_maximum_span(list(set(sm_prons) | set(berkeley_prons) | set(trf_prons)))
            pron = process_nps_punctuation(sentence_token, process_nps_punctuation(sentence_token, pron))

            # Order mentions by start token; ties are broken by end token so the
            # result no longer depends on set iteration order.
            mention = sorted(set(noun_phrase) | set(pron), key=lambda span: (span[1], span[2]))

            speaker_tokens = [speaker]
            # NOTE(review): [speaker_tokens] nests the speaker list inside the
            # sentence, giving [[speaker], ":", tok, ...] rather than a flat token
            # list.  Left as-is because downstream consumers may rely on it —
            # confirm whether `speaker_tokens + [":"] + sentence_token` was intended.
            all_sentences.append([speaker_tokens] + [":"] + sentence_token)

            # Shift span indices past the speaker prefix and the ":" token.
            prefix_len = len(speaker_tokens) + 1
            for span in mention:
                all_query_spans.append({
                        "sentenceIndex": j,
                        "startToken": span[1] + prefix_len,
                        "endToken": span[2] + prefix_len
                    })
        # The same span list is used for the query, candidate and click spans.
        output.append({
            "sentences": all_sentences,
            "querySpans": all_query_spans,
            "candidateSpans": all_query_spans,
            "clickSpans": all_query_spans,
            })



In [11]:
# Hand the full list of per-row samples to combine_samples and keep its result
# wrapped in a one-element list, so it can be written out row by row like
# `output`.
combined_output = [combine_samples(output)]

In [12]:
# Write the combined sample(s) to CSV: a header row, then one row per entry
# with the entry JSON-encoded in the single 'json_data' column.
# newline="" stops text-mode newline translation, so the explicit
# lineterminator='\n' is what actually lands in the file on every platform
# (per the csv module documentation).
with open('pilot_data/corrected_pilot_1_combined.csv', "w", encoding="utf-8", newline="") as csv_fh:
    writer = csv.DictWriter(csv_fh, fieldnames=['json_data'], lineterminator='\n')
    writer.writeheader()
    writer.writerows({'json_data': json.dumps(sample)} for sample in combined_output)

In [5]:
# Write the per-row samples to CSV: a header row, then one row per sample with
# the sample JSON-encoded in the single 'json_data' column.
# newline="" stops text-mode newline translation, so the explicit
# lineterminator='\n' is what actually lands in the file on every platform
# (per the csv module documentation).
with open('pilot_data/corrected_pilot_1.csv', "w", encoding="utf-8", newline="") as csv_fh:
    writer = csv.DictWriter(csv_fh, fieldnames=['json_data'], lineterminator='\n')
    writer.writeheader()
    writer.writerows({'json_data': json.dumps(sample)} for sample in output)