In [None]:
import os
import sys

import pathlib
import logging

# Send log records to stdout and only show records of level ERROR or above.
logging.basicConfig(stream=sys.stdout, level=logging.ERROR)
# Run the following command once before the first launch; the notebook
# loads the "en_core_web_sm" spaCy model further down:
#   python -m spacy download en_core_web_sm

In [None]:
# Name of the corpus to process. Its settings are read from
# corpora/<CORPUS>.properties. Swap the assignment to switch corpora.
CORPUS = 'ArxivHealthcareNLP'
#CORPUS = 'arxiv_cl'

In [None]:
def load_properties(filepath, sep='=', comment_char='#'):
    '''
    Parse a properties-style text file into a dict.

    Blank lines and lines starting with ``comment_char`` are skipped. Every
    other line is cut at the first ``sep``: the text before it is the key and
    the text after it is the value (a line without ``sep`` yields an empty
    value). Keys and values are stripped of surrounding whitespace; values
    additionally lose surrounding double quotes. When a key occurs more than
    once, the last occurrence wins.
    '''
    properties = {}
    with open(filepath, "rt") as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            # guard clause: nothing to parse on blank or comment lines
            if not entry or entry.startswith(comment_char):
                continue
            # only the first separator splits; later ones belong to the value
            name, _, remainder = entry.partition(sep)
            properties[name.strip()] = remainder.strip().strip('"')
    return properties

# Load the settings of the selected corpus; the bare name on the last line
# makes the notebook display the resulting dict as the cell output.
corpus_properties = load_properties(f"corpora/{CORPUS}.properties")
corpus_properties

In [None]:
# Base directory of the corpus, taken from the 'corpus_base' property
# (raises KeyError if the properties file does not define it).
CORPUS_BASE = corpus_properties['corpus_base']
# Input directory: the text files that are listed and read below.
TXT_BASE = f'{CORPUS_BASE}/text_cleaned/'
# Output directory: one coref-resolved file per input file is written here.
TXT_COREF_BASE = f'{CORPUS_BASE}/text_coref_resolved/'

# Create the output directory (including missing parents) on first run.
if not os.path.exists(TXT_COREF_BASE):
    print(f'{TXT_COREF_BASE} does not exist. Creating.')
    os.makedirs(TXT_COREF_BASE)

In [None]:
from os import listdir
from os.path import isfile, join
# Names of the regular files in TXT_BASE (sub-directories are ignored).
# os.listdir returns entries in arbitrary order, so the names are sorted to
# make the processing order, and therefore the progress output, reproducible.
txt_files = sorted(f for f in listdir(TXT_BASE) if isfile(join(TXT_BASE, f)))
len(txt_files)

In [None]:
def read_text_file(filename):
    """
    Return the content of ``filename`` decoded as UTF-8.

    The file is read as raw bytes and decoded explicitly, so line endings
    are returned exactly as stored. The number of decoded characters is
    printed as a progress hint. Raises UnicodeDecodeError if the file is not
    valid UTF-8.
    """
    raw_bytes = pathlib.Path(filename).read_bytes()
    decoded = raw_bytes.decode("utf-8")
    print(f'File length: {len(decoded)}')
    return decoded

In [None]:
# python -m spacy download en_core_web_sm
from fastcoref import spacy_component
import spacy

# Lazily-built spaCy pipeline with the fastcoref component; shared by all
# calls to resolve_corefs so the models are loaded only once per kernel.
_COREF_PIPELINE = None


def _get_coref_pipeline():
    """Build the spaCy + fastcoref pipeline on first use and return it."""
    global _COREF_PIPELINE
    if _COREF_PIPELINE is None:
        nlp = spacy.load("en_core_web_sm")
        nlp.add_pipe(
            "fastcoref",
            config={'model_architecture': 'LingMessCoref',
                    'model_path': 'biu-nlp/lingmess-coref',
                    'device': 'cpu'}
        )
        _COREF_PIPELINE = nlp
    return _COREF_PIPELINE


def resolve_corefs(text):
    """
    Resolve coreferences in ``text``.

    Returns a tuple ``(resolved_text, coref_clusters)`` read from the
    ``doc._.resolved_text`` and ``doc._.coref_clusters`` extensions set by
    the "fastcoref" pipeline component.

    The pipeline is created once and reused: loading the spaCy model and the
    coreference model on every call is loop-invariant work that dominated
    the run time when this function is called once per text slice.
    """
    nlp = _get_coref_pipeline()
    # resolve_text must be requested per call for resolved_text to be filled
    doc = nlp(text, component_cfg={"fastcoref": {'resolve_text': True}})
    return doc._.resolved_text, doc._.coref_clusters


In [None]:
def write_text_file(filename, content):
    """
    Write ``content`` to ``filename`` as UTF-8, replacing any existing file.

    The text is encoded first and the resulting bytes are stripped, so only
    leading/trailing ASCII whitespace bytes are removed.
    """
    payload = content.encode('utf-8')
    target = pathlib.Path(filename)
    target.write_bytes(payload.strip())

In [None]:
# fastcoref max_doc_len is 4096 (presumably counted in model tokens - TODO confirm)
# TODO - split a large file in a memory-effective way; add overlap for context
# Split texts so each slice stays at 4096 tokens or less; the estimate used
# was 3200 tokens x 5 chars/token.
# NOTE(review): 3200 x 5 is 16000, not 12000 - confirm whether the lower
# value is an intentional safety margin.
# MAX_SLICE_LEN is compared against len() of strings by the split functions
# below, i.e. it is a limit in characters, not in tokens.
MAX_SLICE_LEN = 12000 # tokens are smaller than words

def split_large_paragraphs(text, max_len=None):
    """
    Split ``text`` at sentence boundaries ('.') into slices of bounded size.

    Sentences are packed greedily: a slice is closed as soon as the next
    sentence would make it reach ``max_len`` characters. Concatenating the
    returned slices reproduces ``text`` exactly, so no sentence is lost and
    no extra '.' is introduced.

    Parameters:
        text: the text to split.
        max_len: size limit in characters; defaults to the module-level
            MAX_SLICE_LEN when omitted.

    Returns:
        A list of non-empty strings, each at most ``max_len`` characters
        long. The one exception is a single sentence that alone exceeds the
        limit: it is returned as its own oversize slice rather than dropped.
        An empty ``text`` yields an empty list.
    """
    if max_len is None:
        max_len = MAX_SLICE_LEN
    sentences = text.split('.')
    print(f'Sentences: {len(sentences)}')
    slices = []
    current = []      # sentences collected for the slice being built
    current_len = 0   # len('.'.join(current))
    for sentence in sentences:
        # a '.' separator is only needed between sentences of one slice
        added = len(sentence) + (1 if current else 0)
        if current and current_len + added >= max_len:
            # close the slice; the trailing '.' is the separator that
            # text.split('.') consumed between this slice and the next one
            slices.append('.'.join(current) + '.')
            current, current_len = [], 0
            added = len(sentence)
        # the sentence that triggered the flush starts the next slice
        current.append(sentence)
        current_len += added
    # add last pending slice, unless there is nothing left to add
    tail = '.'.join(current)
    if tail:
        slices.append(tail)

    return slices

def split_large_file(text):
    """
    Split ``text`` into slices shorter than MAX_SLICE_LEN characters.

    The text is split into paragraphs at '\\n'. Consecutive paragraphs are
    packed greedily into one slice (joined with '\\n') while the slice stays
    below MAX_SLICE_LEN. A paragraph that is itself MAX_SLICE_LEN or longer
    is handed to split_large_paragraphs, and its slices are inserted in
    place, so the document order is preserved.

    Every paragraph ends up in exactly one slice: the paragraph that
    overflows the current slice starts the next one instead of being
    dropped. Slices consisting only of whitespace are skipped because they
    contain nothing to resolve.

    Returns:
        A list of strings in document order.
    """
    paragraphs = text.split('\n')
    print(f'Paragraphs: {len(paragraphs)}')
    slices = []
    current = []      # paragraphs collected for the slice being built
    current_len = 0   # len('\n'.join(current))

    def flush():
        """Close the slice being built and start a new, empty one."""
        nonlocal current, current_len
        chunk = '\n'.join(current)
        if chunk.strip():
            slices.append(chunk)
        current, current_len = [], 0

    for paragraph in paragraphs:
        if len(paragraph) >= MAX_SLICE_LEN:
            # large paragraph: close the pending slice first to keep order,
            # then split only this paragraph (not the whole text)
            flush()
            p_slices = split_large_paragraphs(paragraph)
            print(f'large paragraphs: {len(p_slices)} slices')
            slices.extend(p_slices)
            continue
        # a '\n' separator is only needed between paragraphs of one slice
        added = len(paragraph) + (1 if current else 0)
        if current and current_len + added >= MAX_SLICE_LEN:
            flush()
            added = len(paragraph)
        # the paragraph that triggered the flush starts the next slice
        current.append(paragraph)
        current_len += added

    # add last pending slice
    flush()

    return slices

In [None]:

# Resolve coreferences for every cleaned text file and write the result to
# TXT_COREF_BASE under the same file name. Files that already have an output
# are skipped, so an interrupted run can simply be restarted.
for txt_file in txt_files:
    resolved_file_name = join(TXT_COREF_BASE, txt_file)
    if os.path.exists(resolved_file_name):
        print(f'Skipping existing resolved file: {resolved_file_name}')
        continue

    # read cleaned file
    file_name = join(TXT_BASE, txt_file)
    print(f'Processing file: {resolved_file_name}')
    txt = read_text_file(file_name)
    splits = split_large_file(txt)
    print(f'Splits: {len(splits)}')
    # collect the parts in a list and join once instead of growing a string
    resolved_parts = []
    for split in splits:
        try:
            # resolve corefs
            resolved_split, coref_clusters = resolve_corefs(split)
            print(f'Found {len(coref_clusters)} coref clusters.')
            resolved_parts.append(resolved_split)
        except IndexError as e:
            # Best effort: keep the original text of a slice that cannot be
            # processed (original author's note: mostly reference lists).
            print(f'Error processing split ({e}). Adding unchanged. \n{split}')
            resolved_parts.append(split)
    resolved_txt = '\n'.join(resolved_parts)
    # write the file with the resolved corefs
    write_text_file(resolved_file_name, resolved_txt)