In [None]:
import os
import sys

import re
import json
import pathlib
import logging

# Route log records (INFO and above) to stdout so they appear in the cell output.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

In [None]:
# Name of the corpus to process; it selects the corpora/<CORPUS>.properties file read below.
CORPUS = 'ArxivHealthcareNLP'
# Alternative corpus - uncomment to switch.
#CORPUS = 'arxiv_cl'

In [None]:
def load_properties(filepath, sep='=', comment_char='#'):
    '''
    Parse a simple "key<sep>value" properties file into a dict.

    Blank lines and lines starting with comment_char are ignored. Only the
    first occurrence of sep splits a line, so values may contain sep. Keys
    and values are stripped of surrounding whitespace, and values additionally
    of surrounding double quotes. A line without sep yields the whole line as
    key with an empty string as value.

    :param filepath: path of the properties file to read
    :param sep: key/value separator
    :param comment_char: prefix that marks a comment line
    :return: dict mapping property names to string values
    '''
    props = {}
    with open(filepath, "rt") as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            # skip blanks and comments
            if not entry or entry.startswith(comment_char):
                continue
            name, _, remainder = entry.partition(sep)
            props[name.strip()] = remainder.strip().strip('"')
    return props

# Read the settings of the selected corpus from corpora/<CORPUS>.properties;
# the bare name on the last line displays the resulting dict as the cell output.
corpus_properties = load_properties(f"corpora/{CORPUS}.properties")
corpus_properties

In [None]:
# Folder layout under the corpus base: raw JSON input, cleaned text and cleaned JSON output.
# The trailing slash matters: file names are appended to these strings directly.
CORPUS_BASE = corpus_properties['corpus_base']
JSON_RAW_BASE = f'{CORPUS_BASE}/json_raw/'
TXT_BASE = f'{CORPUS_BASE}/text_cleaned/'
JSON_BASE = f'{CORPUS_BASE}/json_cleaned/'

# Create the two output folders on first use (cleaned JSON first, then cleaned text).
for output_dir in (JSON_BASE, TXT_BASE):
    if os.path.exists(output_dir):
        continue
    print(f'{output_dir} does not exist. Creating.')
    os.makedirs(output_dir)

In [None]:
from os import listdir
from os.path import isfile, join
# Keep only regular files from the raw JSON folder (sub-directories are skipped),
# then show how many documents were found.
json_files = [
    entry
    for entry in listdir(JSON_RAW_BASE)
    if isfile(join(JSON_RAW_BASE, entry))
]
len(json_files)

In [None]:
def read_text_file(filename):
    '''
    Read a file in binary mode, print its size in bytes and return the bytes.

    :param filename: path of the file to read
    :return: the undecoded file content (bytes)
    '''
    with open(filename, 'rb') as handle:
        raw_bytes = handle.read()
    print(len(raw_bytes))
    return raw_bytes

In [None]:
def cleanup_text(raw_text):
    '''
    Strip extraction noise from the raw text of a paper.

    The substitutions below are applied strictly in order; several of them
    only make sense on the output of earlier ones (e.g. the bracket and
    punctuation clean-ups that are repeated after numbers and stray letters
    have been removed), so do not reorder them.

    Roughly, in order: re-join hyphenated line breaks and lone newlines, drop
    latexit blobs, URLs, e-mail addresses and reference lines, drop anything
    inside (), [] and {}, drop figure/table labels, digits, list markers,
    stray abbreviations, symbols, CJK characters and single stray letters,
    expand the fi/ffi ligatures, normalise leftover punctuation and
    whitespace, drop a few discourse words and replace "We"/"we" with
    "Authors"/"authors".

    NOTE(review): several alternatives are unanchored - e.g. (Jan), (dx),
    (kk), (CoRR), (What) - so they also remove those letters inside longer
    words. Kept as-is because downstream results were produced with them.

    :param raw_text: text of one document (str)
    :return: the cleaned text (str)
    '''
    text = raw_text

    # re-join words split by a hyphen at a line break, e.g. post-\nediting
    text = text.replace('-\n', '')
    # remove lone new lines in the middle of the sentence - leave only the new lines after .(dot)
    one_new_line = r'(?<![\.\n])\n(?!\n)'
    text = re.sub(one_new_line, ' ', text)
    # encoded sequences will always generate sequences larger than the chunk size
    latexit = r'<latexit.*latexit> *'
    text = re.sub(latexit, ' ', text)
    # remove urls - not needed for KG
    text = re.sub(r'http\S+', '', text, flags=re.MULTILINE)
    # remove email addresses - not needed for KG
    text = re.sub(r'\S*@\S*\s?', '', text)
    # remove references (everything from the marker to the end of the line)
    text = re.sub(r'doi\:.*\n?', '\n', text, flags=re.MULTILINE)
    text = re.sub(r'abs\/.*\n?', '\n', text, flags=re.MULTILINE)
    text = re.sub(r'URL\:.*\n?', '\n', text, flags=re.MULTILINE)
    text = re.sub(r'url\:.*\n?', '\n', text, flags=re.MULTILINE)
    text = re.sub(r'arXiv\:.*\n?', '\n', text, flags=re.MULTILINE)
    # remove everything between parentheses, square brackets and braces
    text = re.sub(r'\(([^)]+)\)', '', text, flags=re.MULTILINE)
    text = re.sub(r'\[([^]]+)\]', '', text, flags=re.MULTILINE)
    text = re.sub(r'\{([^}]+)\}', '', text, flags=re.MULTILINE)
    # remove Figure captions:
    text = re.sub(r'(Figure [0-9]*:*)', '', text)
    # remove Table captions:
    text = re.sub(r'(Table [0-9]*:*)', '', text)
    # remove numbers
    numbers = r'[0-9]'
    text = re.sub(numbers, '', text)
    # list markers (lower case, then upper case)
    lists = r'(i\))|(ii\))|(iii\))|(xvii)|(xviii)'
    text = re.sub(lists, '', text)
    lists = r'(I\))|(II\))|(III\.)|(XVII)|(XVIII)'
    text = re.sub(lists, '', text)
    #[] () .% -, .+ at the beginning of the line TODO |( \. )
    group_of_characters = r'(\(\))|(\[\])|(\.%)|(, \.)|(–,)|(\-\-)|(, \:\.)'
    text = re.sub(group_of_characters, '', text)
    #( ) [ ] [,][, ]
    group_of_characters = r'(\( \))|(\[ \])|(\[,\])|(\[, \])'
    text = re.sub(group_of_characters, '', text)
    # leading dots / commas; no MULTILINE flag, so only the very start of the text
    group_of_characters = r'^(\.+\s)|^(,+\s)|^\.'
    text = re.sub(group_of_characters, '', text)
    #stray abbreviations
    words = r'(vol\.)|(no\.)|(pp\.)|(Rec@)|(Fig\.)|(\b\.v\b)|(\sv\s)|(\sb\s)|(Aug\.)|(Jan)|(Nov\.)'
    text = re.sub(words, '', text)
    #stray commas TODO - ( \, )|
    group_of_characters_2 = r'(,(\s*)\n)'
    text = re.sub(group_of_characters_2, '', text)
    #stray characters
    special_characters = r'[–♣♥♠♦•�±𝛹𝐴−𝑦✓⟨⟩ℎ𝑥"𝜃…𝐷𝑋𝑣𝑥"𝑐𝒆"θδ∈×𝑟𝑡𝑨𝒗∗𝒙𝐶𝐿𝑆𝐵𝑀𝑆𝐾𝑙𝑖𝑘𝑒𝑠∀↑↓𝑑→†𝑎𝑔𝑛⊤√π♢♡µλ𝑚𝑝𝑃ε𝑧𝔄𝜋◦𝐻𝑇ℕ∅∑︁⊂∥′✗⇒¬∧▽𝑯𝑻↔ℭ↦𝟏𝐝𝑫]'
    text = re.sub(special_characters, '', text)
    #leave '-' as part of e.g. bert
    special_characters = r'[\&%=_\ˆ≥+|˜!#$<>—≤¯𝒊𝜽𝝋𝒓⊥∥Φ≠∞𝚺𝜻∼σβ·]'
    text = re.sub(special_characters, '', text)
    # CJK characters seen in the corpus. These are character classes, so they must
    # hold ONLY the non-Latin characters: the original classes also contained the
    # ASCII letters S, L, E, A, B, C, D and a space (copied along with the sample
    # sentences), which deleted every space and every one of those capitals.
    special_characters = r'[鹏城实验室鹏城实验室推出面向中文医疗文本处理的预训练模型阻塞性睡眠呼吸暂停下述哪一项不符合血液系统改变回答选项选项血小板减少选项自细胞减少푆去尿过푆]'
    text = re.sub(special_characters, '', text)
    special_characters = r'[选项自身免疫溶血贫血选项正色素细胞贫血选项类白血病样改变是一种自身免疫疾病其血液系统改变包括血小板减少自身免疫溶血贫血正色素细胞贫血等而类白血病样改变是指骨髓现大量幼稚细胞与无关因此选项不符合血液系统改变]'
    text = re.sub(special_characters, '', text)
    # expand the fi ligature
    special_characters = r'[\ufb01]'
    text = re.sub(special_characters, 'fi', text)
    # expand the ffi ligature
    special_characters = r'[\ufb03]'
    text = re.sub(special_characters, 'ffi', text)
    # lines consisting of dots only: keep the sentence-ending dot and the line break.
    # The replacement is a plain '.' + newline; the former '\.\n' was an invalid
    # string escape and wrote a literal backslash into the text.
    empty_line = r'\.\n(\.+)'
    text = re.sub(empty_line, '.\n', text)
    #paragraph names, e.g. A., B., C. - llms are picking them up as topics
    paragraph_names = r'\n(\s*)[a-zA-Z]\.'
    text = re.sub(paragraph_names, '\n', text)
    #try again to remove numbers
    numbers = r'[0-9]'
    text = re.sub(numbers, '', text)
    #stray letters (left-overs of formulas and variable names)
    group_of_characters_3 = r'(\bD\b)|( B )|( o )|(\bs\b)|( Xn )|(\bX\b)|(\by\b)|( m )|(\bi\b)|( c )|(\br\b)|(\bk\b)|(\bd\b)|(\bt\b)|(\bvt\b)|(\bxn\b)|(\bXn\b)|(\be\b)|(\bL\b)'
    text = re.sub(group_of_characters_3, '', text)
    group_of_characters_3 = r'( m )|( b )|(\bx\b)|(\bS\b)|( F )|( g )|(\bC\b)|( Z )|( z )|( Xu )|( R )|( \/ )|( w )|( U= )|( V )|( M )|(dx)|(\bxt\b)|(\bTn\b)|(\btn\b)'
    text = re.sub(group_of_characters_3, '', text)
    group_of_characters_3 = r'(\bT\b)|(\bP\b)|(\bMC\b)|(\b.-\b)|(\b-.\b)|(\bK\b)|(\bp\b)|(\bl\b)|(\b-.\b)|( Xu )|( R )|(\b\/\b)|( w )|( U= )|( V )|( M )|(dx)|(\bxt\b)|(\bTn\b)'
    text = re.sub(group_of_characters_3, '', text)
    group_of_characters_3 = r'(\bJ\.\b)|(\bM\.\b)|(\bA\.\b)|(\bH\b)|(\bv\b)|(\bE\b)|(\bth\b)|(\bexp\b)|(\bFv\b)|(\bFs\b)|(\bQ\b)|(\bxv\b)'
    text = re.sub(group_of_characters_3, '', text)
    #stray words
    words = r'(kk)|(kknum)|(pp\.)|(Rec@)|(Fig\.)|(\.v)|(\sv\s)|(\sb\s)|(Apr\.)|(Feb\.)|(Nov\.)|(Inf\.)|(CoRR)|(ACM\, \,)|(Vol\.)|(No\.)|(Surv\.)'
    text = re.sub(words, '', text)
    #again [] () {} .% -, .+
    group_of_characters = r'(\(\))|(\[\])|(\{\})|(\.%)|(, \.)|(–,)|(\-\-)|(, \:\.)|(\(\.\))|(\(\, \))'
    text = re.sub(group_of_characters, '', text)
    # again ( ) [ ] [,][, ] -
    group_of_characters = r'(\( \))|(\[ \])|(\[,\])|(\[, \])|(\/)|( \- )|(\- )|( \-)|( \-\,)|( \-\.)|(\.\- )'
    text = re.sub(group_of_characters, '', text)
    #final commas
    commas = r'(\s+\, )'
    text = re.sub(commas, ', ', text)
    commas = r'(\, \,)'
    text = re.sub(commas, ', ', text)
    # commas at the end of the line, backslash-escaped dots and runs of dots -> one dot
    commas = r'(\s*\,\s+\n)|(\s*\,\s*\.\s*\n)|(\\\.)|(\.\s*\.)|(\.)+|(\. )+'
    text = re.sub(commas, '.', text)
    # stray dots (\.){2}|
    commas = r'(\. ){2,25}'
    text = re.sub(commas, '.', text)
    commas = r'(\. ){2,25}'
    text = re.sub(commas, '.', text)
    commas = r'(\.){2,25}'
    text = re.sub(commas, '.', text)
    commas = r'(\-){2,25}' #--
    text = re.sub(commas, '', text)
    commas = r'(\.\s+\.)'  #.  .
    text = re.sub(commas, '.', text)
    commas = r'(\:\s+\:)'  #:  :
    text = re.sub(commas, ':', text)
    commas = r'(\,\s+\,)'  #,  ,
    text = re.sub(commas, ',', text)
    commas = r'(\s*\,{2,20}\s*)'  # ,,    ,,,    ,,
    text = re.sub(commas, ' ', text)
    commas = r'(\s+\;*\s*\:\s)'  #   ;  :
    text = re.sub(commas, ' ', text)
    commas = r'[^\S\r\n]{2,30}'  # 2+ spaces, no new line
    text = re.sub(commas, ' ', text)
    # try again - remove numbers
    numbers = r'[0-9]'
    text = re.sub(numbers, '', text)
    # Academic stopwords
    words = r'(Furthermore)|(Moreover)|(However)|(What)|(Overall)'
    text = re.sub(words, '', text)
    #arxiv coref hack: we shows up as topic; match only full words
    words = r'(\bWe\b)'
    text = re.sub(words, 'Authors', text)
    words = r'(\bwe\b)'
    text = re.sub(words, 'authors', text)

    return text

In [None]:
def write_text_file(filename, content):
    '''
    Store content as UTF-8 in the cleaned-text folder.

    The target path is TXT_BASE + filename (plain string concatenation);
    leading and trailing whitespace of the encoded content is stripped.
    '''
    destination = pathlib.Path(TXT_BASE+filename)
    payload = content.encode('utf-8').strip()
    destination.write_bytes(payload)

def write_json_file(filename, content):
    '''
    Store content as UTF-8 in the cleaned-JSON folder.

    The target path is JSON_BASE + filename (plain string concatenation);
    leading and trailing whitespace of the encoded content is stripped.
    '''
    destination = pathlib.Path(JSON_BASE+filename)
    payload = content.encode('utf-8').strip()
    destination.write_bytes(payload)

def save_cleaned_file(document):
    '''
    Write one cleaned document as <title>.json (the whole dict) and
    <title>.txt (its 'text' field only).

    Path separators inside the title are replaced by '_' before the file
    names are built: a title such as "A/B testing" would otherwise be read
    as a sub-directory and the write would fail or land outside the output
    folder. Titles without separators produce the same file names as before.

    :param document: dict with at least the keys 'title' and 'text'
    :raises KeyError: if 'title' or 'text' is missing
    '''
    safe_title = document['title']
    for separator in ('/', os.sep, os.altsep):
        # os.altsep is None on platforms with a single separator
        if separator:
            safe_title = safe_title.replace(separator, '_')
    json_object = json.dumps(document)
    write_json_file(safe_title + '.json', json_object)
    write_text_file(safe_title + '.txt', document['text'])

In [None]:
# Clean every raw document: read and decode the JSON, replace its 'text'
# field with the cleaned text, then write the .json/.txt pair.
for json_file in json_files:
    file_name = join(JSON_RAW_BASE, json_file)
    doc_string = read_text_file(file_name).decode(encoding = 'utf-8')
    document = json.loads(doc_string)
    text = document['text']
    document["text"] = cleanup_text(text)
    save_cleaned_file(document)
    