In [1]:
import os
import sys

import re
import json
import pathlib
import logging

# Send log records of level INFO and above to stdout instead of the default stderr.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [2]:
# Name of the corpus to clean; it selects the corpora/<CORPUS>.properties file read below.
CORPUS = 'ArxivHealthcareNLP'
# Alternative corpus; swap the assignment to process it instead.
#CORPUS = 'arxiv_cl'

In [3]:
def load_properties(filepath, sep='=', comment_char='#'):
    '''
    Parse a simple "key<sep>value" properties file into a dict of strings.

    filepath:     path of the file to read (opened in text mode).
    sep:          separator between key and value; only the first occurrence
                  on a line splits, later ones stay part of the value.
    comment_char: lines starting with this prefix (after stripping) are skipped.

    Blank lines are skipped. Keys and values are stripped of surrounding
    whitespace, and values additionally lose surrounding double quotes.
    A line without the separator yields the whole line as key with an
    empty value. Later duplicates of a key overwrite earlier ones.
    '''
    props = {}
    with open(filepath, "rt") as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            if not entry or entry.startswith(comment_char):
                continue
            # partition() cuts at the first separator only, so the value
            # keeps any further separators it contains.
            key, _, remainder = entry.partition(sep)
            props[key.strip()] = remainder.strip().strip('"')
    return props

# Load the settings of the selected corpus from corpora/<CORPUS>.properties.
corpus_properties = load_properties(f"corpora/{CORPUS}.properties")
# Bare expression: displays the parsed dict as the cell output.
corpus_properties

{'account': '@ArxivHealthcareNLP@sigmoid.social',
 'latest': '110779489140780299',
 'corpus_base': '/home/arylwen/datasets/documents/ArxivHealthcareNLP'}

In [4]:
# Derive the raw-input and cleaned-output locations from the corpus base directory.
CORPUS_BASE = corpus_properties['corpus_base']
JSON_RAW_BASE = f'{CORPUS_BASE}/json_raw/'
TXT_BASE = f'{CORPUS_BASE}/text_cleaned/'
JSON_BASE = f'{CORPUS_BASE}/json_cleaned/'

# Create whichever output directory is missing (cleaned JSON first, then cleaned text).
for _output_dir in (JSON_BASE, TXT_BASE):
    if os.path.exists(_output_dir):
        continue
    print(f'{_output_dir} does not exist. Creating.')
    os.makedirs(_output_dir)

In [5]:
from os import listdir
from os.path import isfile, join
# Names (not full paths) of the regular files directly under JSON_RAW_BASE.
# listdir() returns entries in arbitrary order, so sort them to make the
# processing order and the logged output reproducible between runs.
json_files = sorted(f for f in listdir(JSON_RAW_BASE) if isfile(join(JSON_RAW_BASE, f)))
len(json_files)

192

In [6]:
def read_text_file(filename):
    '''
    Return the full content of filename as bytes (no decoding is done here).

    Side effect: prints the number of bytes read.
    '''
    with open(filename, 'rb') as handle:
        raw_bytes = handle.read()
    print(len(raw_bytes))
    return raw_bytes

In [7]:
def cleanup_text(raw_text):
    '''
    Strip extraction noise from the text of a paper and return the cleaned string.

    The passes run in a fixed order and later passes work on what earlier ones
    left behind, so keep the order when editing. In sequence the function:
    joins words hyphenated across a line break and folds single line breaks
    into spaces; drops latexit blobs, URLs, e-mail addresses and reference
    lines; removes bracketed spans, digits, list markers, stray abbreviations,
    symbols and single letters; expands the fi/ffi ligatures; collapses
    leftover punctuation runs; and finally rewrites the whole words
    "We"/"we" as "Authors"/"authors".

    raw_text: the text to clean (str).
    Returns the cleaned text (str).
    '''
    text = raw_text

    # convert split words on line break, e.g. post-\nediting
    text = text.replace('-\n', '')
    # remove lone new lines in the middle of the sentence - leave only the new lines after .(dot)
    one_new_line = r'(?<![\.\n])\n(?!\n)'
    text = re.sub(one_new_line, ' ', text)
    # encoded sequences will always generate sequences larger than the chunk size
    latexit = r'<latexit.*latexit> *'
    text = re.sub(latexit, ' ', text)
    # remove urls - not needed for KG
    text = re.sub(r'http\S+', '', text, flags=re.MULTILINE)
    # remove email addresses - not needed for KG
    text = re.sub(r'\S*@\S*\s?', '', text)
    # remove references: everything from the marker to the end of its line
    text = re.sub(r'doi\:.*\n?', '\n', text, flags=re.MULTILINE)
    text = re.sub(r'abs\/.*\n?', '\n', text, flags=re.MULTILINE)
    text = re.sub(r'URL\:.*\n?', '\n', text, flags=re.MULTILINE)
    text = re.sub(r'url\:.*\n?', '\n', text, flags=re.MULTILINE)
    text = re.sub(r'arXiv\:.*\n?', '\n', text, flags=re.MULTILINE)
    # remove everything between parentheses, square brackets and curly braces
    text = re.sub(r'\(([^)]+)\)', '', text, flags=re.MULTILINE)
    text = re.sub(r'\[([^]]+)\]', '', text, flags=re.MULTILINE)
    text = re.sub(r'\{([^}]+)\}', '', text, flags=re.MULTILINE)
    # remove numbers
    numbers = r'[0-9]'
    text = re.sub(numbers, '', text)
    # list markers become a full stop
    lists = r'(i\))|(ii\))|(iii\))|(xvii)|(xviii)'
    text = re.sub(lists, '.', text)
    #[] () .% -, .+ at the beginning of the line TODO |( \. )
    group_of_characters = r'(\(\))|(\[\])|(\.%)|(, \.)|(–,)|(\-\-)|(, \:\.)'
    text = re.sub(group_of_characters, '', text)
    #( ) [ ] [,][, ]
    group_of_characters = r'(\( \))|(\[ \])|(\[,\])|(\[, \])'
    text = re.sub(group_of_characters, '', text)
    # NOTE: re.MULTILINE is not passed, so ^ anchors only at the very start of the text.
    group_of_characters = r'^(\.+\s)|^(,+\s)|^\.'
    text = re.sub(group_of_characters, '', text)
    #stray abbreviations
    words = r'(vol\.)|(no\.)|(pp\.)|(Rec@)|(Fig\.)|(\.v)|(\sv\s)|(\sb\s)|(Aug\.)|(Jan)|(Nov\.)'
    text = re.sub(words, '', text)
    #stray commas TODO - ( \, )|
    group_of_characters_2 = r'(,(\s*)\n)'
    text = re.sub(group_of_characters_2, '', text)
    #stray characters
    special_characters = r'[–♣♥♠♦•�±𝛹𝐴−𝑦✓⟨⟩ℎ𝑥"𝜃…𝐷𝑋𝑣𝑥"𝑐𝒆"θδ∈×𝑟𝑡𝑨𝒗∗𝒙𝐶𝐿𝑆𝐵𝑀𝑆𝐾𝑙𝑖𝑘𝑒𝑠∀↑↓𝑑→†𝑎𝑔𝑛⊤√π♢♡µλ′]'
    text = re.sub(special_characters, '', text)
    #leave '-' as part of e.g. bert
    special_characters = r'[\&%=_\ˆ≥+|˜!#$<>—]'
    text = re.sub(special_characters, '', text)
    special_characters = r'[鹏城实验室鹏城实验室推出面向中文医疗文本处理的预训练模型]'
    text = re.sub(special_characters, '', text)
    # expand the "fi" ligature (U+FB01)
    special_characters = r'[\ufb01]'
    text = re.sub(special_characters, 'fi', text)
    # expand the "ffi" ligature (U+FB03)
    special_characters = r'[\ufb03]'
    text = re.sub(special_characters, 'ffi', text)
    # lines made only of dots after a sentence end: keep just the ".\n".
    # The replacement is a plain string, not a pattern, so the dot must not be
    # escaped: an escaped dot would put a literal backslash into the text and
    # hide the preceding word from the abbreviation passes further down.
    empty_line = r'\.\n(\.+)'
    text = re.sub(empty_line, '.\n', text)
    #paragraph names, e.g. A., B., C. - llms are picking them up as topics
    paragraph_names = r'\n(\s*)[a-zA-Z]\.'
    text = re.sub(paragraph_names, '\n', text)
    #try again to remove numbers
    numbers = r'[0-9]'
    text = re.sub(numbers, '', text)
    #stray letters
    group_of_characters_3 = r'(\bD\b)|( B )|( o )|(\bs\b)|( Xn )|( X )|( y )|( m )|( i )|( c )|( r )|(\bk\b)|( d )|( t )|( vt )|( - )|(\bxn\b)|(\bXn\b)'
    text = re.sub(group_of_characters_3, '', text)
    group_of_characters_3 = r'( m )|( b )|( x )|( S )|( F )|( g )|(\bC\b)|( Z )|( z )|( Xu )|( R )|( \/ )|( w )|( U= )|( V )|( M )|(dx)|(\bxt\b)|(\bTn\b)'
    text = re.sub(group_of_characters_3, '', text)
    group_of_characters_3 = r'(\bT\b)|(\bP\b)|(\bMC\b)|(\b.-\b)|(\b-.\b)|(\bK\b)|(\bp\b)|(\bl\b)|(\b-.\b)|( Xu )|( R )|( \/ )|( w )|( U= )|( V )|( M )|(dx)|(\bxt\b)|(\bTn\b)'
    text = re.sub(group_of_characters_3, '', text)
    #stray words
    # "kknum" is listed before "kk": alternation takes the first alternative
    # that matches, so the longer token has to come first to be removed whole.
    words = r'(kknum)|(kk)|(pp\.)|(Rec@)|(Fig\.)|(\.v)|(\sv\s)|(\sb\s)|(Apr\.)|(Feb\.)|(Nov\.)|(Inf\.)|(CoRR)|(ACM\, \,)|(\bth\b)|(\bexp\b)'
    text = re.sub(words, '', text)
    #again [] () .% -, .+
    group_of_characters = r'(\(\))|(\[\])|(\.%)|(, \.)|(–,)|(\-\-)|(, \:\.)|(\(\.\))|(\(\, \))'
    text = re.sub(group_of_characters, '', text)
    # again ( ) [ ] [,][, ]
    group_of_characters = r'(\( \))|(\[ \])|(\[,\])|(\[, \])|(\/)'
    text = re.sub(group_of_characters, '', text)
    #final commas
    commas = r'(\s+\, )'
    text = re.sub(commas, ', ', text)
    commas = r'(\, \,)'
    text = re.sub(commas, ', ', text)
    # commas at the end of the line, literal "\." and dot runs all become a single dot
    # NOTE: the last alternative, (\. )+, can never win because (\.)+ matches first.
    commas = r'(\s*\,\s+\n)|(\s*\,\s*\.\s*\n)|(\\\.)|(\.\s*\.)|(\.)+|(\. )+'
    text = re.sub(commas, '.', text)
    # stray dots (\.){2}|
    commas = r'(\. ){2,25}'
    text = re.sub(commas, '.', text)
    commas = r'(\. ){2,25}'
    text = re.sub(commas, '.', text)
    commas = r'(\.){2,25}'
    text = re.sub(commas, '.', text)
    commas = r'(\-){2,25}' #--
    text = re.sub(commas, '', text)
    commas = r'(\.\s+\.)'  #.  .
    text = re.sub(commas, '.', text)
    commas = r'(\:\s+\:)'  #:  :
    text = re.sub(commas, ':', text)
    commas = r'(\,\s+\,)'  #,  ,
    text = re.sub(commas, ',', text)
    #arxiv coref hack: we shows up as topic; match only full words
    words = r'(\bWe\b)'
    text = re.sub(words, 'Authors', text)
    words = r'(\bwe\b)'
    text = re.sub(words, 'authors', text)

    return text

In [8]:
def write_text_file(filename, content):
    '''
    Write content to the file TXT_BASE + filename, replacing any existing file.

    The string is encoded as UTF-8 and leading/trailing whitespace bytes are
    stripped before writing.
    '''
    target = pathlib.Path(TXT_BASE + filename)
    payload = content.encode('utf-8').strip()
    target.write_bytes(payload)

def write_json_file(filename, content):
    '''
    Write content to the file JSON_BASE + filename, replacing any existing file.

    The string is encoded as UTF-8 and leading/trailing whitespace bytes are
    stripped before writing.
    '''
    target = pathlib.Path(JSON_BASE + filename)
    payload = content.encode('utf-8').strip()
    target.write_bytes(payload)

def save_cleaned_file(document):
    '''
    Persist a cleaned document as <title>.json and <title>.txt.

    document: dict with at least the keys 'title' and 'text'. The whole dict
              is serialised to the JSON file; only document['text'] goes into
              the text file.

    The title is used as the file stem. Path separators in it are replaced
    with '_' so that a title such as "A/B testing" cannot point into a
    missing sub-directory (the write would fail) or outside the output
    directory. Titles without separators produce the same file names as before.
    '''
    stem = document['title']
    # os.altsep is None on platforms with a single separator, hence the guard.
    for separator in (os.sep, os.altsep):
        if separator:
            stem = stem.replace(separator, '_')
    json_object = json.dumps(document)
    write_json_file(stem + '.json', json_object)
    write_text_file(stem + '.txt', document['text'])

In [9]:
# Clean every raw JSON document and save its cleaned JSON and plain-text versions.
for json_file in json_files:
    file_name = join(JSON_RAW_BASE, json_file)
    # read_text_file returns raw bytes (and prints their count); decode before parsing.
    doc_string = read_text_file(file_name) 
    doc_string = doc_string.decode(encoding = 'utf-8')
    document = json.loads(doc_string)
    text = document['text']
    # Only the 'text' field is replaced; the other fields are saved as loaded.
    document["text"] = cleanup_text(text)
    save_cleaned_file(document)
    

34909
54020
141310
57162
28012
44971
32224
27429
39531
74486
58611
54635
33722
36999
179195
36129
53921
38915
33111
28462
24232
81400
45224
87749
47722
36104
19263
28344
70523
62807
43523
38154
15071
50158
49055
31708
117644
46334
42091
50185
24545
78960
50583
226236
173367
48606
40946
147326
83818
23457
51984
10804
50796
100788
126284
50113
51129
43610
47728
81996
45188
31702
58584
45734
52798
48871
49255
79165
65161
66011
59802
43639
28110
31930
41853
63537
34605
57219
74222
29838
54517
84892
71537
48308
30497
102249
26717
34953
29955
40684
39218
43188
83008
50621
75610
46450
73415
59104
31564
24949
42876
41006
57458
44448
39418
23790
67694
37561
134125
47176
65475
51152
35618
42259
21343
46123
60276
44632
61152
12361
45178
37010
71922
49071
27696
51811
61916
18850
48761
110874
52836
40678
50008
94631
111741
49937
53721
24805
28330
69587
25688
29591
21479
57124
40258
90651
47292
31692
28370
57075
92147
98252
44032
29595
93652
39640
47974
43306
82628
27829
42477
46782
56322
73292
5082