In [1]:
import json
import os
import pprint
from langdetect import detect, DetectorFactory
import nltk
import spacy
import re
from tqdm import tqdm
import numpy as np
from unidecode import unidecode

# Project root: the parent directory of the current working directory.
ROOT_DIR = os.path.dirname(os.path.abspath(""))

# Load the training documents. The context manager closes the file handle
# deterministically instead of leaving it open until garbage collection.
with open(os.path.join(ROOT_DIR, "data", "training_data.json")) as f:
	data = json.load(f)

In [2]:
# Preview the first 1000 characters of the first document's text.
data[0]["data"]["text"][:1000]

" nº historia clinica: ** *** *** nºepisodi: ******** sexe: home data de naixement: 16.05.1936 edat: 82 anys procedencia cex mateix hosp servei urologia data d'ingres 24.07.2018 data d'alta 25.07.2018 08:54:04 ates per ***************, *****; ****************, ****** informe d'alta d'hospitalitzacio motiu d'ingres paciente que ingresa de forma programada para realizacion de uretrotomia interna . antecedents alergia a penicilina y cloramfenicol . no habitos toxicos. antecedentes medicos: bloqueo auriculoventricular de primer grado hipertension arterial. diverticulosis extensa insuficiencia renal cronica colelitiasis antecedentes quirurgicos: exeresis de lesiones cutaneas con anestesia local protesis total de cadera cordectomia herniorrafia inguinal proces actual varon de 81a que a raiz de episodio de hematuria macroscopica se realiza cistoscopia que es negativa para lesiones malignas pero se objetiva estenosis de uretra . se intentan dilataciones progresivas en el gabinete de urologia s

In [3]:
# The spaCy models must be installed beforehand, e.g. from a shell:
#   python -m spacy download es_core_news_sm
#   python -m spacy download ca_core_news_sm
# Load one pipeline per language handled in this notebook ("es" and "ca").
nlp_es = spacy.load("es_core_news_sm")
nlp_ca = spacy.load("ca_core_news_sm")

In [4]:
# Fix langdetect's seed before the first detection; per the langdetect
# documentation, results are otherwise not guaranteed to be repeatable.
DetectorFactory.seed = 0

def detect_lang(text: str) -> str:
	"""Return the language code of ``text``, restricted to ``"es"`` or ``"ca"``.

	Args:
		text: The text to classify (in this notebook, a single sentence).

	Returns:
		``"es"`` or ``"ca"``. Any other detected language, and any failure of
		the detector, falls back to ``"es"``.
	"""
	try:
		lang = detect(text)
	except Exception:
		# Best-effort fallback for undetectable text. ``Exception`` rather than
		# a bare ``except`` so KeyboardInterrupt/SystemExit still propagate and
		# a long-running loop over sentences can actually be interrupted.
		return "es"
	return lang if lang in ("es", "ca") else "es"

In [5]:
def sent_tokenize(text: str):
	"""Split ``text`` into sentences whose spans tile the whole text.

	Sentences come from ``nltk.sent_tokenize`` with the Spanish model (assumed
	to work similarly for Catalan). Each sentence is then located in ``text``
	and extended backwards to the end of the previous sentence, so characters
	dropped between sentences (e.g. whitespace) are kept. Text left after the
	last sentence is appended to that last sentence.

	Args:
		text: Full document text.

	Returns:
		A list of ``{"text": str, "span": (start, end)}`` dicts where
		``text[start:end] == item["text"]`` and consecutive spans are
		contiguous. Empty input yields ``[]``; non-empty input for which no
		sentence could be located yields a single item covering all of
		``text``.
	"""
	sents = nltk.sent_tokenize(text, language="spanish") # assuming for catalan will be similar
	spans = []
	start = 0  # end of the previous sentence == start of the next span
	for sent in sents:
		# Search from the current offset instead of re-slicing the remaining
		# text on every iteration (avoids quadratic copying).
		pos = text.find(sent, start)
		if pos == -1:
			# Defensive: the sentence is not a verbatim substring of the
			# remaining text. Skip it; its characters are absorbed by the
			# next sentence (or the trailing remainder) instead of
			# producing a corrupt span.
			continue
		end = pos + len(sent)
		# keep from `start` in case space was removed during tokenization
		spans.append({"text": text[start:end], "span": (start, end)})
		start = end
	if start < len(text):
		if spans:
			# Attach the trailing remainder to the last sentence.
			last_start = spans[-1]["span"][0]
			spans[-1] = {"text": text[last_start:], "span": (last_start, len(text))}
		else:
			# No sentence was found (e.g. whitespace-only text): keep the
			# whole text as one span rather than failing on spans[-1].
			spans.append({"text": text, "span": (0, len(text))})
	return spans

# Split every document into sentences and tag each sentence with its language.
sents = []
for d in tqdm(data):
	text = d["data"]["text"]
	spans = sent_tokenize(text)
	for s in spans:
		start, end = s["span"]
		# Each recorded span must reproduce its sentence from the source text.
		assert s["text"] == text[start:end]
		s["lang"] = detect_lang(s["text"])
	sents.append(spans)

100%|██████████| 254/254 [01:18<00:00,  3.25it/s]


In [6]:
# Tokenization options.
# remove_asterisk: drop "*" tokens entirely; when False, each run of "*"
#   tokens is collapsed into a single "<HIDDEN>" token.
# remove_punctuation: strip punctuation (and the ordinal marks º/ª) from tokens.
# remove_spaces: strip surrounding whitespace from tokens.
# replace_numbers: placeholder substituted for numeric tokens (e.g. "<NUM>");
#   a falsy value keeps numbers (minus punctuation if remove_punctuation).
remove_asterisk = True
remove_punctuation = True
remove_spaces = True
replace_numbers = None#"<NUM>"

# Patterns compiled once instead of on every token.
# number_re matches the same strings as r'(\d+[.,:/-]?)+' (digit groups, each
# optionally followed by one separator) but without the nested quantifier,
# which could backtrack exponentially on long digit runs that fail to match.
number_re = re.compile(r'\d+(?:[.,:/-]\d+)*[.,:/-]?')
punct_re = re.compile(r'[^\w\s]|[ºª]')

# tokens[doc][sentence] -> {"tokens": array of str, "spans": (n, 2) array of
# [start, end) character offsets into the document text, "lang": str}
tokens = []
for d in tqdm(sents):
	d_tokens = []
	for sent in d:
		text = sent["text"]
		span = sent["span"]
		lang = sent["lang"]
		if lang == "es":
			doc = nlp_es.tokenizer(text)
		else:
			doc = nlp_ca.tokenizer(text)
		_tokens = []
		spans = []
		# State of the current run of "*" tokens (offsets relative to the sentence).
		asterisk_len = 0
		asterisk_pos = 0
		asterisk_end = 0
		for token in doc:
			word = token.text
			# handle asterisks
			if word == "*":
				if remove_asterisk: continue
				if asterisk_len == 0:
					asterisk_pos = token.idx
				asterisk_len += 1
				# Track the real end offset: a run may contain spaces between
				# asterisks, so start + count would fall short of its extent.
				asterisk_end = token.idx + len(token.text)
				continue
			if asterisk_len > 0:
				# First non-asterisk token after a run: emit the placeholder.
				_tokens.append("<HIDDEN>")
				spans.append([span[0] + asterisk_pos, span[0] + asterisk_end])
				asterisk_len = 0
			# handle numbers (option checked first so the regex is skipped when disabled)
			if replace_numbers and number_re.fullmatch(word):
				word = replace_numbers
			# remove punctuation
			elif remove_punctuation:
				word = punct_re.sub('', word)
			# remove spaces
			if remove_spaces:
				word = word.strip()
			# unidecode in case of weird characters
			word = unidecode(word)
			# append token (tokens emptied by the cleaning above are dropped)
			if word:
				_tokens.append(word)
				spans.append([span[0] + token.idx, span[0] + token.idx + len(token.text)])
		# Flush a run of asterisks that reaches the end of the sentence;
		# without this the trailing "<HIDDEN>" token would be lost.
		if asterisk_len > 0:
			_tokens.append("<HIDDEN>")
			spans.append([span[0] + asterisk_pos, span[0] + asterisk_end])
		sent_tokens = {
			"tokens": np.array(_tokens, dtype=str),
			# reshape keeps a consistent (n, 2) shape even when the sentence has no tokens
			"spans": np.array(spans, dtype=int).reshape(-1, 2),
			"lang": lang,
		}
		d_tokens.append(sent_tokens)
	tokens.append(d_tokens)

100%|██████████| 254/254 [00:03<00:00, 64.35it/s]


In [7]:
# Inspect the first sentence record (text, span, lang) of the first document.
sents[0][0]

{'text': " nº historia clinica: ** *** *** nºepisodi: ******** sexe: home data de naixement: 16.05.1936 edat: 82 anys procedencia cex mateix hosp servei urologia data d'ingres 24.07.2018 data d'alta 25.07.2018 08:54:04 ates per ***************, *****; ****************, ****** informe d'alta d'hospitalitzacio motiu d'ingres paciente que ingresa de forma programada para realizacion de uretrotomia interna .",
 'span': (0, 397),
 'lang': 'ca'}

In [8]:
# Inspect the tokenized sentences of the first document.
tokens[0]

[{'tokens': array(['n', 'historia', 'clinica', 'nepisodi', 'sexe', 'home', 'data',
         'de', 'naixement', '16051936', 'edat', '82', 'anys', 'procedencia',
         'cex', 'mateix', 'hosp', 'servei', 'urologia', 'data', 'd',
         'ingres', '24072018', 'data', 'd', 'alta', '25072018', '085404',
         'ates', 'per', 'informe', 'd', 'alta', 'd', 'hospitalitzacio',
         'motiu', 'd', 'ingres', 'paciente', 'que', 'ingresa', 'de',
         'forma', 'programada', 'para', 'realizacion', 'de', 'uretrotomia',
         'interna'], dtype='<U15'),
  'spans': array([[  1,   3],
         [  4,  12],
         [ 13,  20],
         [ 33,  42],
         [ 53,  57],
         [ 59,  63],
         [ 64,  68],
         [ 69,  71],
         [ 72,  81],
         [ 83,  93],
         [ 94,  98],
         [100, 102],
         [103, 107],
         [108, 119],
         [120, 123],
         [124, 130],
         [131, 135],
         [136, 142],
         [143, 151],
         [152, 156],
         [157, 

In [9]:
from spacy.tokenizer import Tokenizer
from copy import deepcopy

def custom_tokenizer(nlp):
    """Build a spaCy ``Tokenizer`` over ``nlp.vocab`` for pre-tokenized text.

    The only rule supplied is a ``token_match`` that accepts any run of
    non-whitespace characters; no prefix, suffix or infix rules are given.
    """
    non_space_run = re.compile(r'\S+')
    return Tokenizer(nlp.vocab, token_match=non_space_run.match)

print("Preparing tokenizers...")
# Work on copies so nlp_es / nlp_ca keep their own tokenizers, then install
# the tokenizer built by custom_tokenizer on each copy.
nlp_es_custom, nlp_ca_custom = deepcopy(nlp_es), deepcopy(nlp_ca)
nlp_es_custom.tokenizer = custom_tokenizer(nlp_es_custom)
nlp_ca_custom.tokenizer = custom_tokenizer(nlp_ca_custom)

Preparing tokenizers...


In [10]:
# Lemmatize every sentence with the custom-tokenizer pipeline of its language.
tokens_lemmatized = []
for d in tqdm(tokens):
	d_tokens = []
	for sent in d:
		lang = sent["lang"]
		pipeline = nlp_es_custom if lang == "es" else nlp_ca_custom
		# lemmatization works better with the whole sentence, so the cleaned
		# tokens are re-joined with single spaces and processed together
		doc = pipeline(" ".join(sent["tokens"]))
		lemmas = [token.lemma_ for token in doc]
		# one lemma per input token, otherwise the spans would no longer line up
		assert len(lemmas) == len(sent["tokens"]), f"\n{lemmas}\n{sent['tokens']}"
		d_tokens.append({"tokens": np.array(lemmas), "spans": sent["spans"], "lang": lang})
	tokens_lemmatized.append(d_tokens)

100%|██████████| 254/254 [01:04<00:00,  3.95it/s]


In [11]:
# Inspect the lemmatized sentences of the first document.
tokens_lemmatized[0]

[{'tokens': array(['n', 'historia', 'clinic', 'nepisodi', 'sexe', 'home', 'datar',
         'de', 'naixement', '16051936', 'edat', '82', 'any', 'procedencia',
         'cex', 'mateix', 'hosp', 'servei', 'urologiar', 'data', 'de',
         'ingres', '24072018', 'data', 'de', 'alt', '25072018', '085404',
         'at', 'per', 'informe', 'de', 'alta', 'de', 'hospitalitzacio',
         'motiu', 'de', 'ingres', 'paciente', 'que', 'ingresar', 'de',
         'forma', 'programat', 'parar', 'realizacion', 'de', 'uretrotomia',
         'intern'], dtype='<U15'),
  'spans': array([[  1,   3],
         [  4,  12],
         [ 13,  20],
         [ 33,  42],
         [ 53,  57],
         [ 59,  63],
         [ 64,  68],
         [ 69,  71],
         [ 72,  81],
         [ 83,  93],
         [ 94,  98],
         [100, 102],
         [103, 107],
         [108, 119],
         [120, 123],
         [124, 130],
         [131, 135],
         [136, 142],
         [143, 151],
         [152, 156],
         [157

In [12]:
def save_tokens(tokens, path):
	"""Write tokenized sentences to ``path`` as a JSON list.

	Args:
		tokens: List of documents, each a list of sentence dicts holding
			array-like ``"tokens"`` and ``"spans"`` (anything with
			``.tolist()``) and a ``"lang"`` value.
		path: Destination file; overwritten if it exists.

	The per-document nesting is flattened: the file contains one record per
	sentence, in document order, with the arrays converted to plain lists.
	"""
	records = []
	for document in tokens:
		for sentence in document:
			records.append({
				"tokens": sentence["tokens"].tolist(),
				"spans": sentence["spans"].tolist(),
				"lang": sentence["lang"],
			})
	with open(path, 'w') as out_file:
		json.dump(records, out_file)

# Persist the lemmatized tokens under <ROOT_DIR>/data/.
save_tokens(tokens_lemmatized, os.path.join(ROOT_DIR, "data", "training_data_tokens.json"))

In [13]:
def load_tokens(path):
	"""Read sentence records from the JSON file at ``path``.

	Args:
		path: JSON file containing a list of dicts with ``"tokens"``,
			``"spans"`` and ``"lang"`` keys.

	Returns:
		A list with one dict per record, where ``"tokens"`` and ``"spans"``
		are converted to numpy arrays and ``"lang"`` is passed through.
	"""
	with open(path, 'r') as in_file:
		records = json.load(in_file)
	loaded = []
	for record in records:
		loaded.append({
			"tokens": np.array(record["tokens"]),
			"spans": np.array(record["spans"]),
			"lang": record["lang"],
		})
	return loaded

# Reload the tokens from disk.
# NOTE(review): this rebinds tokens_lemmatized to the flat, per-sentence list
# returned by load_tokens; save_tokens does not store the per-document nesting
# that the in-memory tokens_lemmatized had before this line.
tokens_lemmatized = load_tokens(os.path.join(ROOT_DIR, "data", "training_data_tokens.json"))