In [1]:
import json
import os
import pprint
from langdetect import detect, DetectorFactory
import nltk
import spacy
import re
from tqdm import tqdm
import numpy as np

data = json.load(open(os.path.join("data", "training_data.json")))

In [10]:
data[0]["data"]["text"][:1000]

" nº historia clinica: ** *** *** nºepisodi: ******** sexe: home data de naixement: 16.05.1936 edat: 82 anys procedencia cex mateix hosp servei urologia data d'ingres 24.07.2018 data d'alta 25.07.2018 08:54:04 ates per ***************, *****; ****************, ****** informe d'alta d'hospitalitzacio motiu d'ingres paciente que ingresa de forma programada para realizacion de uretrotomia interna . antecedents alergia a penicilina y cloramfenicol . no habitos toxicos. antecedentes medicos: bloqueo auriculoventricular de primer grado hipertension arterial. diverticulosis extensa insuficiencia renal cronica colelitiasis antecedentes quirurgicos: exeresis de lesiones cutaneas con anestesia local protesis total de cadera cordectomia herniorrafia inguinal proces actual varon de 81a que a raiz de episodio de hematuria macroscopica se realiza cistoscopia que es negativa para lesiones malignas pero se objetiva estenosis de uretra . se intentan dilataciones progresivas en el gabinete de urologia s

In [3]:
# python -m spacy download es_core_news_sm
# python -m spacy download ca_core_news_sm
nlp_es = spacy.load("es_core_news_sm")
nlp_ca = spacy.load("ca_core_news_sm")

In [4]:
DetectorFactory.seed = 0

def detect_lang(text: str):
	try:
		lang = detect(text)
		if lang not in ["es", "ca"]:
			return "es"
		return lang
	except:
		return "es"

In [5]:
def sent_tokenize(text: str):
	sents = nltk.sent_tokenize(text, language="spanish") # assuming for catalan will be similar
	text_cpy = text
	spans = []
	l = 0
	for sent in sents:
		pos = text_cpy.find(sent)
		span = (pos, pos + len(sent))
		span = (0, span[1]) # keep from start in case space was removed during tokenization
		sent = text_cpy[span[0]:span[1]]
		text_cpy = text_cpy[span[1]:]
		span = (l + span[0], l + span[1])
		spans.append({"text":sent, "span":span})
		l = span[1]
	if text_cpy:
		last_sent, last_span = spans[-1]["text"], spans[-1]["span"]
		last_span = (last_span[0], len(text))
		last_sent = last_sent + text_cpy
		spans[-1] = {"text":last_sent, "span":last_span}
	return spans

sents = []
for d in tqdm(data):
	text = d["data"]["text"]
	spans = sent_tokenize(text)
	for i, s in enumerate(spans):
		sent, span = s["text"], s["span"]
		assert sent == text[span[0]:span[1]]
		lang = detect_lang(sent)
		spans[i]["lang"] = lang
	sents.append(spans)

100%|██████████| 254/254 [01:08<00:00,  3.68it/s]


In [6]:
sents[0][1]

{'text': ' antecedents alergia a penicilina y cloramfenicol .',
 'span': (397, 448),
 'lang': 'ca'}

In [11]:
tokens = []
for d in tqdm(sents):
	d_tokens = []
	for sent in d:
		text = sent["text"]
		span = sent["span"]
		lang = sent["lang"]
		if lang == "es":
			doc = nlp_es.tokenizer(text)
		else:
			doc = nlp_ca.tokenizer(text)
		_tokens = []
		spans = []
		for token in doc:
			if token.text != "*": _tokens.append(token.text)
			spans.append([span[0] + token.idx, span[0] + token.idx + len(token.text)])
		sent_tokens = {"tokens": np.array(_tokens), "spans": np.array(spans), "lang": lang}
		d_tokens.append(sent_tokens)
	tokens.append(d_tokens)

100%|██████████| 254/254 [00:02<00:00, 90.58it/s] 


In [12]:
tokens[0]

[{'tokens': array([' ', 'nº', 'historia', 'clinica', ':', 'nºepisodi', ':', 'sexe',
         ':', 'home', 'data', 'de', 'naixement', ':', '16.05.1936', 'edat',
         ':', '82', 'anys', 'procedencia', 'cex', 'mateix', 'hosp',
         'servei', 'urologia', 'data', "d'", 'ingres', '24.07.2018', 'data',
         "d'", 'alta', '25.07.2018', '08:54:04', 'ates', 'per', ',', ';',
         ',', 'informe', "d'", 'alta', "d'", 'hospitalitzacio', 'motiu',
         "d'", 'ingres', 'paciente', 'que', 'ingresa', 'de', 'forma',
         'programada', 'para', 'realizacion', 'de', 'uretrotomia',
         'interna', '.'], dtype='<U15'),
  'spans': array([[  0,   1],
         [  1,   3],
         [  4,  12],
         [ 13,  20],
         [ 20,  21],
         [ 22,  23],
         [ 23,  24],
         [ 25,  26],
         [ 26,  27],
         [ 27,  28],
         [ 29,  30],
         [ 30,  31],
         [ 31,  32],
         [ 33,  42],
         [ 42,  43],
         [ 44,  45],
         [ 45,  46],
    

In [33]:
string = data[0]["data"]["text"]
doc = nlp_ca.tokenizer(string)
for token in doc:
	print(token.text, token.idx)

  0
nº 1
historia 4
clinica 13
: 20
* 22
* 23
* 25
* 26
* 27
* 29
* 30
* 31
nºepisodi 33
: 42
* 44
* 45
* 46
* 47
* 48
* 49
* 50
* 51
sexe 53
: 57
home 59
data 64
de 69
naixement 72
: 81
16.05.1936 83
edat 94
: 98
82 100
anys 103
procedencia 108
cex 120
mateix 124
hosp 131
servei 136
urologia 143
data 152
d' 157
ingres 159
24.07.2018 166
data 177
d' 182
alta 184
25.07.2018 189
08:54:04 200
ates 209
per 214
* 218
* 219
* 220
* 221
* 222
* 223
* 224
* 225
* 226
* 227
* 228
* 229
* 230
* 231
* 232
, 233
* 235
* 236
* 237
* 238
* 239
; 240
* 242
* 243
* 244
* 245
* 246
* 247
* 248
* 249
* 250
* 251
* 252
* 253
* 254
* 255
* 256
* 257
, 258
* 260
* 261
* 262
* 263
* 264
* 265
informe 267
d' 275
alta 277
d' 282
hospitalitzacio 284
motiu 300
d' 306
ingres 308
paciente 315
que 324
ingresa 328
de 336
forma 339
programada 345
para 356
realizacion 361
de 373
uretrotomia 376
interna 388
. 396
antecedents 398
alergia 410
a 418
penicilina 420
y 431
cloramfenicol 433
. 447
no 449
habitos 452
toxicos 