In [None]:
# Install medcat
! pip install medcat==1.3.0
# Get the spacy model
! python -m spacy download en_core_web_md
try:
    from medcat.cat import CAT
except:
    print("WARNING: Runtime will restart automatically and please run other cells thereafter.")
    exit()

**Restart the runtime if on colab, sometimes necessary after installing models**

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import json 

from matplotlib import pyplot as plt
from medcat.cat import CAT
from medcat.cdb import CDB
from medcat.config import Config
from medcat.vocab import Vocab
from medcat.meta_cat import MetaCAT
from medcat.config_meta_cat import ConfigMetaCAT
from medcat.preprocessing.tokenizers import TokenizerWrapperBPE, TokenizerWrapperBERT
from tokenizers import ByteLevelBPETokenizer


  from tqdm.autonotebook import tqdm, trange


In [21]:
import glob
import zipfile

In [12]:
# Folder that holds every file this notebook downloads or reads.
DATA_DIR = "./data/data_p4.2"
#! DATA_DIR="./data_p4.2/"

# Paths of the vocabulary and concept database files inside DATA_DIR.
vocab_path = f"{DATA_DIR}/vocab.dat"
cdb_path = f"{DATA_DIR}/cdb-medmen-v1_2.dat"

In [3]:
# Download the models and required data.
# `$DATA_DIR` is expanded by IPython from the Python variable of the same name;
# `-P` tells wget which directory to save into and `-N` turns on time-stamping.
# First file: the MedCATtrainer export (JSON) used for training below.
!wget -N https://raw.githubusercontent.com/CogStack/MedCATtutorials/main/notebooks/introductory/data/MedCAT_Export.json -P $DATA_DIR
# Second file: the zipped "Status" meta-annotation model.
# You can also use the models created in Part 4.1 of the Tutorial
!wget -N https://medcat.rosalind.kcl.ac.uk/media/mc_status.zip -P $DATA_DIR

--2023-03-06 15:46:03--  https://raw.githubusercontent.com/CogStack/MedCATtutorials/main/notebooks/introductory/data/MedCAT_Export.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 272538 (266K) [text/plain]
Saving to: './data/data_p4.2/MedCAT_Export.json'

     0K .......... .......... .......... .......... .......... 18%  572K 0s
    50K .......... .......... .......... .......... .......... 37% 2.15M 0s
   100K .......... .......... .......... .......... .......... 56% 1013K 0s
   150K .......... .......... .......... .......... .......... 75% 2.44M 0s
   200K .......... .......... .......... .......... .......... 93% 1.19M 0s
   250K .......... ......                                     100% 1.51M=0.2s

Last-modified header missing -- time-stamps turn

In [4]:
# Get MedCAT models components (Alternatively you can use a previously created MedCAT model packs)
# These are the two files that `vocab_path` and `cdb_path` point at; both are
# saved into DATA_DIR (`-P`) with wget time-stamping enabled (`-N`).
!wget -N https://medcat.rosalind.kcl.ac.uk/media/vocab.dat -P $DATA_DIR
!wget -N https://medcat.rosalind.kcl.ac.uk/media/cdb-medmen-v1_2.dat -P $DATA_DIR

--2023-03-06 15:52:51--  https://medcat.rosalind.kcl.ac.uk/media/vocab.dat
Resolving medcat.rosalind.kcl.ac.uk (medcat.rosalind.kcl.ac.uk)... 193.61.202.225
Connecting to medcat.rosalind.kcl.ac.uk (medcat.rosalind.kcl.ac.uk)|193.61.202.225|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 274445907 (262M) [application/octet-stream]
Saving to: './data/data_p4.2/vocab.dat'

     0K .......... .......... .......... .......... ..........  0% 57.7K 77m23s
    50K .......... .......... .......... .......... ..........  0% 46.0K 87m12s
   100K .......... .......... .......... .......... ..........  0% 44.0K 91m55s
   150K .......... .......... .......... .......... ..........  0% 38.3K 98m5s
   200K .......... .......... .......... .......... ..........  0% 50.4K 96m10s
   250K .......... .......... .......... .......... ..........  0% 76.5K 89m51s
   300K .......... .......... .......... .......... ..........  0% 68.4K 86m19s
   350K .......... .......... .......... .

## MedCATtrainer Export

In [6]:
#@title
# Load the MedCATtrainer export that was downloaded into DATA_DIR.
# `open()` must be given the path of the file to read (calling it without
# arguments raises TypeError); the `with` block closes the file handle as soon
# as the JSON has been parsed.
with open(DATA_DIR + "/MedCAT_Export.json") as export_file:
    data = json.load(export_file)

In [7]:
#@title
# Top-level keys of the loaded export.
print(data.keys())

dict_keys(['projects'])


In [8]:
#@title
# Keys available on the first project of the export.
data['projects'][0].keys()

dict_keys(['name', 'id', 'cuis', 'tuis', 'documents'])

In [9]:
#@title
# Keys available on the first document of the first project.
data['projects'][0]['documents'][0].keys()

dict_keys(['id', 'name', 'text', 'last_modified', 'annotations'])

In [10]:
#@title
# Keys available on the first annotation of that document.
data['projects'][0]['documents'][0]['annotations'][0].keys()

dict_keys(['id', 'user', 'cui', 'value', 'start', 'end', 'validated', 'correct', 'deleted', 'alternative', 'killed', 'last_modified', 'manually_created', 'acc', 'meta_anns'])

In [11]:
#@title
# Keys available on the first meta-annotation attached to that annotation.
data['projects'][0]['documents'][0]['annotations'][0]['meta_anns'][0].keys()

dict_keys(['name', 'value', 'acc', 'validated'])

## Fine-tuning the NER+L model

First we load the existing MedCAT models that we will fine-tune.



In [13]:
# Load the existing CDB (Concept Database) from the file stored in DATA_DIR
cdb = CDB.load(cdb_path)

# Load the existing Vocabulary from the file stored in DATA_DIR
vocab = Vocab.load(vocab_path)

# Start from a fresh Config and select the spaCy model that was downloaded in
# the installation cell
config = Config()
config.general['spacy_model'] = 'en_core_web_md'

# Create CAT - the main class from medcat used for concept annotation.
# This is the object whose models are fine-tuned in the cells that follow.
cat = CAT(cdb=cdb, config=config, vocab=vocab)

The CDB was exported by an unknown version of MedCAT.


To fine-tune the existing models we use the `train_supervised()` method from MedCAT. The method has the following options:


---



`data_path` - Path to the JSON file exported from MedCATtrainer

`reset_cui_count` - Each cui has an internal counter that is used for weight decay, this will reset it.

`nepochs` - Number of epochs

`reset_cui_count` - Used for training with weight_decay (annealing). Each concept has a count that is there from the beginning of the CDB, and that count is used for annealing. Resetting the count will significantly increase the training impact. This will reset the count only for concepts that exist in the training data.

`print_stats` - Print statistics during training (F1/P/R)

`test_set` - Provide another file for testing

`use_filters` - Whether to apply the cui/tui filters from the MedCATtrainer project configuration during training.

`never_terminate` - If True no termination of concepts will be applied
 
`terminate_last` - If true, concept termination will be done after all training.

`use_groups` - If True concepts that have groups will be combined and stats will be reported on groups.

`use_overlaps` - Allow overlapping entities, nearly always False as it is very difficult to annotate overlapping entities.
 
`use_cui_doc_limit` - If True the metrics for a CUI will be only calculated if that CUI appears in a document.
 
`train_from_false_positives` - If True it will use false positive examples detected by medcat and train from them as negative examples.



In [15]:
# Fine-tune the NER+L model on the MedCATtrainer export for one epoch.
# The per-concept counts are left as they are (reset_cui_count=False), metrics
# are printed while training (print_stats=True) and the cui/tui filters from
# the MedCATtrainer project are applied (use_filters=True).
# The value returned by the call is shown as the cell output.
cat.train_supervised(data_path=DATA_DIR + "/MedCAT_Export.json", 
                     nepochs=1,
                     reset_cui_count=False,
                     print_stats=True, 
                     use_filters=True) 

Stats project:   0%|          | 0/1 [00:00<?, ?it/s]

Stats document:   0%|          | 0/27 [00:00<?, ?it/s]

Epoch: 0, Prec: 0.7948717948717948, Rec: 0.7828282828282829, F1: 0.7888040712468194

Docs with false positives: 1687; 1577; 716; 1694; 516; 611; 1734; 1383; 1881; 1605

Docs with false negatives: 1070; 1577; 716; 1694; 611; 1734; 1383; 1881; 1605; 898



False Positives

Diabetes                                                               - C0011847             -         18
Obesity                                                                - C0028754             -          5
Hypertensive disease                                                   - C0020538             -          5
nervous system disorder                                                - C0027765             -          4
Disease                                                                - C0012634             -          3
Alzheimer's Disease                                                    - C0002395             -          3
Anxiety Disorders                                                      - C0003469     

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Project:   0%|          | 0/1 [00:00<?, ?it/s]

Document:   0%|          | 0/27 [00:00<?, ?it/s]

Stats project:   0%|          | 0/1 [00:00<?, ?it/s]

Stats document:   0%|          | 0/27 [00:00<?, ?it/s]

Epoch: 1, Prec: 0.843520782396088, Rec: 0.8712121212121212, F1: 0.857142857142857

Docs with false positives: 1687; 1577; 716; 1694; 516; 611; 1383; 1881; 1605; 898

Docs with false negatives: 1070; 1881; 323; 1605; 641; 466; 688; 1557; 496; 716



False Positives

Diabetes                                                               - C0011847             -         15
Hypertensive disease                                                   - C0020538             -          5
Diabetes Mellitus                                                      - C0011849             -          4
Disease                                                                - C0012634             -          4
Cognition Disorders                                                    - C0009241             -          3
Anxiety Disorders                                                      - C0003469             -          3
Coronary Arteriosclerosis                                              - C0010054           

({'C0011847': 15,
  'C0020538': 5,
  'C0011849': 4,
  'C0012634': 4,
  'C0009241': 3,
  'C0003469': 3,
  'C0010054': 2,
  'C0036529': 2,
  'C0011860': 2,
  'C0037284': 2,
  'C0233794': 2,
  'C0276289': 2,
  'C0349782': 2,
  'C0029456': 2,
  'C0028754': 1,
  'C0038443': 1,
  'C0002792': 1,
  'C0009319': 1,
  'C0018939': 1,
  'C0339573': 1,
  'C1704436': 1,
  'C0038454': 1,
  'C0018801': 1,
  'C0023351': 1,
  'C0029408': 1,
  'C0030567': 1,
  'C0031099': 1,
  'C0519066': 1},
 {'C0011860': 9,
  'C0011849': 7,
  'C0854135': 3,
  'C0034065': 3,
  'C0010068': 2,
  'C0003864': 2,
  'C0018799': 2,
  'C0039082': 2,
  'C0037284': 2,
  'C0041582': 2,
  'C0011854': 2,
  'C0018939': 1,
  'C0238792': 1,
  'C0008679': 1,
  'C1301700': 1,
  'C0021167': 1,
  'C0009324': 1,
  'C0002871': 1,
  'C3844825': 1,
  'C0018889': 1,
  'C0033377': 1,
  'C0042769': 1,
  'C0263746': 1,
  'C0206172': 1,
  'C0021400': 1,
  'C0085762': 1},
 {'C0020538': 43,
  'C0037284': 19,
  'C0020473': 12,
  'C0010054': 10,
  'C049

In [16]:
# If we want to know the F1, P, R for each cui, we can call the stats method.
# The export is read inside a `with` block so the file handle is closed once
# the JSON has been parsed instead of being left open.
with open(DATA_DIR + "/MedCAT_Export.json") as export_file:
    data = json.load(export_file)
# Unpack the statistics: false positives, false negatives, true positives,
# the per-CUI precision / recall / F1 dictionaries, per-CUI counts and examples.
fps, fns, tps, cui_prec, cui_rec, cui_f1, cui_counts, examples = cat._print_stats(data, extra_cui_filter=True)

Stats project:   0%|          | 0/1 [00:00<?, ?it/s]

Stats document:   0%|          | 0/27 [00:00<?, ?it/s]

Epoch: 0, Prec: 0.07706173952230734, Rec: 0.8486352357320099, F1: 0.14129312125593887

Docs with false positives: 1070; 1687; 1577; 716; 1694; 516; 611; 1734; 1383; 1881

Docs with false negatives: 1070; 1577; 716; 1694; 611; 1383; 1881; 1605; 496; 466



False Positives

Patients                                                               - C0030705             -        200
Normal                                                                 - C0205307             -         67
Medical History                                                        - C0262926             -         57
Daily                                                                  - C0332173             -         46
year                                                                   - C0439234             -         41
Bilateral                                                              - C0238767             -         40
Right                                                                  - C0205090    

In [17]:
# The metric dictionaries are keyed by CUI: print F1, precision and recall
# (in that order) for one concept.
cui = "C0020538" # Hypertension
print(*(scores[cui] for scores in (cui_f1, cui_prec, cui_rec)))

0.9662921348314606 0.9347826086956522 1.0


In [18]:
# Inspect the false-negative ('fn') examples collected for one CUI of interest.
# NOTE(review): false positives are presumably available under a matching 'fp'
# key - confirm against the structure of `examples`.
examples['fn']['C0010068']

[{'text': ' appendectomy, oophorectomy.,FAMILY HISTORY: , Positive for coronary artery disease in her father and brother in their 40s.,SOCIAL HISTORY: , S',
  'cui': 'C0010068',
  'source value': 'coronary artery disease',
  'acc': 1,
  'project index': 0,
  'document inedex': 1},
 {'text': 'on fraction of 20%-25% in December 2005, COPD, mild diffuse coronary artery disease, and renal insufficiency.,ALLERGIES:, NO KNOWN DRUG ALLERGI',
  'cui': 'C0010068',
  'source value': 'coronary artery disease',
  'acc': 1,
  'project index': 0,
  'document inedex': 25}]

In [19]:
# Flatten the annotations of every document in the first project and display
# the first five of them.
[
    annotation
    for document in data['projects'][0]['documents']
    for annotation in document['annotations']
][:5]

[{'id': 45580,
  'user': 'wish',
  'cui': 'C0017168',
  'value': 'gastroesophageal reflux',
  'start': 332,
  'end': 355,
  'validated': True,
  'correct': True,
  'deleted': False,
  'alternative': False,
  'killed': False,
  'last_modified': '2020-04-01 22:06:34.303633+00:00',
  'manually_created': False,
  'acc': 1.0,
  'meta_anns': [{'name': 'Status',
    'value': 'Other',
    'acc': 1.0,
    'validated': True}]},
 {'id': 45581,
  'user': 'wish',
  'cui': 'C0020538',
  'value': 'hypertension',
  'start': 255,
  'end': 267,
  'validated': True,
  'correct': True,
  'deleted': False,
  'alternative': False,
  'killed': False,
  'last_modified': '2020-04-01 22:06:30.394941+00:00',
  'manually_created': False,
  'acc': 1.0,
  'meta_anns': [{'name': 'Status',
    'value': 'Confirmed',
    'acc': 1.0,
    'validated': True}]},
 {'id': 45582,
  'user': 'wish',
  'cui': 'C0012634',
  'value': 'disorder',
  'start': 356,
  'end': 364,
  'validated': True,
  'correct': False,
  'deleted': Fa

## MetaAnnotations

During the annotation process we have created only one meta-annotation called "Status". Here we are going to train a BiLSTM to detect that meta-annotation.

### MetaCAT

The class we are going to use to train meta-annotations is called MetaCAT. As input it takes:

`tokenizer` - A BBPE tokenizer from [huggingface](https://github.com/huggingface/tokenizers)

`embeddings` - The precalculated embeddings for the tokens produced by the tokenizer. Can be anything from Word2Vec to BERT. This is a numpy matrix, or python list of embeddings. 

`cntx_left` - Size of context from the left side of the entity that will be taken into account.

`cntx_right` - Size of context from the right side of the entity that will be taken into account.

`save_dir` - Where do we want to save the trained models.

`pad_id` - Padding index in the embeddings matrix. 

`device` - On which device to run this `cpu` or `cuda`

In [20]:
#!unzip $DATA_DIR/mc_status.zip

'unzip' is not recognized as an internal or external command,
operable program or batch file.


In [22]:
# List the zip archives that were downloaded into DATA_DIR. The pattern is
# built from DATA_DIR instead of repeating the literal path, so this cell keeps
# working when the data directory is changed in the configuration cell.
files = glob.glob(DATA_DIR + '/*.zip')
files

['./data/data_p4.2\\mc_status.zip']

In [24]:
# Pure-Python replacement for `unzip`: extract every archive found above
# into the data/raw folder.
for archive_path in files:
    print('Unzipping:', archive_path)

    with zipfile.ZipFile(archive_path, mode='r') as archive:
        archive.extractall('data/raw')

Unzipping: ./data/data_p4.2\mc_status.zip


In [29]:
# Load the existing "Status" MetaCAT model from the folder under data/raw
# (note that we have already downloaded the required models).
# NOTE(review): the folder is presumably the content extracted from
# mc_status.zip by the previous cell - confirm.
mc = MetaCAT.load('./data/raw/Status/')

### MetaCAT configuration

For a full list of all the configurable parameters, follow this [link](https://github.com/CogStack/MedCAT/blob/master/medcat/config_meta_cat.py).

Some notable parameters:

`category_name` - What is the name of this meta-annotation (same as the name in the MedCATtrainer)

`model_name` - for now only `lstm`

`lr` - Learning rate

`test_size` - Proportion of the test set

`batch_size` - Batch size

`nepochs` - Number of epochs to run for

`lowercase` - Do you want to lowercase the text

`class_weights` - Pytorch LSTM parameter for unbalanced classes

`ignore_cpos` - The position of the entity will be ignored, do not use this.

`auto_save_model` - This will autosave the top performing epoch during the training process

In [30]:
# Example of how to change parameters
# Model section of the MetaCAT config.
# NOTE(review): input_size presumably has to match the dimension of the
# embeddings used by the loaded model - confirm before changing it.
mc.config.model['input_size'] = 768
mc.config.model['hidden_size'] = 300

# Training section: number of epochs to run for, and auto-saving of the top
# performing epoch during training.
mc.config.train['nepochs'] = 55
mc.config.train['auto_save_model'] = True

### Train MetaCAT
To run the training we use the `train` method that allows us to specify:

`json_path`: Path to a MedCATtrainer export containing the meta_annotations we want to train for.


`save_dir_path`: (optional, defaults to `None`): In case we have auto_save_model (meaning during the training the best model will be saved) we need to set a save path.


In [31]:
# Train the meta-annotation model on the MedCATtrainer export; because
# auto_save_model is enabled, the best epoch is written to the 'status' folder.
# The value returned by the call is shown as the cell output.
mc.train(json_path= DATA_DIR+"/MedCAT_Export.json", save_dir_path='status')
# Saving the model this way will only save the model epoch with the best performance

The number of classes set in the config is not the same as the one found in the data: 2 vs 3
Auto-setting the nclasses value in config and rebuilding the model.


Epoch: 0 **************************************************  Train
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.47      0.51      0.49       113
           2       0.78      0.70      0.73       253

    accuracy                           0.64       366
   macro avg       0.42      0.40      0.41       366
weighted avg       0.68      0.64      0.66       366

Epoch: 0 **************************************************  Test
              precision    recall  f1-score   support

           1       1.00      0.27      0.42        15
           2       0.69      1.00      0.82        25

    accuracy                           0.73        40
   macro avg       0.85      0.63      0.62        40
weighted avg       0.81      0.72      0.67        40



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



##### Model saved to status\model.dat at epoch: 0 and weighted avg/f1-score: 0.6701898188093184 #####

Epoch: 1 **************************************************  Train
              precision    recall  f1-score   support

           1       1.00      0.15      0.26       113
           2       0.72      1.00      0.84       253

    accuracy                           0.74       366
   macro avg       0.86      0.58      0.55       366
weighted avg       0.81      0.74      0.66       366

Epoch: 1 **************************************************  Test
              precision    recall  f1-score   support

           1       1.00      0.20      0.33        15
           2       0.68      1.00      0.81        25

    accuracy                           0.70        40
   macro avg       0.84      0.60      0.57        40
weighted avg       0.80      0.70      0.63        40

Epoch: 2 **************************************************  Train
              precision    recall  f1-scor

{'report': {'1': {'precision': 0.8125,
   'recall': 0.8666666666666667,
   'f1-score': 0.8387096774193549,
   'support': 15},
  '2': {'precision': 0.9166666666666666,
   'recall': 0.88,
   'f1-score': 0.8979591836734694,
   'support': 25},
  'accuracy': 0.875,
  'macro avg': {'precision': 0.8645833333333333,
   'recall': 0.8733333333333333,
   'f1-score': 0.8683344305464121,
   'support': 40},
  'weighted avg': {'precision': 0.8776041666666666,
   'recall': 0.875,
   'f1-score': 0.8757406188281764,
   'support': 40}},
 'epoch': 5}

We can now save the models using the `save` function. It has only one argument, `full_save`: if `True` it will also save the embeddings and tokenizers (note that this is slightly redundant, as no training was done on the embeddings/tokenizers).

In [32]:
# Alternative way to save: explicitly write the current MetaCAT model to a
# folder of our choosing (here relative to the working directory).
mc.save("alternative_status_metamodel")

## Test of the whole pipeline

In [33]:
# Restrict linking to the concepts that belong to the selected type ids (TUIs).
tui_filter = ['T047'] # Detect only diseases
# Merge the CUI collections of every selected TUI into a single set.
cui_filters = set().union(
    *(cdb.addl_info['type_id2cuis'][tui] for tui in tui_filter)
)
cdb.config.linking['filters']['cuis'] = cui_filters

# Rebuild the pipeline, this time with the meta-annotation model attached.
cat = CAT(cdb=cdb, config=config, vocab=vocab, meta_cats=[mc])

In [34]:
# Sanity check: is this particular CUI part of the filter set built above?
"C0035078" in cui_filters

True

In [35]:
# Sample sentence containing the spelling "hypertention" as well as a negated
# mention of "hypertension".
text = "John Doe has epilepsy and hypertention but does not suffer from hypertension"
# Calling the CAT object on the text runs the whole pipeline; the detected
# entities are then available as `doc.ents`.
doc = cat(text)

In [36]:
# Show every detected entity together with the meta-annotations attached to it.
for entity in doc.ents:
    print(f"Entity: {entity.text}")
    print(f"Meta Annotations: {entity._.meta_anns}")
    print("\n")

Entity: epilepsy
Meta Annotations: {'Status': {'value': 'Confirmed', 'confidence': 0.7397191524505615, 'name': 'Status'}}


Entity: hypertention
Meta Annotations: {'Status': {'value': 'Confirmed', 'confidence': 0.8003191947937012, 'name': 'Status'}}


Entity: hypertension
Meta Annotations: {'Status': {'value': 'Other', 'confidence': 0.6010704040527344, 'name': 'Status'}}




Notice how the MedCAT meta-annotation model labels the entities "epilepsy" and "hypertention" (the misspelled mention) as __"Confirmed"__.

Whilst the negated mention "hypertension" is labelled as __"Other"__.

This is extremely useful when conducting a context-based extract of concepts from text.

End of Tutorial