In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from notebook_utils import *

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# PATHS (don't forget to change them if need be)

In [3]:
# --- Inputs -----------------------------------------------------------------
# Folder where the .txt files are stored (read by `load_from_brat`).
# NOTE(review): absolute, machine-specific path — adapt it to your setup.
data_path = "/home/yoann/thèse/data/corpus_xavier"

# Pickled vocabularies (opened with `pickle.load`).
vocs_path = "vocs.pkl"

# YAML file holding the model hyperparameters.
hp_path = "hp.yaml"

# Checkpoint restored with `torch.load`.
model_path = "checkpoint-145.pt"

# --- Output -----------------------------------------------------------------
# Destination handed to `preds_to_ann` when exporting the predictions.
prediction_path = "preds_test_neg/"

# Import data and preprocess

In [4]:
import pickle as pkl

# Restore the pickled vocabularies.
# NOTE(review): unpickling can execute arbitrary code — only load files you trust.
with open(vocs_path, "rb") as vocs_file:
    vocs = pkl.load(vocs_file)

# Name of the pretrained transformer; reused later when the model is built.
bert_name = "camembert-base"

# Read the annotated corpus from `data_path`.
dataset = load_from_brat(data_path)

# Preprocess the corpus against the restored vocabularies. The fifth returned
# value is not needed here and is bound to `_`.
docs, sentences, tokens, deltas, _ = preprocess(
    dataset=dataset,
    max_sentence_length=120,
    bert_name=bert_name,
    ner_labels=list(vocs["ner_label"]),
    unknown_labels="drop",
    vocabularies=vocs,
)

# Bundle the preprocessed tables; `prep` is consumed by `postprocess_batcher`.
prep = Dataset(sentences=sentences, tokens=tokens, deltas=deltas)

# Build the batcher used for inference, together with its encoded data and ids.
batcher, encoded, ids = make_batcher(docs, sentences, tokens)

Dataset: Dataset(
  (docs): 1266 * ('doc_id', 'text', 'split')
)
Transform texts... done
Splitting into sentences... 

  for i, part in enumerate(reg_split.split(txt)):


Tokenizing... done
Computing vocabularies...
Normalized split, with given vocabulary and no unk
Normalized split, with given vocabulary and no unk
Normalized split, with given vocabulary and no unk
Normalized token, with given vocabulary and no unk
Normalized text, with given vocabulary and no unk
Normalized text, with given vocabulary and no unk
done


# Load the model

### Load the hyperparameters

In [5]:
import yaml

# Read the hyperparameter file into a dictionary.
# NOTE(review): `hp.yaml` is treated as trusted input; `yaml.safe_load` would be
# the safer choice if the file only holds plain scalars/lists/maps — confirm
# before switching.
with open(hp_path, "r") as hp_file:
    hyperparameters = yaml.load(hp_file, Loader=yaml.FullLoader)

# Expose every entry as a notebook-level variable. The targets and the keys
# below are listed in the same order; a missing key raises KeyError.
(
    scheme,
    hidden_dim,
    seed,
    lr,
    bert_lr,
    tag_dim,
    token_dim,
    max_grad_norm,
    tags_lr,
    bert_weight_decay,
    random_perm,
    observed_zone_sizes,
    n_per_zone,
    n_freeze,
    custom_embeds_layer_index,
    bert_dropout,
    top_dropout,
) = (
    hyperparameters[key]
    for key in (
        "scheme",
        "hidden_dim",
        "seed",
        "lr",
        "bert_lr",
        "tag_dim",
        "token_dim",
        "max_grad_norm",
        "tags_lr",
        "bert_weight_decay",
        "random_perm",
        "observed_zone_sizes",
        "n_per_zone",
        "n_freeze",
        "custom_embeds_layer_index",
        "bert_dropout",
        "top_dropout",
    )
)


### Load specified model checkpoint

In [6]:
import os
import traceback
from tqdm import tqdm

from custom_bert import CustomBertModel
from transformers import AdamW, BertModel

from tqdm import tqdm
from scipy.sparse import csr_matrix
from logic_crf import CRF, ConstraintFactor, HintFactor, Indexer

from nlstruct.environment import get_cache, load
from nlstruct.utils import evaluating, torch_global as tg, freeze
from nlstruct.scoring import compute_metrics, merge_pred_and_gold
from nlstruct.train import make_optimizer_and_schedules, run_optimization, seed_all
from nlstruct.train.schedule import ScaleOnPlateauSchedule, LinearSchedule, ConstantSchedule
    
# Run on the CPU and hand the device to nlstruct's `torch_global` helper.
device = torch.device("cpu")
tg.set_device(device)

# Forget the networks and state left over from a previous run of this cell so
# that their memory can be released before new parameters are allocated.
# Running the experiment inside a function would release every variable on
# exit, but keeping them global makes it possible to debug after this cell.
globals().pop("all_nets", None)
globals().pop("state", None)

# /!\ Super important to enable reproducibility
seed_all(seed)

# Build the pretrained encoder first so the tagger constructor stays readable.
bert_embeddings = CustomBertModel.from_pretrained(
    bert_name,
    custom_embeds_layer_index=custom_embeds_layer_index,
)

# Tagger sized from the vocabularies and the loaded hyperparameters.
ner_net = NERNet(
    n_tokens=len(vocs["token"]),
    token_dim=token_dim,
    n_labels=len(vocs["ner_label"]),
    embeddings=bert_embeddings,
    dropout=top_dropout,
    hidden_dim=hidden_dim,
    tag_scheme=scheme,
    metric="linear",  # cosine might be better but looks less stable, oddly
)

# Group every trainable module; the tag embedding table has one row fewer
# than the CRF has tags.
all_nets = torch.nn.ModuleDict(
    {
        "ner_net": ner_net,
        "tag_embeddings": torch.nn.Embedding(ner_net.crf.num_tags - 1, tag_dim),
    }
)
all_nets = all_nets.to(device=tg.device)

# Only `all_nets` should keep the modules alive, so drop the temporary names.
del ner_net, bert_embeddings

# Objects that the checkpoint is restored into.
state = {"all_nets": all_nets}

# Restore the checkpoint into `state`: an entry whose current value exposes
# `load_state_dict` is updated in place, any other entry is stored as-is.
# NOTE(review): `torch.load` unpickles the file — only load trusted checkpoints.
# NOTE(review): "Model loaded" is printed even when `dumped` is None, and a
# failure only prints its traceback, so the following cells would then run
# with the freshly initialised weights — check this cell's output first.
try:
    print(f"Loading from {model_path} ...")
    dumped = torch.load(model_path, map_location=device)
    if dumped is not None:
        for name, saved_value in dumped.items():
            # Names missing from `state` come back as None, which has no
            # `load_state_dict`, so they fall through to the plain copy.
            target = state.get(name)
            if hasattr(target, "load_state_dict"):
                target.load_state_dict(saved_value)
            else:
                state[name] = saved_value
    print("Model loaded")
except Exception:
    # We catch any exception otherwise some variables (including torch
    # parameters on the gpu) end up being stored globally in sys.last_value,
    # leading to memory errors.
    traceback.print_exc()

Available CUDA devices 0
Current device cpu
before layer norm


Some weights of the model checkpoint at camembert-base were not used when initializing CustomBertModel: ['roberta.embeddings.word_embeddings.weight', 'roberta.embeddings.position_embeddings.weight', 'roberta.embeddings.token_type_embeddings.weight', 'roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.self.query.weight', 'roberta.encoder.layer.0.attention.self.query.bias', 'roberta.encoder.layer.0.attention.self.key.weight', 'roberta.encoder.layer.0.attention.self.key.bias', 'roberta.encoder.layer.0.attention.self.value.weight', 'roberta.encoder.layer.0.attention.self.value.bias', 'roberta.encoder.layer.0.attention.output.dense.weight', 'roberta.encoder.layer.0.attention.output.dense.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.intermediate.dense.weight', 'roberta.encoder.layer.0.intermediate.dense.bias', 'roberta.encoder.l

Loading from checkpoint-145.pt ...
Model loaded


# Extract the inferred mentions

In [7]:
# Disabled: keeps only the documents whose text is shorter than 120 characters.
# NOTE(review): `dataset` has already been preprocessed and batched in an
# earlier cell, so enabling this here would only affect the later uses of
# `dataset` (post-processing and export) — presumably it was meant to run
# before preprocessing; confirm before re-enabling.
# reduced_docs = dataset['docs']
# reduced_docs = reduced_docs[reduced_docs.apply(lambda x: len(x['text']) < 120, axis=1)]
# dataset["docs"] = reduced_docs

In [8]:
import tqdm

# Close every progress bar that is still registered with tqdm, presumably so
# that bars left open by an interrupted run do not disturb the next ones.
# NOTE: `import tqdm` rebinds the global name `tqdm` to the module; an earlier
# cell bound it to the class with `from tqdm import tqdm`.
# `_instances` is a private tqdm attribute that some versions only create once
# a bar exists, hence the empty fallback. The registry is copied into a list
# first because closing a bar removes it from the registry.
for open_bar in list(getattr(tqdm.tqdm, "_instances", ())):
    open_bar.close()

[]

In [14]:
# Run the model over the batcher. If the model doesn't fit in memory, lower
# `batch_size` (restart the notebook kernel first to clean memory).
pred_batcher = extract_mentions(
    batcher,
    all_nets=all_nets,
    hyperparameters=hyperparameters,
    batch_size=2048,
)

# Post-process the predictions using the dataset, the preprocessed tables,
# the ids and the vocabularies.
post_mentions = postprocess_batcher(pred_batcher, dataset, prep, ids, vocs)




  0%|          | 0/16 [00:00<?, ?it/s][A[A[A


  6%|▋         | 1/16 [00:21<05:22, 21.52s/it][A[A[A


 12%|█▎        | 2/16 [00:51<05:37, 24.12s/it][A[A[A


 19%|█▉        | 3/16 [01:29<06:07, 28.29s/it][A[A[A


 25%|██▌       | 4/16 [02:19<06:56, 34.75s/it][A[A[A


 31%|███▏      | 5/16 [03:15<07:32, 41.13s/it][A[A[A


 38%|███▊      | 6/16 [04:26<08:22, 50.22s/it][A[A[A


 44%|████▍     | 7/16 [05:54<09:12, 61.39s/it][A[A[A


 50%|█████     | 8/16 [07:49<10:19, 77.38s/it][A[A[A


 56%|█████▋    | 9/16 [10:23<11:43, 100.53s/it][A[A[A


 62%|██████▎   | 10/16 [14:12<13:54, 139.04s/it][A[A[A


 69%|██████▉   | 11/16 [19:52<16:36, 199.20s/it][A[A[A


 75%|███████▌  | 12/16 [26:11<16:53, 253.28s/it][A[A[A


 81%|████████▏ | 13/16 [32:46<14:47, 295.81s/it][A[A[A


 88%|████████▊ | 14/16 [40:14<11:22, 341.38s/it][A[A[A


 94%|█████████▍| 15/16 [49:24<06:44, 404.03s/it][A[A[A


100%|██████████| 16/16 [58:08<00:00, 218.02s/it][A[A[A



  0

In [15]:
# Display the post-processed mentions returned by `postprocess_batcher`.
# NOTE(review): the rendered output includes excerpts of the source documents
# (`text` column) — clear the outputs before sharing if the corpus is sensitive.
post_mentions

Unnamed: 0,sentence_id,begin,end,ner_label,mention_id,doc_id,begin_sentence,token_idx,begin_char,token_idx_char,end_char,text
0,00948-W-66-3194391074935790803-248659563936831...,1690,1703,0,0,00948-W-66-3194391074935790803-248659563936831...,1821,1,0,2,14,sme. Non doul
1,00948-W-66-3194391074935790803-248659563936831...,1706,1722,0,60,00948-W-66-3194391074935790803-248659563936831...,1838,1,0,3,16,eux. Absence de
2,00948-W-66-3194391074935790803-248659563936831...,1294,1319,0,2139,00948-W-66-3194391074935790803-248659563936831...,1369,6,24,12,54,t sans\nvomissements ni di
3,00948-W-66-3194391074935790803-248659563936831...,1366,1429,0,5122,00948-W-66-3194391074935790803-248659563936831...,1473,1,0,27,69,ium. N’a pas supporté et n’a pas pris le ZYPRE...
4,00948-W-66-3194391074935790803-248659563936831...,1446,1517,0,5130,00948-W-66-3194391074935790803-248659563936831...,1559,1,0,20,77,ar. N’a pas pris son DiffuK.\nAlimentation par...
...,...,...,...,...,...,...,...,...,...,...,...,...
18856,01174-M-89-6018119463327297063-810926137653010...,1523,1756,0,18453,01174-M-89-6018119463327297063-810926137653010...,1513,45,132,118,373,il ne répond pas aux ordres simple\nPupilles r...
18857,01174-M-89-6018119463327297063-810926137653010...,1783,1897,0,18454,01174-M-89-6018119463327297063-810926137653010...,1513,131,403,159,519,"HA+, pas de masses palpable\nBruits du coeur r..."
18858,01150-W-84--226911327820780510--72287399478836...,1316,1326,0,18614,01150-W-84--226911327820780510--72287399478836...,1258,61,209,63,219,plaie nécr
18859,01150-W-84--226911327820780510--72287399478836...,1423,1499,0,18615,01150-W-84--226911327820780510--72287399478836...,1258,102,319,128,396,", pas de DSM objectivé\nbdc réguliers, pas de ..."


# Export to brat files

In [11]:
# Write the predicted mentions to `prediction_path`.
# !!! OVERWRITES PREVIOUS PREDICTIONS IN THE PATH !!!
preds_to_ann(
    post_mentions,
    dataset,
    vocs,
    prediction_path,
)