In [1]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up without restarting the kernel (mode 2 = reload all modules).
%load_ext autoreload
%autoreload 2

In [2]:
from notebook_utils import *

# PATHS (don't forget to change them if need be)

In [3]:
# --- Inputs -----------------------------------------------------------------

# Folder where the corpus .txt files are stored (read with load_from_brat).
# NOTE(review): machine-specific absolute path -- change it before running
# this notebook on another machine.
data_path = "/home/yoann/thèse/data/corpus_dalloux/CAS_neg_brat"

# Model checkpoint restored with torch.load.
model_path = "checkpoint-145.pt"

# YAML file holding the hyperparameters of the checkpointed model.
hp_path = "hp.yaml"

# Pickle file holding the vocabularies passed to the preprocessing step.
vocs_path = "vocs.pkl"

# --- Output -----------------------------------------------------------------

# Folder that receives the exported predictions.
prediction_path = "preds_test_neg/"

# Import data and preprocess

In [4]:
import pickle as pkl

# Read the vocabularies from the pickle file configured in `vocs_path`.
# NOTE(security): unpickling can execute arbitrary code -- only load files
# that come from a trusted source.
with open(vocs_path, "rb") as vocs_file:
    vocs = pkl.load(vocs_file)

# Name of the pretrained model; it is also reused when the network is built.
bert_name = "camembert-base"

# Load the corpus found under `data_path`.
dataset = load_from_brat(data_path)

# Preprocess the corpus with the loaded vocabularies. The label list is taken
# from those same vocabularies; the fifth returned value is not needed here.
docs, sentences, tokens, deltas, _ = preprocess(
    dataset=dataset,
    bert_name=bert_name,
    max_sentence_length=120,
    vocabularies=vocs,
    ner_labels=list(vocs["ner_label"]),
    unknown_labels="drop",
)

# Bundle the preprocessed tables; `prep` is handed to the post-processing step.
prep = Dataset(sentences=sentences, tokens=tokens, deltas=deltas)

# Build the batcher consumed by the mention-extraction step.
batcher, encoded, ids = make_batcher(docs, sentences, tokens)

Dataset: Dataset(
  (docs):       3790 * ('doc_id', 'text', 'split')
  (mentions):   1023 * ('doc_id', 'mention_id', 'label', 'text')
  (fragments):  1023 * ('doc_id', 'mention_id', 'fragment_id', 'begin', 'end')
  (attributes):    0 * ('doc_id', 'mention_id', 'attribute_id', 'label', 'value')
  (relations):     0 * ('doc_id', 'relation_id', 'relation_label', 'from_mention_id', 'to_mention_id')
  (comments):      0 * ('doc_id', 'comment_id', 'mention_id', 'comment')
)
Transform texts... done
Splitting into sentences... Tokenizing... 



done
Computing vocabularies...
done


# Load the model

### Load the hyperparameters

In [5]:
import yaml

# Read the hyperparameters from the YAML file configured in `hp_path`.
# NOTE(security): yaml.FullLoader resolves more than plain scalars, lists and
# mappings; only load trusted files, and consider yaml.safe_load if the file
# contains nothing but plain values.
with open(hp_path, 'r') as hp_file:
    hyperparameters = yaml.load(hp_file, Loader=yaml.FullLoader)

# Values used by the following cells (seeding and network construction).
seed = hyperparameters["seed"]
scheme = hyperparameters["scheme"]
hidden_dim = hyperparameters["hidden_dim"]
token_dim = hyperparameters["token_dim"]
tag_dim = hyperparameters["tag_dim"]
top_dropout = hyperparameters["top_dropout"]
custom_embeds_layer_index = hyperparameters["custom_embeds_layer_index"]

# Remaining values: not referenced by the cells visible in this notebook, but
# still exposed as globals (the full dict is also passed on to the
# mention-extraction step).
lr = hyperparameters["lr"]
bert_lr = hyperparameters["bert_lr"]
tags_lr = hyperparameters["tags_lr"]
max_grad_norm = hyperparameters["max_grad_norm"]
bert_weight_decay = hyperparameters["bert_weight_decay"]
bert_dropout = hyperparameters["bert_dropout"]
random_perm = hyperparameters["random_perm"]
observed_zone_sizes = hyperparameters["observed_zone_sizes"]
n_per_zone = hyperparameters["n_per_zone"]
n_freeze = hyperparameters["n_freeze"]


### Load specified model checkpoint

In [6]:
import os
import traceback
from tqdm import tqdm

from custom_bert import CustomBertModel
from transformers import AdamW, BertModel

from tqdm import tqdm
from scipy.sparse import csr_matrix
from logic_crf import CRF, ConstraintFactor, HintFactor, Indexer

from nlstruct.environment import get_cache, load
from nlstruct.utils import evaluating, torch_global as tg, freeze
from nlstruct.scoring import compute_metrics, merge_pred_and_gold
from nlstruct.train import make_optimizer_and_schedules, run_optimization, seed_all
from nlstruct.train.schedule import ScaleOnPlateauSchedule, LinearSchedule, ConstantSchedule
    
# Everything in this notebook runs on CPU; the device is also registered in
# nlstruct's global torch settings (`tg`).
device = torch.device('cpu')
tg.set_device(device)

# To release gpu memory before allocating new parameters for a new model
# A better idea would be to run xp in a function, so that all variables are released when exiting the fn
# but this way we can debug after this cell if something goes wrong
if "all_nets" in globals(): del all_nets
if "state" in globals(): del state

seed_all(seed)  # /!\ Super important to enable reproducibility

# Rebuild the network with the hyperparameters that were loaded from the YAML file.
ner_net = NERNet(
    n_tokens=len(vocs["token"]),
    token_dim=token_dim,
    n_labels=len(vocs["ner_label"]),
    embeddings=CustomBertModel.from_pretrained(bert_name, custom_embeds_layer_index=custom_embeds_layer_index),
    dropout=top_dropout,
    hidden_dim=hidden_dim,
    tag_scheme=scheme,
    metric='linear',  # cosine might be better but looks less stable, oddly
)
all_nets = torch.nn.ModuleDict({
    "ner_net": ner_net,
    "tag_embeddings": torch.nn.Embedding(ner_net.crf.num_tags - 1, tag_dim),
}).to(device=tg.device)
del ner_net

# Objects that the checkpoint is allowed to restore, keyed by checkpoint entry name.
state = {"all_nets": all_nets}

# Set to True when the checkpoint could not be restored (see the end of the cell).
load_failed = False
try:
    print(f"Loading from {model_path} ...")
    # NOTE(security): torch.load relies on pickle -- only load trusted checkpoints.
    dumped = torch.load(model_path, map_location=device)
    if dumped is None:
        # Nothing to restore: do not report a successful load.
        raise ValueError(f"Checkpoint {model_path!r} does not contain any state")
    for name, value in dumped.items():
        persistable = state.get(name, None)
        if name in state and hasattr(persistable, 'load_state_dict'):
            # Known stateful object: restore its parameters in place.
            persistable.load_state_dict(value)
        else:
            # Anything else is stored as-is in the state dict.
            state[name] = value
    print("Model loaded")

except Exception:

    # We catch any exception otherwise some variables (including torch parameters on the gpu) end up being stored globally in sys.last_value, leading to memory errors)
    traceback.print_exc()
    load_failed = True

if load_failed:
    # Raised outside of the `except` block on purpose: the new exception is not
    # chained to the original one, so it does not keep the failed frames alive,
    # yet it stops the notebook so that the following cells cannot run (and
    # export) predictions from a model whose trained weights were never loaded.
    raise RuntimeError(
        f"Could not load the checkpoint {model_path!r} (see the traceback printed above); "
        "refusing to continue without the trained weights."
    )

before layer norm


Some weights of the model checkpoint at camembert-base were not used when initializing CustomBertModel: ['roberta.embeddings.word_embeddings.weight', 'roberta.embeddings.position_embeddings.weight', 'roberta.embeddings.token_type_embeddings.weight', 'roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.self.query.weight', 'roberta.encoder.layer.0.attention.self.query.bias', 'roberta.encoder.layer.0.attention.self.key.weight', 'roberta.encoder.layer.0.attention.self.key.bias', 'roberta.encoder.layer.0.attention.self.value.weight', 'roberta.encoder.layer.0.attention.self.value.bias', 'roberta.encoder.layer.0.attention.output.dense.weight', 'roberta.encoder.layer.0.attention.output.dense.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.intermediate.dense.weight', 'roberta.encoder.layer.0.intermediate.dense.bias', 'roberta.encoder.l

Loading from checkpoint-145.pt ...
Model loaded


# Extract the inferred mentions

In [8]:
# Run the loaded networks over the batcher to extract the predicted mentions.
# Lower `batch_size` if the model does not fit in memory (restart the notebook
# kernel first so that the memory is actually freed).
pred_batcher = extract_mentions(
    batcher,
    all_nets=all_nets,
    hyperparameters=hyperparameters,
    batch_size=128,
)

# Post-process the raw predictions; the result is what gets exported below.
# NOTE(review): presumably maps predictions back onto the original documents
# using `prep`, `ids` and `vocs` -- confirm against notebook_utils.
post_mentions = postprocess_batcher(pred_batcher, dataset, prep, ids, vocs)

100%|██████████| 30/30 [02:21<00:00,  4.72s/it]


# Export to brat files

In [10]:
# Export the post-processed predictions as .ann files into `prediction_path`.
# !!! WARNING: THIS OVERWRITES ANY PREVIOUS PREDICTIONS STORED IN THAT PATH !!!
# No .ann file is created for a document in which no mention was detected.
preds_to_ann(post_mentions, dataset, vocs, prediction_path)