<a href="https://colab.research.google.com/github/abysee/OpenUE/blob/main/OpenUE_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
! nvidia-smi
! pip install openue
! git clone  https://github.com/zjunlp/OpenUE.git
! pip install pytorch_lightning==1.3.1

Tue Jan  4 05:51:56 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   73C    P8    32W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
import argparse
import importlib

import numpy as np
import torch
import pytorch_lightning as pl
import openue.lit_models as lit_models
import yaml
import time
from transformers import AutoConfig
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:

# Helpers for setting up arguments and dynamically importing classes
def _import_class(module_and_class_name: str) -> type:
    module_name, class_name = module_and_class_name.rsplit(".", 1)
    module = importlib.import_module(module_name)
    class_ = getattr(module, class_name)
	
    return class_


def _setup_parser():
    """Build the ArgumentParser holding basic, data, model and LitModel arguments.

    The parser is created with ``add_help=False`` and the ``--help``/``-h`` action is
    registered last. The data and model classes named by ``--data_class`` and
    ``--model_class`` are looked up first (via a preliminary ``parse_known_args``)
    so that each can contribute its own options through ``add_to_argparse``.

    Returns:
        argparse.ArgumentParser: The fully populated parser.
    """
    parser = argparse.ArgumentParser(add_help=False)

    # pl.Trainer's own arguments (--max_epochs, --gpus, --precision, ...) are not
    # registered on this parser.

    # Basic arguments.
    parser.add_argument("--wandb", action="store_true", default=False)
    typed_options = (
        ("--seed", int, 42),
        ("--litmodel_class", str, "SEQLitModel"),
        ("--data_class", str, "REDataset"),
        ("--model_class", str, "BertForRelationClassification"),
        ("--load_checkpoint", str, None),
    )
    for flag, value_type, fallback in typed_options:
        parser.add_argument(flag, type=value_type, default=fallback)

    # Peek at the class names chosen so far; unknown options are tolerated here
    # because the class-specific arguments have not been registered yet.
    known_args = parser.parse_known_args()[0]
    data_class = _import_class(f"openue.data.{known_args.data_class}")
    model_class = _import_class(f"openue.models.{known_args.model_class}")

    # Let each component register its options in its own titled argument group.
    contributors = (
        ("Data Args", data_class),
        ("Model Args", model_class),
        ("LitModel Args", lit_models.BaseLitModel),
    )
    for group_title, owner in contributors:
        owner.add_to_argparse(parser.add_argument_group(group_title))

    parser.add_argument("--help", "-h", action="help")
    return parser

def _save_model(litmodel, tokenizer, path):
    os.system(f"mkdir -p {path}")
    litmodel.model.save_pretrained(path)
    tokenizer.save_pretrained(path)

In [6]:
parser = _setup_parser()
# Parse an explicit empty argument list so the namespace starts from the parser defaults.
args = parser.parse_args(args=[])

path = "OpenUE/config/run_seq.yaml"
# Load the hyperparameter settings from the YAML config. The context manager closes the
# file handle, which a bare `open(path)` passed straight to yaml.load never does.
with open(path, encoding="utf-8") as config_file:
    # NOTE(review): FullLoader can build more than plain data types; acceptable for the
    # repository's own config, but prefer yaml.safe_load for files you do not control.
    # An empty YAML document loads as None, so fall back to an empty mapping.
    opt = yaml.load(config_file, Loader=yaml.FullLoader) or {}
# Overlay the config values on top of the parser defaults.
vars(args).update(opt)



# Seed NumPy and PyTorch with the configured seed.
np.random.seed(args.seed)
torch.manual_seed(args.seed)
# Resolve the classes named in the (config-updated) arguments.
data_class = _import_class(f"openue.data.{args.data_class}")
model_class = _import_class(f"openue.models.{args.model_class}")
litmodel_class = _import_class(f"openue.lit_models.{args.litmodel_class}")

data = data_class(args)
lit_model = litmodel_class(args=args, data_config=data.get_config())
# NOTE(review): "/logs" is an absolute path at the filesystem root — confirm this is
# intended rather than a relative "logs" directory.
logger = pl.loggers.TensorBoardLogger("/logs")
if args.wandb:
    # Weights & Biases replaces the TensorBoard logger when --wandb is set.
    logger = pl.loggers.WandbLogger(project="openue demo")
    logger.log_hyperparams(vars(args))


# Stop once "Eval/f1" has not improved for 5 checks, and checkpoint on the same metric.
early_callback = pl.callbacks.EarlyStopping(monitor="Eval/f1", mode="max", patience=5)
model_checkpoint = pl.callbacks.ModelCheckpoint(monitor="Eval/f1", mode="max",
    filename='{epoch}-{Eval/f1:.2f}',
    dirpath="output",
    save_weights_only=True
)
callbacks = [early_callback, model_checkpoint]
trainer = pl.Trainer.from_argparse_args(args, callbacks=callbacks, logger=logger, 
                                        default_root_dir="training/logs")
trainer.fit(lit_model, datamodule=data)
trainer.test(lit_model, datamodule=data)


# _save_model(litmodel=lit_model, tokenizer=data.tokenizer, path="seq_model")

01/04/2022 05:55:42 - INFO - openue.data.data_module -   add total special tokens: 50 
 ['[relation0]', '[relation1]', '[relation2]', '[relation3]', '[relation4]', '[relation5]', '[relation6]', '[relation7]', '[relation8]', '[relation9]', '[relation10]', '[relation11]', '[relation12]', '[relation13]', '[relation14]', '[relation15]', '[relation16]', '[relation17]', '[relation18]', '[relation19]', '[relation20]', '[relation21]', '[relation22]', '[relation23]', '[relation24]', '[relation25]', '[relation26]', '[relation27]', '[relation28]', '[relation29]', '[relation30]', '[relation31]', '[relation32]', '[relation33]', '[relation34]', '[relation35]', '[relation36]', '[relation37]', '[relation38]', '[relation39]', '[relation40]', '[relation41]', '[relation42]', '[relation43]', '[relation44]', '[relation45]', '[relation46]', '[relation47]', '[relation48]', '[relation49]']
Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForRelationClassification: 

Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

01/04/2022 07:54:12 - INFO - openue.data.utils -   Loading features from cached file ./dataset/ske/cached_train_BertTokenizerFast_seq
01/04/2022 07:54:31 - INFO - openue.data.utils -   Loading features from cached file ./dataset/ske/cached_dev_BertTokenizerFast_seq
01/04/2022 07:54:32 - INFO - openue.data.utils -   Loading features from cached file ./dataset/ske/cached_test_BertTokenizerFast_seq
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'Test/f1': 0.9087914114351351}
--------------------------------------------------------------------------------


[{'Test/f1': 0.9087914114351351}]

In [7]:
parser = _setup_parser()
# Parse an explicit empty argument list so the namespace starts from the parser defaults.
args = parser.parse_args(args=[])

path = "OpenUE/config/run_ner.yaml"
# Load the hyperparameter settings from the YAML config. The context manager closes the
# file handle, which a bare `open(path)` passed straight to yaml.load never does.
with open(path, encoding="utf-8") as config_file:
    # NOTE(review): FullLoader can build more than plain data types; acceptable for the
    # repository's own config, but prefer yaml.safe_load for files you do not control.
    # An empty YAML document loads as None, so fall back to an empty mapping.
    opt = yaml.load(config_file, Loader=yaml.FullLoader) or {}
# Overlay the config values on top of the parser defaults.
vars(args).update(opt)



# Seed NumPy and PyTorch with the configured seed.
np.random.seed(args.seed)
torch.manual_seed(args.seed)
# Resolve the classes named in the (config-updated) arguments.
data_class = _import_class(f"openue.data.{args.data_class}")
model_class = _import_class(f"openue.models.{args.model_class}")
litmodel_class = _import_class(f"openue.lit_models.{args.litmodel_class}")

data = data_class(args)
lit_model = litmodel_class(args=args, data_config=data.get_config())
# NOTE(review): "/logs" is an absolute path at the filesystem root — confirm this is
# intended rather than a relative "logs" directory.
logger = pl.loggers.TensorBoardLogger("/logs")
if args.wandb:
    # Weights & Biases replaces the TensorBoard logger when --wandb is set.
    logger = pl.loggers.WandbLogger(project="openue demo")
    logger.log_hyperparams(vars(args))


# Stop once "Eval/f1" has not improved for 5 checks, and checkpoint on the same metric.
early_callback = pl.callbacks.EarlyStopping(monitor="Eval/f1", mode="max", patience=5)
model_checkpoint = pl.callbacks.ModelCheckpoint(monitor="Eval/f1", mode="max",
    filename='{epoch}-{Eval/f1:.2f}',
    dirpath="output",
    save_weights_only=True
)
callbacks = [early_callback, model_checkpoint]
trainer = pl.Trainer.from_argparse_args(args, callbacks=callbacks, logger=logger, 
                                        default_root_dir="training/logs")
trainer.fit(lit_model, datamodule=data)
trainer.test(lit_model, datamodule=data)


# NOTE(review): the target path below still says "seq_model" although this cell loads
# run_ner.yaml — pick a distinct directory before enabling it.
# _save_model(litmodel=lit_model, tokenizer=data.tokenizer, path="seq_model")

01/04/2022 08:16:58 - INFO - openue.data.data_module -   add total special tokens: 50 
 ['[relation0]', '[relation1]', '[relation2]', '[relation3]', '[relation4]', '[relation5]', '[relation6]', '[relation7]', '[relation8]', '[relation9]', '[relation10]', '[relation11]', '[relation12]', '[relation13]', '[relation14]', '[relation15]', '[relation16]', '[relation17]', '[relation18]', '[relation19]', '[relation20]', '[relation21]', '[relation22]', '[relation23]', '[relation24]', '[relation25]', '[relation26]', '[relation27]', '[relation28]', '[relation29]', '[relation30]', '[relation31]', '[relation32]', '[relation33]', '[relation34]', '[relation35]', '[relation36]', '[relation37]', '[relation38]', '[relation39]', '[relation40]', '[relation41]', '[relation42]', '[relation43]', '[relation44]', '[relation45]', '[relation46]', '[relation47]', '[relation48]', '[relation49]']
Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForNER: ['cls.predictions.t

语料有问题句子比例是 0.0033274025463872264


01/04/2022 08:20:50 - INFO - openue.data.utils -   Creating example from cached file ./dataset/ske/cached_dev_BertTokenizerFast.examples
01/04/2022 08:20:52 - INFO - openue.data.utils -   Creating features from dataset file at ./dataset/ske
01/04/2022 08:20:52 - INFO - openue.data.utils -   Writing example 0 of 10000
01/04/2022 08:20:52 - INFO - openue.data.utils -   InputExample(text_id=0, words='查尔斯·阿兰基斯（Charles Aránguiz），1989年4月17日出生于智利圣地亚哥，智利职业足球运动员，司职中场，效力于德国足球甲级联赛勒沃库森足球俱乐部', triples=[['查尔斯·阿兰基斯', '出生地', '圣地亚哥'], ['查尔斯·阿兰基斯', '出生日期', '1989年4月17日']])
01/04/2022 08:20:52 - INFO - openue.data.utils -   {'input_ids': [101, 3389, 2209, 3172, 185, 7350, 1065, 1825, 3172, 8020, 10403, 100, 8021, 8024, 8528, 2399, 125, 3299, 8126, 3189, 1139, 4495, 754, 3255, 1164, 1760, 1765, 762, 1520, 8024, 3255, 1164, 5466, 689, 6639, 4413, 6817, 1220, 1447, 8024, 1385, 5466, 704, 1767, 8024, 3126, 1213, 754, 2548, 1744, 6639, 4413, 4508, 5277, 5468, 6612, 1239, 3753, 2417, 3481, 6639, 4413, 936, 727,

语料有问题句子比例是 0.0024


01/04/2022 08:21:02 - INFO - openue.data.utils -   Creating example from cached file ./dataset/ske/cached_test_BertTokenizerFast.examples
01/04/2022 08:21:06 - INFO - openue.data.utils -   Creating features from dataset file at ./dataset/ske
01/04/2022 08:21:06 - INFO - openue.data.utils -   Writing example 0 of 11639
01/04/2022 08:21:06 - INFO - openue.data.utils -   InputExample(text_id=0, words='梅亭社区是广东省深圳市福田区梅林街道办事处所辖的一个居民小区，社区面积1.5平方公里，总人口18567人', triples=[['梅亭社区', '人口数量', '18567人'], ['梅亭社区', '面积', '1.5平方公里']])
01/04/2022 08:21:06 - INFO - openue.data.utils -   {'input_ids': [101, 3449, 777, 4852, 1277, 3221, 2408, 691, 4689, 3918, 1766, 2356, 4886, 4506, 1277, 3449, 3360, 6125, 6887, 1215, 752, 1905, 2792, 6785, 4638, 671, 702, 2233, 3696, 2207, 1277, 8024, 4852, 1277, 7481, 4916, 122, 119, 126, 2398, 3175, 1062, 7027, 8024, 2600, 782, 1366, 9560, 9411, 782, 102, 21135, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

语料有问题句子比例是 0.004038147607182747


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  cpuset_checked))

  | Name    | Type              | Params
----------------------------------------------
0 | loss_fn | BCEWithLogitsLoss | 0     
1 | model   | BertForNER        | 102 M 
----------------------------------------------
102 M     Trainable params
0         Non-trainable params
102 M     Total params
409.249   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

01/04/2022 08:21:45 - INFO - openue.data.utils -   Creating example from cached file ./dataset/ske/cached_train_BertTokenizerFast.examples
01/04/2022 08:22:32 - INFO - openue.data.utils -   Creating features from dataset file at ./dataset/ske
01/04/2022 08:22:32 - INFO - openue.data.utils -   Writing example 0 of 173108
01/04/2022 08:22:32 - INFO - openue.data.utils -   InputExample(text_id=0, words='如何演好自己的角色，请读《演员自我修养》《喜剧之王》周星驰崛起于穷困潦倒之中的独门秘笈', triples=[['喜剧之王', '主演', '周星驰']])
01/04/2022 08:22:32 - INFO - openue.data.utils -   {'input_ids': [101, 1963, 862, 4028, 1962, 5632, 2346, 4638, 6235, 5682, 8024, 6435, 6438, 517, 4028, 1447, 5632, 2769, 934, 1075, 518, 517, 1599, 1196, 722, 4374, 518, 1453, 3215, 7720, 2307, 6629, 754, 4956, 1737, 4057, 948, 722, 704, 4638, 4324, 7305, 4908, 5007, 102, 21133, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], 'attention_mask': [1

语料有问题句子比例是 0.0033274025463872264


01/04/2022 08:25:28 - INFO - openue.data.utils -   Creating example from cached file ./dataset/ske/cached_dev_BertTokenizerFast.examples
01/04/2022 08:25:31 - INFO - openue.data.utils -   Creating features from dataset file at ./dataset/ske
01/04/2022 08:25:31 - INFO - openue.data.utils -   Writing example 0 of 10000
01/04/2022 08:25:31 - INFO - openue.data.utils -   InputExample(text_id=0, words='查尔斯·阿兰基斯（Charles Aránguiz），1989年4月17日出生于智利圣地亚哥，智利职业足球运动员，司职中场，效力于德国足球甲级联赛勒沃库森足球俱乐部', triples=[['查尔斯·阿兰基斯', '出生地', '圣地亚哥'], ['查尔斯·阿兰基斯', '出生日期', '1989年4月17日']])
01/04/2022 08:25:31 - INFO - openue.data.utils -   {'input_ids': [101, 3389, 2209, 3172, 185, 7350, 1065, 1825, 3172, 8020, 10403, 100, 8021, 8024, 8528, 2399, 125, 3299, 8126, 3189, 1139, 4495, 754, 3255, 1164, 1760, 1765, 762, 1520, 8024, 3255, 1164, 5466, 689, 6639, 4413, 6817, 1220, 1447, 8024, 1385, 5466, 704, 1767, 8024, 3126, 1213, 754, 2548, 1744, 6639, 4413, 4508, 5277, 5468, 6612, 1239, 3753, 2417, 3481, 6639, 4413, 936, 727,

语料有问题句子比例是 0.0024


01/04/2022 08:25:44 - INFO - openue.data.utils -   Creating example from cached file ./dataset/ske/cached_test_BertTokenizerFast.examples
01/04/2022 08:25:47 - INFO - openue.data.utils -   Creating features from dataset file at ./dataset/ske
01/04/2022 08:25:47 - INFO - openue.data.utils -   Writing example 0 of 11639
01/04/2022 08:25:47 - INFO - openue.data.utils -   InputExample(text_id=0, words='梅亭社区是广东省深圳市福田区梅林街道办事处所辖的一个居民小区，社区面积1.5平方公里，总人口18567人', triples=[['梅亭社区', '人口数量', '18567人'], ['梅亭社区', '面积', '1.5平方公里']])
01/04/2022 08:25:47 - INFO - openue.data.utils -   {'input_ids': [101, 3449, 777, 4852, 1277, 3221, 2408, 691, 4689, 3918, 1766, 2356, 4886, 4506, 1277, 3449, 3360, 6125, 6887, 1215, 752, 1905, 2792, 6785, 4638, 671, 702, 2233, 3696, 2207, 1277, 8024, 4852, 1277, 7481, 4916, 122, 119, 126, 2398, 3175, 1062, 7027, 8024, 2600, 782, 1366, 9560, 9411, 782, 102, 21135, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

语料有问题句子比例是 0.004038147607182747


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'Test/f1': 0.7404730858937392}
--------------------------------------------------------------------------------


[{'Test/f1': 0.7404730858937392}]

In [8]:
# Display the final argument namespace (parser defaults overlaid with the YAML config values).
args

Namespace(accumulate_grad_batches=1, batch_size=16, check_val_every_n_epoch=1, checkpoint_callback=True, data_class='REDataset', data_dir='./dataset/ske', gpus='0,', limit_predict_batches=1.0, limit_test_batches=1.0, limit_train_batches=1.0, limit_val_batches=1.0, litmodel_class='RELitModel', load_checkpoint=None, logger=True, lr=3e-05, max_epochs=5, max_seq_length=128, max_steps=32, max_time=None, min_epochs=None, min_steps=None, model_class='BertForNER', model_name_or_path='bert-base-chinese', model_type='bert', move_metrics_to_cpu=False, multiple_trainloader_mode='max_size_cycle', num_nodes=1, num_processes=1, num_sanity_val_steps=2, num_workers=8, optimizer='AdamW', overfit_batches=0.0, overwrite_cache=True, plugins=None, precision=32, prepare_data_per_node=True, process_position=0, profiler=None, progress_bar_refresh_rate=None, reload_dataloaders_every_epoch=False, replace_sampler_ddp=True, resume_from_checkpoint=None, seed=42, stochastic_weight_avg=False, sync_batchnorm=False, ta