In [1]:
import functools
import gc
import os
import sys

import copy
import copydf
import datasets
import decouple
import einops
import numpy as np
import pandas as pd
import transformers as tfs
import torch
import tqdm

import data_collator
import modeling_bert
import utils

# HiBert (choose one)

In [2]:
# HiBert variant settings.
# data_path is the JSON-lines file handed to the dataset loader; output_dir
# receives the training output and the embeddings directory.
data_path = './project_dir/cse599/data/preprocessed_data.jsonl'
output_dir = './project_dir/cse599/hibert'
# Checkpoint read by the model-loading cell when the hierarchical branch is taken.
pretrained_hibert_model_path = './project_dir/cse599/hibert/checkpoint-500'
# Selects the hierarchical branch in the model-loading cell.
is_hierarchical = True

# Bert (choose one)

In [None]:
# Bert baseline settings.
# data_path is the JSON-lines file handed to the dataset loader; output_dir
# receives the embeddings directory for this variant.
data_path = './project_dir/cse599/data/preprocessed_abstracts.jsonl'
output_dir = './project_dir/cse599/bert'
# Selects the non-hierarchical branch in the model-loading cell.
is_hierarchical = False

In [3]:
# Paths shared by both variants: the pretrained base model directory and the
# JSON config used for the coordinator.
model_path = './project_dir/pretrained_models/allenai/specter'
coordinator_config_path = './ctx-hibert-absolute-pos-config.json'

# Embeddings go in an 'embs' folder beneath the selected variant's output_dir.
# exist_ok=True makes re-running this cell harmless.
emb_dir = os.path.join(output_dir, 'embs')
os.makedirs(emb_dir, exist_ok=True)

In [4]:
# The tokenizer is loaded from the same directory as the pretrained base model.
tokenizer = tfs.AutoTokenizer.from_pretrained(model_path)
# NOTE(review): 128 is presumably a maximum token length per unit - confirm
# against utils.get_preprocess_fn.
preprocess = utils.get_preprocess_fn(tokenizer, 128)

# Load the JSON-lines corpus and apply the preprocessing function in batches.
doc_ds = datasets.load_dataset('json', data_files=data_path)
ds = doc_ds.map(preprocess, batched=True)

# Return the two model-input columns as torch tensors.
ds.set_format(type='torch', columns=['input_ids', 'attention_mask'])

Using custom data configuration default-2507f9f70be6dc2f
Reusing dataset json (/homes/roylu/.cache/huggingface/datasets/json/default-2507f9f70be6dc2f/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at /homes/roylu/.cache/huggingface/datasets/json/default-2507f9f70be6dc2f/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253/cache-277494dfe93753b5.arrow


In [5]:
# Collator settings gathered in one mapping so the masking parameters can be
# read (and tuned) at a glance before the collator is built.
collator_options = {
    'tokenizer': tokenizer,
    'mlm_probability': 0.15,
    'msm_probability': 0.15,
    'max_num_turns': 32,
    'mask_whole_sentence': True,
}
dc = data_collator.DataCollatorForWholeWordMaskAndWholeSentenceMask(
    **collator_options)

# Train (HiBert)
Skip this section if using Bert.

In [6]:
# Training hyper-parameters: one example per device per step, gradients
# accumulated over 16 steps, 200 epochs at a learning rate of 1e-5.
args = tfs.TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    learning_rate=1e-5,
    num_train_epochs=200,
)

# The Trainer is handed a factory rather than a model instance; the partial
# binds the tokenizer and both paths that the factory is given.
model_init_fn = functools.partial(
    utils.ctx_hibert_model_init,
    coordinator_config_path=coordinator_config_path,
    model_path=model_path,
    tokenizer=tokenizer,
)

trainer = tfs.Trainer(
    model_init=model_init_fn,
    args=args,
    data_collator=dc,
    train_dataset=ds['train'],
)

comet_ml is installed but `COMET_API_KEY` is not set.
loading configuration file ./ctx-hibert-absolute-pos-config.json
Model config BertConfig {
  "_name_or_path": "./ctx-hibert-absolute-pos-config.json",
  "add_absolute_position_embeddings": true,
  "add_ctx_pooled_output_to_tokens": true,
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 300,
  "initializer_range": 0.005,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 2,
  "num_hidden_layers": 1,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.21.3",
  "type_vocab_size": 2,
  "use_cache": false,
  "vocab_size": 30522
}

loading configuration file ./project_dir/pretrained_models/allenai/specter/config.json
Mode

In [7]:
# Run the training loop configured above, then write the trained model into
# output_dir. The model is reached through `trainer.model` (no extra global is
# bound to it) so the cleanup cell that follows can drop the last reference.
trainer.train()
trainer.model.save_pretrained(output_dir)

loading configuration file ./ctx-hibert-absolute-pos-config.json
Model config BertConfig {
  "_name_or_path": "./ctx-hibert-absolute-pos-config.json",
  "add_absolute_position_embeddings": true,
  "add_ctx_pooled_output_to_tokens": true,
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 300,
  "initializer_range": 0.005,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 2,
  "num_hidden_layers": 1,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.21.3",
  "type_vocab_size": 2,
  "use_cache": false,
  "vocab_size": 30522
}

loading configuration file ./project_dir/pretrained_models/allenai/specter/config.json
Model config BertConfig {
  "_name_or_path": "./project_di

Step,Training Loss


In [None]:
# Drop the trainer and the model it holds, then ask Python and the CUDA
# caching allocator to release what they can.
#
# The deletions are guarded so that this cell is a no-op, rather than a
# NameError/AttributeError, when the training section was skipped (Bert
# variant) or when the cell is executed more than once.
if 'trainer' in globals():
    if hasattr(trainer, 'model'):
        del trainer.model
    del trainer
gc.collect()
torch.cuda.empty_cache()

# Extract embeddings

In [None]:
# GPU index used for the model here and passed to the embedding helper later.
cuda_device = 0

# Both configs are loaded up front; each branch below needs the pair.
coordinator_config = tfs.AutoConfig.from_pretrained(coordinator_config_path)
model_config = tfs.AutoConfig.from_pretrained(model_path)

if not is_hierarchical:
    # Bert baseline: construct the wrapper from the configs alone, then replace
    # its inner `bert` module with the pretrained model loaded from model_path.
    model = modeling_bert.HierarchicalBertForMaskedLM(
        config=model_config, coordinator_config=coordinator_config)
    bert = tfs.AutoModel.from_pretrained(model_path)
    model.hibert.bert = bert
else:
    # HiBert: restore the whole model from the chosen training checkpoint.
    model = modeling_bert.HierarchicalBertForMaskedLM.from_pretrained(
        pretrained_hibert_model_path,
        config=model_config,
        coordinator_config=coordinator_config)

# From here on `model` refers only to the `hibert` submodule.
model = model.hibert

# Move to the GPU and switch to evaluation mode; assigning to `_` keeps the
# returned module out of the cell output.
_ = model.cuda(cuda_device)
_ = model.eval()

# Extracts embeddings.

In [None]:
# Compute embeddings for the training split with the loaded model and the
# collator defined earlier, then store them under emb_dir with the 'train' tag.
print('Processing training set.')
train_split = ds['train']
train_output = utils.get_turn_embeddings(
    model,
    train_split,
    dc,
    slide=None,
    cuda_device=cuda_device)
utils.save_embeddings(emb_dir, 'train', train_output)

# Loads extracted embeddings.

In [None]:
# Read the 'train' embeddings back from emb_dir, so later work can start here
# without repeating the extraction cell.
train_output = utils.load_embeddings(emb_dir, 'train')