<a href="https://colab.research.google.com/github/areias/bert_covid_sentiment/blob/main/bert_predict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -r drive/MyDrive/covid-twitter-bert/requirements.txt

Collecting tensorflow==2.2.0
  Downloading tensorflow-2.2.0-cp37-cp37m-manylinux2010_x86_64.whl (516.2 MB)
[K     |████████████████████████████████| 516.2 MB 3.3 kB/s 
Collecting tensorflow_addons==0.11.2
  Downloading tensorflow_addons-0.11.2-cp37-cp37m-manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 62.1 MB/s 
Collecting emoji
  Downloading emoji-1.6.1.tar.gz (170 kB)
[K     |████████████████████████████████| 170 kB 64.4 MB/s 
[?25hCollecting unidecode
  Downloading Unidecode-1.3.2-py3-none-any.whl (235 kB)
[K     |████████████████████████████████| 235 kB 67.1 MB/s 
[?25hCollecting cloud-tpu-client
  Downloading cloud_tpu_client-0.10-py3-none-any.whl (7.4 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 81.4 MB/s 
Collecting gast==0.3.3
  Downloading gast-0.3.3-py2.py3-none-any.whl (9.7 kB)
Collecting tensorboard<

In [3]:
import tensorflow
print(tensorflow.__version__)

2.2.0


In [4]:
import tensorflow_addons
print(tensorflow_addons.__version__)

0.11.2


In [5]:
import sys
sys.path.append('drive/MyDrive/covid-twitter-bert')
sys.path.append('drive/MyDrive/covid-twitter-bert/tensorflow_models')


In [6]:
from official.utils.misc import distribution_utils
from official.nlp.bert import bert_models
from official.nlp.bert import configs as bert_configs
from official.nlp.bert import tokenization
from official.nlp.bert.input_pipeline import single_file_dataset

import os
import datetime
import time
import argparse
import logging
from tqdm import tqdm
import json
import tensorflow as tf
from utils.misc import ArgParseDefault, add_bool_arg, save_to_json
from config import PRETRAINED_MODELS
import collections
import pandas as pd

In [7]:
VOCAB_PATH = 'drive/MyDrive/covid-twitter-bert/vocabs'

In [8]:
# start time
s_time = time.time()


In [9]:
from collections import namedtuple
arguments = namedtuple('arguments', ['run_name','model_class','use_tf_hub',
                                     'eval_batch_size'])

args = arguments('run_2021-11-24_16-39-54_269137_test_run','covid-twitter-bert', True,
                 8)
args

arguments(run_name='run_2021-11-24_16-39-54_269137_test_run', model_class='covid-twitter-bert', use_tf_hub=True, eval_batch_size=8)

In [10]:
# paths
run_dir = os.path.join('drive/MyDrive/covid-twitter-bert/data/finetune/',args.run_name,'crowdbreaks')
ts = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S_%f')
output_folder = os.path.join('drive/MyDrive/covid-twitter-bert/data/finetune/', args.run_name,'crowdbreaks', 'predictions', f'predictions_{ts}')
predictions_output_folder = os.path.join('drive/MyDrive/covid-twitter-bert/data/finetune/', args.run_name,'crowdbreaks','predictions', f'predictions_{ts}', 'predictions')

In [11]:
print(run_dir)
print(output_folder)
print(predictions_output_folder)

drive/MyDrive/covid-twitter-bert/data/finetune/run_2021-11-24_16-39-54_269137_test_run/crowdbreaks
drive/MyDrive/covid-twitter-bert/data/finetune/run_2021-11-24_16-39-54_269137_test_run/crowdbreaks/predictions/predictions_2021-11-25_15-19-57_125895
drive/MyDrive/covid-twitter-bert/data/finetune/run_2021-11-24_16-39-54_269137_test_run/crowdbreaks/predictions/predictions_2021-11-25_15-19-57_125895/predictions


In [12]:
if not os.path.isdir(predictions_output_folder):
    os.makedirs(predictions_output_folder)

In [13]:
def read_run_log(run_dir):
    with tf.io.gfile.GFile(os.path.join(run_dir, 'run_logs.json'), 'rb') as reader:
        run_log = json.loads(reader.read().decode('utf-8'))
    return run_log

In [14]:
def get_model_config_path(args):
    try:
        config_path = PRETRAINED_MODELS[args.model_class]['config']
    except KeyError:
        raise ValueError(f'Could not find a pretrained model matching the model class {args.model_class}')
    return os.path.join('drive/MyDrive/covid-twitter-bert/configs', config_path)

In [15]:
def get_model_config(config_path):
    config = bert_configs.BertConfig.from_json_file(config_path)
    return config

In [16]:
# read configs
#logger.info(f'Reading run configs...')
run_log = read_run_log(run_dir)
pretrained_model_config_path = get_model_config_path(args)
model_config = get_model_config(pretrained_model_config_path)
max_seq_length = run_log['max_seq_length']
label_mapping = run_log['label_mapping']
num_labels = len(label_mapping)

In [17]:
def get_tokenizer(model_class):
    model = PRETRAINED_MODELS[model_class]
    vocab_file = os.path.join(VOCAB_PATH, model['vocab_file'])
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=model['lower_case'])
    return tokenizer

In [18]:
 # load tokenizer
#logger.info(f'Loading tokenizer...')
tokenizer = get_tokenizer(args.model_class)

In [19]:
tokenizer

<official.nlp.bert.tokenization.FullTokenizer at 0x7fd51a5d41d0>

In [20]:
def get_model(args, model_config, num_labels, max_seq_length):
    if args.use_tf_hub and PRETRAINED_MODELS[args.model_class]['is_tfhub_model']:
        hub_module_url = f"https://tfhub.dev/{PRETRAINED_MODELS[args.model_class]['hub_url']}"
        hub_module_trainable = True
    else:
        hub_module_url = None
        hub_module_trainable = False
    classifier_model, _ = bert_models.classifier_model(
            model_config,
            num_labels,
            max_seq_length,
            hub_module_url=hub_module_url,
            hub_module_trainable=hub_module_trainable)
    return classifier_model

In [21]:
model_config.to_dict()

{'attention_probs_dropout_prob': 0.1,
 'backward_compatible': True,
 'hidden_act': 'gelu',
 'hidden_dropout_prob': 0.1,
 'hidden_size': 1024,
 'initializer_range': 0.02,
 'intermediate_size': 4096,
 'max_position_embeddings': 512,
 'num_attention_heads': 16,
 'num_hidden_layers': 24,
 'type_vocab_size': 2,
 'vocab_size': 30522}

In [22]:
# load model
#logger.info(f'Loading model...')
model = get_model(args, model_config, num_labels, max_seq_length)

In [23]:
 # restore fine-tuned run
checkpoint_path = os.path.join(run_dir, 'checkpoint')
#logger.info(f'Restore run checkpoint {checkpoint_path}...')

In [24]:
# load weights (expect partial state because we don't want need the optimizer state)
try:
    model.load_weights(checkpoint_path).expect_partial()
except:
    print(f'Restoring from checkpoint unsuccessful. Use the flag --use_tf_hub if the TFHub was used to initialize the model.')
else:
    print(f'... successfully restored checkpoint')
   

... successfully restored checkpoint


In [25]:
# predict
num_predictions = 0
predictions = []


In [26]:
def get_tfrecord_dataset(input_file, eval_batch_size, max_seq_length):
    def _dataset_fn(ctx=None):
        """Returns tf.data.Dataset for distributed prediction."""
        batch_size = ctx.get_per_replica_batch_size(eval_batch_sizeglobal_batch_size) if ctx else eval_batch_size
        dataset = create_tfrecord_dataset_pipeline(input_file, max_seq_length, batch_size, input_pipeline_context=ctx)
        return dataset
    return _dataset_fn

In [27]:
run_dir

'drive/MyDrive/covid-twitter-bert/data/finetune/run_2021-11-24_16-39-54_269137_test_run/crowdbreaks'

In [28]:
def create_tfrecord_dataset_pipeline(input_file, max_seq_length, batch_size, input_pipeline_context=None):
    name_to_features = {
        'input_word_ids': tf.io.FixedLenFeature([max_seq_length], tf.int64),
        'input_mask': tf.io.FixedLenFeature([max_seq_length], tf.int64),
        'input_type_ids': tf.io.FixedLenFeature([max_seq_length], tf.int64),
    }
    dataset = single_file_dataset(input_file, name_to_features)
    # shard dataset between hosts
    if input_pipeline_context and input_pipeline_context.num_input_pipelines > 1:
        dataset = dataset.shard(input_pipeline_context.num_input_pipelines, input_pipeline_context.input_pipeline_id)
    dataset = dataset.batch(batch_size, drop_remainder=False)
    dataset = dataset.prefetch(1024)
    return dataset

In [29]:
"""#elif args.input_tfrecord_files:
s_time_predict = time.time()
for input_file_pattern in args.input_tfrecord_files:
    for input_file in tf.io.gfile.glob(input_file_pattern):
        logger.info(f'Processing file {input_file}')"""
input_file=run_dir+"/tfrecords/dev.tfrecords"
dataset = get_tfrecord_dataset(input_file, args.eval_batch_size, max_seq_length)()


In [30]:
dataset

<PrefetchDataset shapes: {input_mask: (None, 96), input_type_ids: (None, 96), input_word_ids: (None, 96)}, types: {input_mask: tf.int32, input_type_ids: tf.int32, input_word_ids: tf.int32}>

In [31]:
num_batches = sum(1 for _ in tf.data.TFRecordDataset(input_file).batch(args.eval_batch_size))
f_out_name = os.path.basename(input_file).split('.')[-2]
f_out = os.path.join(predictions_output_folder, f'{f_out_name}.jsonl')

In [32]:
num_batches

85

In [33]:
f_out_name

'dev'

In [34]:
f_out

'drive/MyDrive/covid-twitter-bert/data/finetune/run_2021-11-24_16-39-54_269137_test_run/crowdbreaks/predictions/predictions_2021-11-25_15-19-57_125895/predictions/dev.jsonl'

In [35]:
for batch in tqdm(dataset, total=num_batches, unit='batch'):
                preds = model.predict(batch)
                preds = format_prediction(preds, label_mapping, args.label_name)
                num_predictions += len(preds)
                with open(f_out, 'a') as f:
                    for pred in preds:
                        f.write(json.dumps(pred) + '\n')

  0%|          | 0/85 [00:00<?, ?batch/s]


InvalidArgumentError: ignored