# Starting the BERT server for embedding retrieval

In [None]:
# Install Hugging Face transformers; it provides the AutoModel class used by
# the checkpoint-conversion cell below.
!pip install transformers



In [None]:
# Select TensorFlow 1.x: the conversion code below uses TF1-only APIs
# (tf.Session, tf.get_variable, tf.reset_default_graph).
%tensorflow_version 1.x

TensorFlow 1.x selected.


# Here we don't have the model checkpoint anymore but you can generate it with the SciBERT notebook.

In [None]:
# !gdown https://drive.google.com/uc?id=1jYkmAhdEdGwZs2kgcwWMVz6x0htnTIJu

Downloading...
From: https://drive.google.com/uc?id=1jYkmAhdEdGwZs2kgcwWMVz6x0htnTIJu
To: /content/model.ckpt
1.27GB [00:06, 189MB/s]


In [None]:
"""Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint."""

import os
import argparse
import torch
import numpy as np
import tensorflow as tf
from transformers import AutoModel


def convert_pytorch_checkpoint_to_tf(model, ckpt_dir, model_name):
    """Export the weights of a PyTorch BERT model as a TensorFlow 1.x checkpoint.

    Every entry of ``model.state_dict()`` is renamed to the TensorFlow BERT
    naming scheme, re-created as a TF variable holding the same values, and
    the resulting variables are written with ``tf.train.Saver``.

    Args:
        model: PyTorch module exposing ``state_dict()``.
        ckpt_dir: Directory under which the checkpoint is written; created
            when missing.
        model_name: Used to build the checkpoint file name: ``-`` is replaced
            by ``_`` and ``.ckpt`` is appended. It may contain ``/`` (e.g.
            ``allenai/...``), in which case the checkpoint lands in the
            matching sub-directory of ``ckpt_dir``.
    """
    # Parameters whose name contains one of these fragments are transposed
    # before being copied into the TF variable.
    tensors_to_transpose = (
        "dense.weight",
        "attention.self.query",
        "attention.self.key",
        "attention.self.value"
    )

    # Ordered (pattern, replacement) pairs applied one after another; the
    # order matters (e.g. '.' -> '/' must run before the 'LayerNorm/...' rules).
    var_map = (
        ('layer.', 'layer_'),
        ('word_embeddings.weight', 'word_embeddings'),
        ('position_embeddings.weight', 'position_embeddings'),
        ('token_type_embeddings.weight', 'token_type_embeddings'),
        ('.', '/'),
        ('LayerNorm/weight', 'LayerNorm/gamma'),
        ('LayerNorm/bias', 'LayerNorm/beta'),
        ('weight', 'kernel')
    )

    ckpt_path = os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt")
    os.makedirs(ckpt_dir, exist_ok=True)
    # model_name may contain '/', so the checkpoint's real parent directory
    # can be deeper than ckpt_dir; make sure it exists before saving.
    ckpt_parent = os.path.dirname(ckpt_path)
    if ckpt_parent:
        os.makedirs(ckpt_parent, exist_ok=True)

    state_dict = model.state_dict()

    def to_tf_var_name(name:str):
        """Translate a PyTorch parameter name into its ``bert/...`` TF name."""
        for patt, repl in var_map:
            name = name.replace(patt, repl)
        return 'bert/{}'.format(name)

    def create_tf_var(tensor:np.ndarray, name:str, session:tf.Session):
        """Create and zero-initialise a TF variable shaped and typed like ``tensor``."""
        tf_dtype = tf.dtypes.as_dtype(tensor.dtype)
        tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer())
        session.run(tf.variables_initializer([tf_var]))
        return tf_var

    tf.reset_default_graph()
    with tf.Session() as session:
        for var_name in state_dict:
            tf_name = to_tf_var_name(var_name)
            # detach()/cpu() make the conversion work for tensors that live on
            # a GPU or still track gradients; .numpy() alone raises on those.
            torch_tensor = state_dict[var_name].detach().cpu().numpy()
            if any(x in var_name for x in tensors_to_transpose):
                torch_tensor = torch_tensor.T
            tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session)
            tf.keras.backend.set_value(tf_var, torch_tensor)
            # Read the value back to confirm the copy succeeded.
            tf_weight = session.run(tf_var)
            print("Successfully created {}: {}".format(tf_name, np.allclose(tf_weight, torch_tensor)))

        saver = tf.train.Saver(tf.trainable_variables())
        saver.save(session, ckpt_path)

In [None]:
pytorch_model_path = 'model.ckpt'
model_name = 'allenai/scibert_scivocab_uncased'
tf_cache_dir = 'tf_model/'

# The saved file is a training checkpoint, not a bare state dict: passing it
# directly made from_pretrained report 'iter', 'model_state_dict' and
# 'optimizer_state_dict' as unused and leave every BERT weight newly
# initialised. Unwrap the model weights first, and load on CPU so the
# conversion does not depend on the device the checkpoint was saved from.
checkpoint = torch.load(pytorch_model_path, map_location="cpu")
if isinstance(checkpoint, dict) and "model_state_dict" in checkpoint:
    state_dict = checkpoint["model_state_dict"]
else:
    state_dict = checkpoint
# NOTE(review): check the from_pretrained warnings after this change; if the
# stored keys carry a prefix the model does not expect, they will still be
# listed as unused and must be renamed before conversion.

model = AutoModel.from_pretrained(
    pretrained_model_name_or_path=model_name,
    state_dict=state_dict,
)
print("after model")

convert_pytorch_checkpoint_to_tf(
    model=model,
    ckpt_dir=tf_cache_dir,
    model_name=model_name
)

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['iter', 'model_state_dict', 'optimizer_state_dict']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['embeddings.word_embeddings.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.LayerNorm.weight', 'embeddings.LayerNorm.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.laye

after model
Successfully created bert/embeddings/position_ids: True
Successfully created bert/embeddings/word_embeddings: True
Successfully created bert/embeddings/position_embeddings: True
Successfully created bert/embeddings/token_type_embeddings: True
Successfully created bert/embeddings/LayerNorm/gamma: True
Successfully created bert/embeddings/LayerNorm/beta: True
Successfully created bert/encoder/layer_0/attention/self/query/kernel: True
Successfully created bert/encoder/layer_0/attention/self/query/bias: True
Successfully created bert/encoder/layer_0/attention/self/key/kernel: True
Successfully created bert/encoder/layer_0/attention/self/key/bias: True
Successfully created bert/encoder/layer_0/attention/self/value/kernel: True
Successfully created bert/encoder/layer_0/attention/self/value/bias: True
Successfully created bert/encoder/layer_0/attention/output/dense/kernel: True
Successfully created bert/encoder/layer_0/attention/output/dense/bias: True
Successfully created bert/en

In [None]:
# Download the SciBERT (scivocab, uncased) TensorFlow model archive.
!wget https://s3-us-west-2.amazonaws.com/ai2-s2-research/scibert/tensorflow_models/scibert_scivocab_uncased.tar.gz

--2021-02-09 17:02:37--  https://s3-us-west-2.amazonaws.com/ai2-s2-research/scibert/tensorflow_models/scibert_scivocab_uncased.tar.gz
Resolving s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)... 52.218.178.24
Connecting to s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)|52.218.178.24|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1216161420 (1.1G) [application/x-tar]
Saving to: ‘scibert_scivocab_uncased.tar.gz’


2021-02-09 17:03:09 (36.9 MB/s) - ‘scibert_scivocab_uncased.tar.gz’ saved [1216161420/1216161420]



In [None]:
# Unpack the archive; the server started below reads ./scibert_scivocab_uncased.
!tar -xf scibert_scivocab_uncased.tar.gz

In [None]:
# Install bert-as-service: the client library, and the server with its
# optional "http" extra.
!pip install bert-serving-client
!pip install -U bert-serving-server[http]

Collecting bert-serving-client
  Downloading https://files.pythonhosted.org/packages/1f/09/aae1405378a848b2e87769ad89a43d6d71978c4e15534ca48e82e723a72f/bert_serving_client-1.10.0-py2.py3-none-any.whl
Installing collected packages: bert-serving-client
Successfully installed bert-serving-client-1.10.0
Collecting bert-serving-server[http]
[?25l  Downloading https://files.pythonhosted.org/packages/b0/bd/cab677bbd0c5fb08b72e468371d2bca6ed9507785739b4656b0b5470d90b/bert_serving_server-1.10.0-py3-none-any.whl (61kB)
[K     |████████████████████████████████| 71kB 10.5MB/s 
Collecting GPUtil>=1.3.0
  Downloading https://files.pythonhosted.org/packages/ed/0e/5c61eedde9f6c87713e89d794f01e378cfd9565847d4576fa627d758c554/GPUtil-1.4.0.tar.gz
Collecting flask-compress; extra == "http"
  Downloading https://files.pythonhosted.org/packages/b2/7a/9c4641f975fb9daaf945dc39da6a52fd5693ab3bbc2d53780eab3b5106f4/Flask_Compress-1.8.0-py3-none-any.whl
Collecting flask-cors; extra == "http"
  Downloading https

In [None]:
# Start the BERT *server* in the background (stdout/stderr go to out.file):
# maximum sequence length 256 (longer inputs are trimmed on the right),
# pooling taken from layer -1 only, 4 workers, 80% of GPU memory,
# serving the pretrained model in ./scibert_scivocab_uncased.
!nohup bert-serving-start -max_seq_len 256 -pooling_layer -1 -num_worker=4 -gpu_memory_fraction 0.8 -model_dir=./scibert_scivocab_uncased > out.file 2>&1 &
# Alternative kept for reference: serve the converted checkpoint from ./tf_model/allenai/ instead.
# !bert-serving-start -max_seq_len 512 -pooling_layer -1 -num_worker=2 -tuned_model_dir=./tf_model/allenai/ -ckpt_name=scibert_scivocab_uncased.ckpt -gpu_memory_fraction 0.8 -model_dir=./scibert_scivocab_uncased > out.file 2>&1 &

In [None]:
# Print the server log to confirm that it started and to see its settings.
!cat out.file

I:[35mVENTILATOR[0m:[__i:__i: 67]:freeze, optimize and export graph, could take a while...
usage: /usr/local/bin/bert-serving-start -max_seq_len 256 -pooling_layer -1 -num_worker=4 -gpu_memory_fraction 0.8 -model_dir=./scibert_scivocab_uncased
                 ARG   VALUE
__________________________________________________
           ckpt_name = bert_model.ckpt
         config_name = bert_config.json
                cors = *
                 cpu = False
          device_map = []
       do_lower_case = True
  fixed_embed_length = False
                fp16 = False
 gpu_memory_fraction = 0.8
       graph_tmp_dir = None
    http_max_connect = 10
           http_port = None
        mask_cls_sep = False
      max_batch_size = 256
         max_seq_len = 256
           model_dir = ./scibert_scivocab_uncased
no_position_embeddings = False
    no_special_token = False
          num_worker = 4
       pooling_layer = [-1]
    pooling_strategy = REDUCE_MEAN
                port = 5555
           

In [None]:
from bert_serving.client import BertClient

# Connect to the server started above.
# NOTE(review): check_length=False presumably disables the client-side
# sequence-length check - confirm against the bert-serving-client docs.
bert_client = BertClient(check_length=False)

# Smoke test with one long input (241 tokens) and one very short one; the
# width of a returned vector is reused later to size the accumulators.
list_text = ['you '* 240 + 'only', 'Nan']
embedded_text = bert_client.encode(list_text)
embedding_dim = len(embedded_text[-1])
print(f"Encoding dimension: {embedding_dim}")

Encoding dimension: 768


# Reading our abstracts

In [None]:
# Download the training abstracts (saved as abstract_train_full.csv).
!gdown "https://drive.google.com/uc?id=1w8cCfCd9A_Ph6jIVTs34pVZMOhKTiX0m"

Downloading...
From: https://drive.google.com/uc?id=1w8cCfCd9A_Ph6jIVTs34pVZMOhKTiX0m
To: /content/abstract_train_full.csv
103MB [00:01, 56.7MB/s] 


In [None]:
# Download the test abstracts (saved as abstract_test_full.csv).
!gdown "https://drive.google.com/uc?id=1RqwwBkarpAEZ0Zs1SVxndjI5mhtwZre5"

Downloading...
From: https://drive.google.com/uc?id=1RqwwBkarpAEZ0Zs1SVxndjI5mhtwZre5
To: /content/abstract_test_full.csv
919MB [00:09, 98.1MB/s]


In [None]:
import numpy as np
from tqdm.notebook import tqdm
import pandas as pd

# Load both abstract tables, using the first CSV column as the row index.
df_train, df_test = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ("abstract_train_full.csv", "abstract_test_full.csv")
)

  mask |= (ar1 == a)


In [None]:
# Stack train and test rows, drop the h_index column, and discard rows whose
# abstract is just a newline. Rows with a missing abstract are kept here.
df_author_abstract = pd.concat([df_train, df_test]).drop(columns="h_index")
df_author_abstract = df_author_abstract.loc[df_author_abstract["abstract"].ne("\n")]
df_author_abstract

Unnamed: 0,authorID,abstract
0,7248981,fuelled bring internet things concept real int...
1,7248981,recent advances mobile devices network technol...
2,7248981,several research groups working designing new ...
3,7248981,next generation internet provide ubiquitous co...
4,7248981,recent huge trend towards running network inte...
...,...,...
2081145,2908506980,
2081146,2908506980,
2081147,2908506980,
2081148,2908506980,


In [None]:
def process_batch(group_df, author_to_embeddings, author_count_abstracts):
    """Encode one batch of abstracts and accumulate the results per author.

    Rows whose abstract is missing (NaN/None) are skipped *before* the
    request is sent, so the BERT server never encodes placeholder text and a
    batch made only of missing abstracts triggers no request at all.

    Args:
        group_df: DataFrame slice with an ``authorID`` column and the abstract
            text in its second column (position 1).
        author_to_embeddings: dict ``authorID -> running sum vector``; updated
            in place.
        author_count_abstracts: dict ``authorID -> number of abstracts summed
            so far``; updated in place.

    Returns:
        None. Both dictionaries are mutated.
    """
    abstracts = group_df.iloc[:, 1]
    has_text = abstracts.notna().to_numpy()
    if not has_text.any():
        return

    texts = [str(text) for text in abstracts[has_text]]
    # Relies on the module-level `bert_client` connection.
    embeddings = bert_client.encode(texts)

    author_ids = group_df.loc[has_text, "authorID"]
    for author_id, embedding in zip(author_ids, embeddings):
        author_to_embeddings[author_id] += embedding
        author_count_abstracts[author_id] += 1

In [None]:
n_splits = 1000
# Each row of df_author_abstract is one abstract, so the approximate group
# size is simply rows // groups.
print(f"Total abstracts in each of the {n_splits} groups: {len(df_author_abstract) // n_splits}")

# Build the accumulators once per distinct author (not once per row).
# tolist() keeps the keys as plain Python ints, in order of first appearance.
unique_author_ids = df_author_abstract["authorID"].unique().tolist()
# authorID -> running sum of that author's abstract embeddings, shape (embedding_dim,)
author_to_embeddings = {
    author_id: np.zeros(embedding_dim) for author_id in unique_author_ids
}
# authorID -> number of abstracts accumulated so far
author_count_abstracts = dict.fromkeys(unique_author_ids, 0)

Total abstracts in each of the 1000 groups: 23123


In [None]:
# Cut the table into n_splits consecutive chunks and push each one through
# the BERT server, accumulating per-author sums and counts.
abstract_groups = np.array_split(df_author_abstract, n_splits)
for group_df in tqdm(abstract_groups, total=n_splits):
    process_batch(group_df, author_to_embeddings, author_count_abstracts)

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

In [None]:
# Turn each author's summed embedding into a mean. Authors with no encoded
# abstract keep their all-zero vector.
for author_id in tqdm(author_to_embeddings):
    abstract_count = author_count_abstracts[author_id]
    if abstract_count == 0:
        continue
    author_to_embeddings[author_id] = author_to_embeddings[author_id] / abstract_count

In [None]:
# Persist the authorID -> mean-embedding dict. np.save stores a dict as a
# pickled object array, hence the explicit allow_pickle (its default value).
np.save('author_to_embeddings_ours.npy', author_to_embeddings, allow_pickle=True)

In [None]:
# Reload the saved dict. The file holds a pickled Python object, so
# allow_pickle=True is required (np.load defaults to False since NumPy 1.16.3
# and raises ValueError otherwise). Only load files you created yourself:
# unpickling untrusted data can execute arbitrary code.
author_retreived = np.load('author_to_embeddings_ours.npy', allow_pickle=True).item()
# dict {authorID: mean of that author's abstract embeddings, shape (embedding_dim,)}