In [1]:
import pickle
import os
import json
import numpy as np
import pandas as pd
import re
import time
from multiprocessing import Pool

import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel


## set up tokenizer and model
# Bind the checkpoint name once so the tokenizer vocabulary and the encoder
# weights are always loaded from the same checkpoint.
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaModel.from_pretrained(model_name)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
# Optional GPU pinning, currently commented out. The recorded output of this
# cell shows it was last executed with the magic active.
# NOTE(review): if re-enabled, this has to run before the process first
# initialises CUDA (i.e. before the model is moved to the GPU) to take effect.
# %env CUDA_VISIBLE_DEVICES=1

env: CUDA_VISIBLE_DEVICES=1


In [3]:
# Use the GPU when one is visible to this process; otherwise fall back to the
# CPU so the notebook still runs (much more slowly) instead of raising.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# Report the device that was actually selected rather than always claiming GPU.
print('model on GPU' if device.type == 'cuda' else 'model on CPU')

model on GPU


In [4]:
# Identifier of the corpus; also used later to name the embedding output folder.
dataset_name = 'wos46985'
# Root folder of the raw dataset (kept with its trailing slash).
base_dir = '../../data/WOS/'
# CSV holding the labels, keywords and abstracts.
data_file = f'{base_dir}Meta-data/Data.csv'

In [5]:
# Load the metadata table and report how many rows it contains.
df = pd.read_csv(data_file)
print('Num Data:', df.shape[0])

Num Data: 46985


In [6]:
# Show the first two rows to check the column layout before embedding.
df.head(2)

Unnamed: 0,Y1,Y2,Y,Domain,area,keywords,Abstract
0,0,12,12,CS,Symbolic computation,(2+1)-dimensional non-linear optical waves; e...,"""(2 + 1)-dimensional non-linear optical waves ..."
1,5,2,74,Medical,Alzheimer's Disease,Aging; Tau; Amyloid; PET; Alzheimer's disease...,"""(beta-amyloid (A beta) and tau pathology beco..."


In [7]:
def _masked_mean_pool(hidden_states, attention_mask):
    """Average token vectors along dim 1, leaving padded positions out.

    Args:
        hidden_states: float tensor indexed as (batch, token, feature).
        attention_mask: integer tensor indexed as (batch, token); 1 marks a
            real token and 0 marks padding.

    Returns:
        Tensor indexed as (batch, feature) holding the per-sequence mean over
        the positions where ``attention_mask`` is 1.
    """
    # Zero the padded positions instead of using a NaN sentinel, so a genuine
    # NaN produced by the model is not silently dropped from the average.
    is_padding = (attention_mask == 0).unsqueeze(-1)
    token_sum = hidden_states.masked_fill(is_padding, 0.0).sum(dim=1)
    token_count = (attention_mask == 1).sum(dim=1, keepdim=True)
    return torch.div(token_sum, token_count)


def get_bert_embeddings(texts, method = 'pool', batch_size=20, max_length=256):
    """Encode ``texts`` with the notebook-level ``model`` and return one vector per text.

    Relies on the notebook globals ``tokenizer``, ``model`` and ``device``.
    The model's train/eval mode is not changed here.
    NOTE(review): Hugging Face ``from_pretrained`` is documented to return the
    model in eval mode; confirm nothing switched it to train mode beforehand.

    Args:
        texts: list of strings to embed.
        method: ``'pool'`` averages the last hidden state over non-padding
            tokens; ``'cls'`` takes the last hidden state at position 0.
        batch_size: number of texts per forward pass.
        max_length: every text is padded / truncated to this many tokens.

    Returns:
        A 2-D numpy array with one row per input text.

    Raises:
        ValueError: if ``method`` is neither ``'pool'`` nor ``'cls'``.
    """
    # Reject bad arguments up front, before any tokenisation or GPU work.
    if method not in ('pool', 'cls'):
        raise ValueError(
            "Unknown method {!r}; expected 'pool' or 'cls'.".format(method)
        )
    if len(texts) == 0:
        # Nothing to encode: return a well-formed empty matrix instead of failing.
        return torch.empty((0, model.config.hidden_size)).numpy()

    encoded = tokenizer(
        texts,
        padding='max_length',
        max_length=max_length,
        return_attention_mask=True,
        truncation=True,
        return_tensors='pt'
    )

    # Only the token ids and the mask are needed for inference.
    dataset = TensorDataset(encoded['input_ids'], encoded['attention_mask'])
    dataloader = DataLoader(dataset, batch_size=batch_size)

    pooled_batches = []

    # Inference only: no_grad avoids building an autograd graph for each batch.
    with torch.no_grad():
        for step, (input_ids, attention_mask) in enumerate(dataloader):
            # progress update after every 50 batches.
            if step % 50 == 0 and not step == 0:
                print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(dataloader)))

            # Run the encoder on `device`, then pool on the CPU copy.
            outputs = model(input_ids.to(device), attention_mask.to(device))
            hidden_states = outputs.last_hidden_state.cpu()

            if method == 'pool':
                # Mean pool and ignore padding.
                pooled = _masked_mean_pool(hidden_states, attention_mask)
            else:
                # method == 'cls': vector at the first token position.
                pooled = hidden_states[:, 0, :]
            pooled_batches.append(pooled)

    embeddings = torch.vstack(pooled_batches).numpy()
    print('Embeddings Shape:', embeddings.shape)

    return embeddings

In [8]:
# Pooling strategy passed to get_bert_embeddings and embedded in the output file name.
method = 'pool'
# Folder that receives the pickled embeddings for this dataset.
text_embedding_dir = f'../data/{dataset_name}'

In [10]:
# Resolve and create the output location before the expensive embedding pass,
# so a bad path fails immediately rather than after every abstract is encoded.
text_embedding_file = text_embedding_dir+'/roberta-embedding-'+method+'.pkl'
os.makedirs(text_embedding_dir, exist_ok=True)

texts = df.Abstract.tolist()
text_embeddings = get_bert_embeddings(texts, method)

# One embedding row per abstract is required; fail loudly instead of silently
# skipping the save and leaving a stale or missing file for later steps.
if text_embeddings.shape[0] != len(texts):
    raise ValueError(
        'Expected {} embeddings but got {}; nothing was saved.'.format(
            len(texts), text_embeddings.shape[0]
        )
    )

print('Saving Embeddings...')
# The context manager closes the file handle even if pickling raises.
with open(text_embedding_file, "wb") as embedding_fh:
    pickle.dump(text_embeddings, embedding_fh)

Embeddings Shape: (46985, 768)
Saving Embeddings...
