In [1]:
from datasets import load_dataset, load_from_disk, Dataset
import numpy as np
import pandas as pd
from transformers import RobertaTokenizer, RobertaModel
import torch
from torch.utils.data import TensorDataset, DataLoader

In [2]:
data_base = 'data'
# data_base = ''

dataset_name = 'cnn_dailymail'

data_path = data_base+'/'+dataset_name

In [3]:
## set up tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained('roberta-base')

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
%env CUDA_VISIBLE_DEVICES=1

env: CUDA_VISIBLE_DEVICES=1


In [5]:
device = torch.device("cuda")
model = model.to(device)
print('model on GPU')

model on GPU


In [6]:
dataset = load_from_disk(
    data_path+'/sentence_embs_'
)

train_sentence_data = dataset['train']
test_sentence_data = dataset['test']
val_sentence_data = dataset['validation']

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['article_ids', 'attention_mask', 'input_ids'],
        num_rows: 11358552
    })
    validation: Dataset({
        features: ['article_ids', 'embeddings'],
        num_rows: 449277
    })
    test: Dataset({
        features: ['article_ids', 'embeddings'],
        num_rows: 391493
    })
})

In [8]:
train_df = pd.DataFrame(dataset['train'])

In [9]:
def get_bert_embeddings(batch, method = 'pool'):
    
    train_seq = torch.tensor(batch['input_ids'].to_list()).to(device)
    train_mask = torch.tensor(batch['attention_mask'].to_list()).to(device)

    preds = model(train_seq, train_mask)

    attention_mask = train_mask.cpu()
    base_output = preds.last_hidden_state.detach().cpu()

    if method=='pool':
        # Mean Pool and ignore Padding
        base_output[attention_mask==0, :] = float('nan')
        output_representation = torch.div(torch.nansum(base_output, axis=1), torch.sum(attention_mask==1, axis=1,  keepdim=True))
    elif method=='cls':
        output_representation = base_output[:,0,:]

    output_representation = output_representation.numpy()
    # print('Embeddings Shape:',output_representation.shape)
    
    batch['input_ids'] = batch['input_ids'] #.numpy()
    batch['attention_mask'] = batch['attention_mask'] #.numpy()
    batch['embeddings'] = output_representation.tolist()
    return batch

In [None]:
batch_size = 40

train_embeddings = []
for i in range(0, len(train_df), batch_size):
    train_embeddings.append(
        get_bert_embeddings(train_df.iloc[i:i+batch_size])['embeddings'].tolist()
    )
    if i//batch_size%100==0:
        print('Completed',i//batch_size,'batches of',len(train_df)//batch_size,'batches')
    if i//batch_size==180000:
        break

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['input_ids'] = batch['input_ids'] #.numpy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['attention_mask'] = batch['attention_mask'] #.numpy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['embeddings'] = output_representation.tolist()


Completed 0 batches of 283963 batches
Completed 100 batches of 283963 batches
Completed 200 batches of 283963 batches
Completed 300 batches of 283963 batches
Completed 400 batches of 283963 batches
Completed 500 batches of 283963 batches
Completed 600 batches of 283963 batches
Completed 700 batches of 283963 batches
Completed 800 batches of 283963 batches
Completed 900 batches of 283963 batches
Completed 1000 batches of 283963 batches
Completed 1100 batches of 283963 batches
Completed 1200 batches of 283963 batches
Completed 1300 batches of 283963 batches
Completed 1400 batches of 283963 batches
Completed 1500 batches of 283963 batches
Completed 1600 batches of 283963 batches
Completed 1700 batches of 283963 batches
Completed 1800 batches of 283963 batches
Completed 1900 batches of 283963 batches
Completed 2000 batches of 283963 batches
Completed 2100 batches of 283963 batches
Completed 2200 batches of 283963 batches
Completed 2300 batches of 283963 batches
Completed 2400 batches of 28

In [31]:
len(train_embeddings), i

(180001, 7200000)

In [26]:
# batch_size = 40

# new_i = 7200000
# train_embeddings = []

# for i in range(new_i, len(train_df), batch_size):
#     train_embeddings.append(
#         get_bert_embeddings(train_df.iloc[i:i+batch_size])['embeddings'].tolist()
#     )
#     if i//batch_size%100==0:
#         print('Completed',i//batch_size,'batches of',len(train_df)//batch_size,'batches')

In [14]:
len(train_embeddings), i, i//batch_size

(103964, 11358520, 283963)

In [1]:
import pickle

In [35]:
part = 1
start = 20000*(part-1)
end = 20000*(part)
with open('train_part_'+str(part)+'.pkl', 'wb') as pickle_file:
    pickle.dump(train_embeddings[start:end], pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

part = 2
start = 20000*(part-1)
end = 20000*(part)
with open('train_part_'+str(part)+'.pkl', 'wb') as pickle_file:
    pickle.dump(train_embeddings[start:end], pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

part = 3
start = 20000*(part-1)
end = 20000*(part)
with open('train_part_'+str(part)+'.pkl', 'wb') as pickle_file:
    pickle.dump(train_embeddings[start:end], pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

part = 4
start = 20000*(part-4)
end = 20000*(part-3)
with open('train_part_'+str(part)+'.pkl', 'wb') as pickle_file:
    pickle.dump(train_embeddings[start:end], pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

part = 5
start = 20000*(part-4)
end = 20000*(part-3)
with open('train_part_'+str(part)+'.pkl', 'wb') as pickle_file:
    pickle.dump(train_embeddings[start:end], pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

part = 6
start = 20000*(part-4)
end = 20000*(part-3)
with open('train_part_'+str(part)+'.pkl', 'wb') as pickle_file:
    pickle.dump(train_embeddings[start:end], pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

part = 7
start = 20000*(part-4)
end = 20000*(part-3)
with open('train_part_'+str(part)+'.pkl', 'wb') as pickle_file:
    pickle.dump(train_embeddings[start:end], pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

part = 8
start = 20000*(part-4)
end = 20000*(part-3)
with open('train_part_'+str(part)+'.pkl', 'wb') as pickle_file:
    pickle.dump(train_embeddings[start:end], pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

part = 9
start = 20000*(part-4)
end = 20000*(part-3)
with open('train_part_'+str(part)+'.pkl', 'wb') as pickle_file:
    pickle.dump(train_embeddings[start:end], pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [24]:
part = 10
start = 20000*(part-10)
end = 20000*(part-9)
with open('train_part_'+str(part)+'.pkl', 'wb') as pickle_file:
    pickle.dump(train_embeddings[start:end], pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

part = 11
start = 20000*(part-10)
end = 20000*(part-9)
with open('train_part_'+str(part)+'.pkl', 'wb') as pickle_file:
    pickle.dump(train_embeddings[start:end], pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

part = 12
start = 20000*(part-10)
end = 20000*(part-9)
with open('train_part_'+str(part)+'.pkl', 'wb') as pickle_file:
    pickle.dump(train_embeddings[start:end], pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

part = 13
start = 20000*(part-10)
end = 20000*(part-9)
with open('train_part_'+str(part)+'.pkl', 'wb') as pickle_file:
    pickle.dump(train_embeddings[start:end], pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

part = 14
start = 20000*(part-10)
end = 20000*(part-9)
with open('train_part_'+str(part)+'.pkl', 'wb') as pickle_file:
    pickle.dump(train_embeddings[start:end], pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

part = 15
start = 20000*(part-10)
end = 20000*(part-9)
with open('train_part_'+str(part)+'.pkl', 'wb') as pickle_file:
    pickle.dump(train_embeddings[start:end], pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
train_embeddings = np.vstack(train_embeddings)

In [3]:
dataset = load_from_disk(
    'sentence_embs_'
)

test_sentence_data = dataset['test']
val_sentence_data = dataset['validation']

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['article_ids', 'attention_mask', 'input_ids'],
        num_rows: 11358552
    })
    validation: Dataset({
        features: ['article_ids', 'embeddings'],
        num_rows: 449277
    })
    test: Dataset({
        features: ['article_ids', 'embeddings'],
        num_rows: 391493
    })
})

In [5]:
test_df = pd.DataFrame(dataset['test'])

In [9]:
batch_size = 40

test_embeddings = []
for i in range(0, test_sentence_data.num_rows, batch_size):
    test_embeddings.append(
        test_df.iloc[i:i+batch_size]['embeddings'].tolist()
    )
    if i//batch_size%100==0:
        print('Loaded',i//batch_size,'batches of',test_sentence_data.num_rows//batch_size,'batches')

Loaded 0 batches of 9787 batches
Loaded 100 batches of 9787 batches
Loaded 200 batches of 9787 batches
Loaded 300 batches of 9787 batches
Loaded 400 batches of 9787 batches
Loaded 500 batches of 9787 batches
Loaded 600 batches of 9787 batches
Loaded 700 batches of 9787 batches
Loaded 800 batches of 9787 batches
Loaded 900 batches of 9787 batches
Loaded 1000 batches of 9787 batches
Loaded 1100 batches of 9787 batches
Loaded 1200 batches of 9787 batches
Loaded 1300 batches of 9787 batches
Loaded 1400 batches of 9787 batches
Loaded 1500 batches of 9787 batches
Loaded 1600 batches of 9787 batches
Loaded 1700 batches of 9787 batches
Loaded 1800 batches of 9787 batches
Loaded 1900 batches of 9787 batches
Loaded 2000 batches of 9787 batches
Loaded 2100 batches of 9787 batches
Loaded 2200 batches of 9787 batches
Loaded 2300 batches of 9787 batches
Loaded 2400 batches of 9787 batches
Loaded 2500 batches of 9787 batches
Loaded 2600 batches of 9787 batches
Loaded 2700 batches of 9787 batches
Load

In [10]:
len(test_embeddings)

9788

In [15]:
import pickle

part = 1
start = 20000*(part-1)
end = 20000*(part)
with open('test_part_'+str(part)+'.pkl', 'wb') as pickle_file:
    pickle.dump(test_embeddings[start:end], pickle_file, protocol=pickle.HIGHEST_PROTOCOL)