In [1]:
from datasets import load_dataset, load_from_disk, Dataset
import numpy as np
import pandas as pd
from transformers import RobertaTokenizer, RobertaModel
import torch
from torch.utils.data import TensorDataset, DataLoader

In [2]:
# Which dataset to work on, and the root folder that holds it on disk.
dataset_name = 'cnn_dailymail'
data_base = 'data'
# data_base = ''   # alternative root; the join below would then yield '/cnn_dailymail'

# Directory passed to load_from_disk / save_to_disk in the cells below.
data_path = f"{data_base}/{dataset_name}"

In [3]:
## set up tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained('roberta-base')

dataset = load_from_disk(
    data_path
)

train_data = dataset['train']
test_data = dataset['test']
val_data = dataset['validation']

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [4]:
# Set CUDA_VISIBLE_DEVICES=1 in the kernel's environment (intended to restrict CUDA to that GPU index).
# NOTE(review): presumably this must run before torch first initialises CUDA to have any effect — confirm.
%env CUDA_VISIBLE_DEVICES=1

env: CUDA_VISIBLE_DEVICES=1


In [5]:
# Use the GPU when one is visible; otherwise fall back to CPU so the notebook
# still runs (slowly) instead of raising when the model is moved to "cuda".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# The model is only used for inference here; eval() makes that explicit
# (it is idempotent if the model is already in eval mode).
model.eval()
# Report the device actually in use rather than always claiming GPU.
print('model on GPU' if device.type == 'cuda' else 'model on CPU')

model on GPU


In [6]:
val_data

Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 13368
})

In [7]:
# Preview the first validation article. Index the row before the column:
# column-first access would load every article just to display one.
dataset['validation'][0]['article']

'(CNN)Singer-songwriter David Crosby hit a jogger with his car Sunday evening, a spokesman said. The accident happened in Santa Ynez, California, near where Crosby lives. Crosby was driving at approximately 50 mph when he struck the jogger, according to California Highway Patrol Spokesman Don Clotworthy. The posted speed limit was 55. The jogger suffered multiple fractures, and was airlifted to a hospital in Santa Barbara, Clotworthy said. His injuries are not believed to be life threatening. "Mr. Crosby was cooperative with authorities and he was not impaired or intoxicated in any way. Mr. Crosby did not see the jogger because of the sun," said Clotworthy. According to the spokesman, the jogger and Crosby were on the same side of the road. Pedestrians are supposed to be on the left side of the road walking toward traffic, Clotworthy said. Joggers are considered pedestrians. Crosby is known for weaving multilayered harmonies over sweet melodies. He belongs to the celebrated rock group 

In [5]:
def clean_split_article(article):
  """Clean one article record and split it into sentence-like chunks.

  Removes the literal '(CNN)' marker, replaces newlines with spaces, then
  splits the text on '.' and keeps the stripped pieces that are at least
  four characters long.

  Args:
    article: mapping with an 'article' string; it is updated in place.

  Returns:
    The same mapping, with 'article' cleaned and three fields added:
    'article_sentences' (list of str), 'article_length' (number of kept
    sentences) and 'article_sentence_max_length' (largest space-separated
    token count among the kept sentences; 0 when no sentence was kept).
  """
  text = article['article'].replace('(CNN)', '').replace('\n', ' ')
  sentences = [
      piece.strip() for piece in text.split('.') if len(piece.strip()) >= 4
  ]

  article['article'] = text
  article['article_sentences'] = sentences
  article['article_length'] = len(sentences)
  # default=0: an empty or punctuation-only article has no sentences, and a
  # bare max() over nothing would raise ValueError and abort the whole map().
  article['article_sentence_max_length'] = max(
      (len(sentence.split(' ')) for sentence in sentences), default=0
  )
  return article

# Run the per-article cleaning / sentence splitting over every split,
# one record at a time (batched=False).
train_data, test_data, val_data = [
    split.map(clean_split_article, batched=False)
    for split in (train_data, test_data, val_data)
]

Loading cached processed dataset at data/cnn_dailymail/train/cache-75281d55c4a92911.arrow
Loading cached processed dataset at data/cnn_dailymail/test/cache-963638bcb423e4cc.arrow
Loading cached processed dataset at data/cnn_dailymail/validation/cache-7eb4e1fa156d1c43.arrow


In [6]:
# Show the derived fields of the first validation row. Row-first indexing
# avoids loading three entire columns just to look at one record.
tuple(
    val_data[0][field]
    for field in ('article_sentences', 'article_length', 'article_sentence_max_length')
)

(['Singer-songwriter David Crosby hit a jogger with his car Sunday evening, a spokesman said',
  'The accident happened in Santa Ynez, California, near where Crosby lives',
  'Crosby was driving at approximately 50 mph when he struck the jogger, according to California Highway Patrol Spokesman Don Clotworthy',
  'The posted speed limit was 55',
  'The jogger suffered multiple fractures, and was airlifted to a hospital in Santa Barbara, Clotworthy said',
  'His injuries are not believed to be life threatening',
  'Crosby was cooperative with authorities and he was not impaired or intoxicated in any way',
  'Crosby did not see the jogger because of the sun," said Clotworthy',
  'According to the spokesman, the jogger and Crosby were on the same side of the road',
  'Pedestrians are supposed to be on the left side of the road walking toward traffic, Clotworthy said',
  'Joggers are considered pedestrians',
  'Crosby is known for weaving multilayered harmonies over sweet melodies',
  'He b

In [7]:
# (max, median) of each article's longest-sentence word count,
# printed for train, test and validation in that order.
print(*[
    (max(lengths), np.median(lengths))
    for lengths in (
        split['article_sentence_max_length']
        for split in (train_data, test_data, val_data)
    )
])

(532, 41.0) (193, 42.0) (188, 42.0)


In [11]:
def create_sentence_dataset(data_):
  """Flatten an article-level dataset into one row per sentence.

  Args:
    data_: dataset exposing the columns 'article_sentences' (list of
      sentences per article), 'article_length' (sentence count per
      article) and 'id'.

  Returns:
    A `Dataset` with columns 'sentences' and 'article_ids', pairing each
    sentence with the id of the article it came from.
  """
  # Flatten in plain Python rather than np.concatenate / np.repeat: NumPy
  # would build fixed-width unicode arrays (every entry padded to the longest
  # string), and np.concatenate raises on an empty list of articles.
  sentences = [
      sentence
      for article_sentences in data_['article_sentences']
      for sentence in article_sentences
  ]
  # Repeat each article id once per sentence, using the stored sentence count.
  article_ids = [
      article_id
      for article_id, n_sentences in zip(data_['id'], data_['article_length'])
      for _ in range(n_sentences)
  ]

  df = pd.DataFrame({"sentences": sentences, "article_ids": article_ids})
  data_dataset = Dataset.from_pandas(df)
  return data_dataset

# One-row-per-sentence version of each article split.
train_sentence_data, test_sentence_data, val_sentence_data = [
    create_sentence_dataset(split)
    for split in (train_data, test_data, val_data)
]

In [13]:
train_sentence_data, test_sentence_data, val_sentence_data

(Dataset({
     features: ['sentences', 'article_ids'],
     num_rows: 11358552
 }),
 Dataset({
     features: ['sentences', 'article_ids'],
     num_rows: 391493
 }),
 Dataset({
     features: ['sentences', 'article_ids'],
     num_rows: 449277
 }))

In [14]:
from datasets import DatasetDict

# Bundle the sentence-level splits and persist them under <data_path>/sentences.
# NOTE(review): a later cell saves the tokenized splits to this same directory.
dataset = DatasetDict(
    train=train_sentence_data,
    validation=val_sentence_data,
    test=test_sentence_data,
)
dataset.save_to_disk(f"{data_path}/sentences")

In [15]:
batch_size, encoder_max_length = 1024, 200

def process_data_to_model_inputs(batch):
  """Tokenize a batch of sentences into fixed-length model inputs.

  Each entry of batch["sentences"] is truncated / padded to exactly
  `encoder_max_length` tokens by the global `tokenizer`; the resulting ids
  and attention mask are stored on the batch as "input_ids" and
  "attention_mask", and the batch is returned.
  """
  encoded = tokenizer(
      batch["sentences"],
      max_length=encoder_max_length,
      truncation=True,
      padding="max_length",
  )

  for field in ("input_ids", "attention_mask"):
    batch[field] = encoded[field]

  return batch

# Tokenize every sentence split in batches of `batch_size`, dropping the raw
# text column once it has been encoded.
train_sentence_data, test_sentence_data, val_sentence_data = [
    split.map(
        process_data_to_model_inputs,
        batched=True,
        batch_size=batch_size,
        remove_columns=["sentences"],
    )
    for split in (train_sentence_data, test_sentence_data, val_sentence_data)
]

# Expose only these three columns, formatted with type "torch".
for tokenized_split in (train_sentence_data, test_sentence_data, val_sentence_data):
    tokenized_split.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "article_ids"],
    )

  0%|          | 0/11093 [00:00<?, ?ba/s]

  0%|          | 0/383 [00:00<?, ?ba/s]

  0%|          | 0/439 [00:00<?, ?ba/s]

In [7]:
val_sentence_data

Dataset({
    features: ['article_ids', 'attention_mask', 'input_ids'],
    num_rows: 449277
})

In [18]:
from datasets import DatasetDict

# Persist the tokenized splits under <data_path>/sentences.
# NOTE(review): an earlier cell saved the untokenized sentence splits to this
# same directory; this call writes to that location again.
dataset = DatasetDict(
    train=train_sentence_data,
    validation=val_sentence_data,
    test=test_sentence_data,
)
dataset.save_to_disk(f"{data_path}/sentences")

In [6]:
# Reload the sentence-level splits saved under <data_path>/sentences
# (alternative directory used elsewhere in this notebook: 'sentence_embs_').
dataset = load_from_disk(f"{data_path}/sentences")
train_sentence_data, test_sentence_data, val_sentence_data = (
    dataset[split] for split in ("train", "test", "validation")
)

In [6]:
def get_bert_embeddings(batch, method = 'pool'):
    """Add one sentence embedding per row of `batch` using the global `model`.

    Args:
        batch: mapping with 'input_ids' and 'attention_mask', each holding one
            equal-length row per sentence (nested lists or a 2-D tensor).
        method: 'pool' averages the last hidden state over the positions whose
            attention mask is 1; 'cls' takes the hidden state at position 0.

    Returns:
        The same `batch`, with 'embeddings' set to a NumPy array holding one
        embedding row per sentence.

    Raises:
        ValueError: if `method` is neither 'pool' nor 'cls'.
    """
    if method not in ('pool', 'cls'):
        # Previously an unknown method fell through to an UnboundLocalError.
        raise ValueError(f"method must be 'pool' or 'cls', got {method!r}")

    # as_tensor accepts lists and existing tensors alike, without the
    # copy-construct warning torch.tensor() emits when handed a tensor.
    input_ids = torch.as_tensor(batch['input_ids']).to(device)
    attention_mask = torch.as_tensor(batch['attention_mask']).to(device)

    # Inference only: no_grad stops autograd from retaining every layer's
    # activations, which is what dominates GPU memory for a forward pass.
    with torch.no_grad():
        hidden = model(input_ids, attention_mask).last_hidden_state

        if method == 'pool':
            # Masked mean over the sequence axis, computed on the model's
            # device so only the pooled vectors are copied to the CPU.
            keep = (attention_mask == 1).unsqueeze(-1)
            summed = hidden.masked_fill(~keep, 0.0).sum(dim=1)
            output_representation = summed / keep.sum(dim=1)
        else:
            output_representation = hidden[:, 0, :]

    batch['embeddings'] = output_representation.cpu().numpy()
    return batch

In [9]:
batch_size = 30

# Compute embeddings for each split, `batch_size` sentences per call, and
# drop the token-level columns once they have been consumed.
train_sentence_data, test_sentence_data, val_sentence_data = [
    split.map(
        get_bert_embeddings,
        batched=True,
        batch_size=batch_size,
        remove_columns=['attention_mask', 'input_ids'],
    )
    for split in (train_sentence_data, test_sentence_data, val_sentence_data)
]

  0%|          | 0/378619 [00:00<?, ?ba/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter serve

OSError: [Errno 122] Error writing bytes to file. Detail: [errno 122] Disk quota exceeded

In [12]:
train_sentence_data

Dataset({
    features: ['article_ids', 'attention_mask', 'input_ids'],
    num_rows: 11358552
})

In [6]:
# Load the DatasetDict stored under <data_path>/sentence_embs_ and unpack its splits.
dataset = load_from_disk(f"{data_path}/sentence_embs_")
train_sentence_data, test_sentence_data, val_sentence_data = (
    dataset[split] for split in ("train", "test", "validation")
)

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['article_ids', 'attention_mask', 'input_ids'],
        num_rows: 11358552
    })
    validation: Dataset({
        features: ['article_ids', 'embeddings'],
        num_rows: 449277
    })
    test: Dataset({
        features: ['article_ids', 'embeddings'],
        num_rows: 391493
    })
})

In [8]:
# Turn the 'train' split into a pandas DataFrame so it can be sliced with .iloc in later cells.
# NOTE(review): `dataset['train'].to_pandas()` may be a faster route — confirm the resulting
# cell types still suit get_bert_embeddings (which calls .to_list() on the columns) before switching.
train_df = pd.DataFrame(dataset['train'])

In [30]:
def get_bert_embeddings(batch, method = 'pool'):
    """Add one sentence embedding per row of `batch` using the global `model`.

    Args:
        batch: table-like object (e.g. a pandas DataFrame slice) whose
            'input_ids' and 'attention_mask' columns support `.to_list()` and
            hold one equal-length row per sentence.
        method: 'pool' averages the last hidden state over the positions whose
            attention mask is 1; 'cls' takes the hidden state at position 0.

    Returns:
        The same `batch`, with 'embeddings' set to a list holding one
        embedding (list of floats) per sentence.

    Raises:
        ValueError: if `method` is neither 'pool' nor 'cls'.
    """
    if method not in ('pool', 'cls'):
        # Previously an unknown method fell through to an UnboundLocalError.
        raise ValueError(f"method must be 'pool' or 'cls', got {method!r}")

    input_ids = torch.tensor(batch['input_ids'].to_list()).to(device)
    attention_mask = torch.tensor(batch['attention_mask'].to_list()).to(device)

    # Inference only: no_grad stops autograd from retaining every layer's
    # activations, which is what dominates GPU memory for a forward pass.
    with torch.no_grad():
        hidden = model(input_ids, attention_mask).last_hidden_state

        if method == 'pool':
            # Masked mean over the sequence axis, computed on the model's
            # device so only the pooled vectors are copied to the CPU.
            keep = (attention_mask == 1).unsqueeze(-1)
            summed = hidden.masked_fill(~keep, 0.0).sum(dim=1)
            output_representation = summed / keep.sum(dim=1)
        else:
            output_representation = hidden[:, 0, :]

    batch['embeddings'] = output_representation.cpu().numpy().tolist()
    return batch

In [35]:
def get_train_embs(batch_size=40, log_every=100):
    """Embed every row of the global `train_df` in mini-batches.

    Args:
        batch_size: number of rows passed to `get_bert_embeddings` per call
            (default 40, the value previously hard-coded here).
        log_every: print a progress line every this many completed batches,
            plus once at the end; 0 or None disables progress output.

    Returns:
        A 2-D NumPy array stacking the per-row embeddings in `train_df` order,
        or an empty (0, 0) array when `train_df` has no rows.
    """
    n_rows = len(train_df)
    # Ceil division so the final partial batch is included in the total.
    n_batches = -(-n_rows // batch_size)

    train_embeddings = []
    for batch_idx, start in enumerate(range(0, n_rows, batch_size)):
        embedded = get_bert_embeddings(train_df.iloc[start:start + batch_size])
        # Keep each batch as an ndarray rather than nested Python lists, which
        # are far larger in memory when accumulated over millions of rows.
        train_embeddings.append(np.asarray(embedded['embeddings'].tolist()))

        # Count batches, not row offsets: the old `i % 100 == 0` check on the
        # row offset fired every 5 batches at batch_size=40, flooding the output.
        done = batch_idx + 1
        if log_every and (done % log_every == 0 or done == n_batches):
            print('Completed', done, 'batches of', n_batches, 'batches')

    if not train_embeddings:
        # np.vstack raises on an empty list; return an empty array instead.
        return np.empty((0, 0))
    return np.vstack(train_embeddings)

In [36]:
# Run the batched embedding pass over the whole of train_df and keep the stacked result.
train_embeddings = get_train_embs()

RuntimeError: CUDA out of memory. Tried to allocate 24.00 MiB (GPU 0; 11.91 GiB total capacity; 11.01 GiB already allocated; 4.94 MiB free; 11.12 GiB reserved in total by PyTorch)

In [None]:
train_embeddings