In [12]:
from IPython.utils import io

with io.capture_output() as captured:
  !pip3 install datasets;
  !pip3 install tqdm;

import numpy as np
import pandas as pd

import torch

from datasets import Dataset
from tqdm import tqdm
import multiprocessing


def get_device():
    """Return the name of the preferred torch backend available on this machine.

    Preference order is Apple MPS, then CUDA, then CPU.

    Returns:
        str: 'mps', 'cuda' or 'cpu'. The result is always a plain string so
        that every branch yields the same type.
    """
    # `torch.backends.mps` is missing on older torch builds; treat that as "no MPS".
    mps_backend = getattr(torch.backends, 'mps', None)
    if mps_backend is not None and mps_backend.is_available():
        return 'mps'

    if torch.cuda.is_available():
        return 'cuda'

    return 'cpu'


def read_data(dataset='train', colab=True, local=False, hf=False, dir_relative_path=None):
  """Load a tab-separated dataset file into a pandas DataFrame.

  Exactly one source is used. The flags are checked in this order:
  `local`, then `hf`, then `colab`.

  Args:
    dataset: name prefix of the file to read; the local and Drive sources read
      `<dataset>_dataset.txt`. The Hugging Face source does not use it and
      always reads the 'train' split.
    colab: read from Google Drive mounted at /content/drive.
    local: read from the relative folder `dataset/`.
    hf: read the "aeirya/irhw3" dataset from the Hugging Face hub after
      calling `huggingface_hub.login()`.
    dir_relative_path: folder below 'My Drive/' that holds the files when
      reading from Drive; defaults to
      'Assignment/InformationRetrieval/hw3/dataset'.

  Returns:
    pandas.DataFrame parsed with tab as the delimiter.

  Raises:
    ValueError: if none of `local`, `hf`, `colab` is enabled.
  """

  def read_from_hf():
    from io import StringIO

    import huggingface_hub
    huggingface_hub.login()

    from datasets import load_dataset
    # NOTE(review): newer `datasets` releases may expect `token=` instead of
    # `use_auth_token=` -- confirm against the installed version.
    hf_dataset = load_dataset("aeirya/irhw3", use_auth_token=True)

    lines = hf_dataset['train']['text']

    # Parse straight from memory instead of round-tripping through a scratch
    # file: no leftover file on disk and no dependence on the platform's
    # default text encoding.
    return pd.read_csv(StringIO('\n'.join(lines)), delimiter='\t')


  def read_from_drive():
    from google.colab import drive
    try:
      drive.mount('/content/drive')
    except Exception as err:
      # Mounting stays best-effort, but the failure is reported rather than
      # hidden; if Drive really is unavailable the read below will fail.
      print(f'warning: could not mount Google Drive: {err}')

    folder = '/content/drive/My Drive/'
    if dir_relative_path:
      folder += dir_relative_path
    else:
      folder += 'Assignment/InformationRetrieval/hw3/dataset'

    return pd.read_csv(f'{folder}/{dataset}_dataset.txt', delimiter='\t')


  def read_from_local():
    return pd.read_csv(f'dataset/{dataset}_dataset.txt', delimiter='\t')


  if local:
    return read_from_local()
  if hf:
    return read_from_hf()
  if colab:
    return read_from_drive()

  # Falling through would return None and surface later as a confusing
  # AttributeError at the call site.
  raise ValueError('no data source selected: set one of local, hf or colab to True')


def init_model_tokenizer(model_name='distilroberta-base', device='cpu'):
  """Load a pretrained encoder and its tokenizer by checkpoint name.

  The model is switched to eval mode and moved to `device` before being
  returned.

  Args:
    model_name: checkpoint identifier passed to `from_pretrained`.
    device: target passed to `model.to`.

  Returns:
    (model, tokenizer) tuple -- note the order.
  """
  from transformers import AutoModel, AutoTokenizer

  tokenizer = AutoTokenizer.from_pretrained(model_name)

  # eval() and to() both return the module itself, so they can be chained.
  model = AutoModel.from_pretrained(model_name).eval().to(device)

  return model, tokenizer


def tokenize(sentences, tokenizer, max_length=250):
    """Call `tokenizer` on `sentences` with the fixed options used in this notebook.

    The options request special tokens, PyTorch tensors, an attention mask,
    padding, and truncation to `max_length`.

    Returns whatever `tokenizer` returns.
    """
    options = {
        'add_special_tokens': True,
        'return_tensors': "pt",
        'return_attention_mask': True,
        'padding': True,
        'truncation': True,
        'max_length': max_length,
    }
    return tokenizer(sentences, **options)


def bert(tokenized_input, model):
    """Run `model` on a tokenized batch with gradient tracking disabled.

    `tokenized_input` is unpacked as keyword arguments to `model`; the
    `last_hidden_state` attribute of the result is returned.
    """
    with torch.no_grad():
        model_output = model(**tokenized_input)
        hidden_states = model_output.last_hidden_state

    return hidden_states


def sent2vec(output, alpha=0.2, attention_mask=None):
    '''
    aggregate tokens in every sentence together

    Each sentence vector is `alpha * max_pool + (1 - alpha) * mean_pool`,
    where both poolings run over the token axis.

    @param output: (n_sentences x n_tokens x n_hidden_size)
    @param alpha: the effect of max pooling; mean pooling gets (1 - alpha)
    @param attention_mask: optional (n_sentences x n_tokens); non-zero marks a
        real token. When given, padding positions are left out of both
        poolings. When None (the default) every position is pooled, padding
        included, exactly as before this parameter existed.
    @return: (n_sentences x n_hidden_size)
    '''

    if attention_mask is None:
        max_ = output.max(dim=1).values
        mean = output.mean(dim=1)
    else:
        # (n_sentences x n_tokens x 1) so it broadcasts over the hidden axis
        keep = attention_mask.to(output.device).bool().unsqueeze(-1)
        counts = keep.sum(dim=1)

        # clamp avoids 0/0 for a sentence with no real tokens
        mean = (output * keep).sum(dim=1) / counts.clamp(min=1)

        # padding can never win the max once it is set to -inf
        max_ = output.masked_fill(~keep, float('-inf')).max(dim=1).values
        # a fully masked sentence would otherwise yield -inf; use 0 instead
        max_ = max_.masked_fill(counts == 0, 0.0)

    return alpha * max_ + (1 - alpha) * mean


def init_review_ds(df, tokenizer, num_cores, batch_size=12000, max_review_len=250):
  """Turn the reviews in `df` into a tokenized, torch-formatted Dataset.

  The 'review_text' column is tokenized in batches of `batch_size` using
  `num_cores` worker processes, then dropped from the result.

  Args:
    df: DataFrame with a 'review_text' column.
    tokenizer: tokenizer forwarded to `tokenize`.
    num_cores: number of processes for the map step.
    batch_size: number of reviews handed to the tokenizer per call.
    max_review_len: truncation length forwarded to `tokenize`.

  Returns:
    The mapped Dataset with its format set to 'torch'.
  """
  # warning: df should not contain nan

  def encode_reviews(review_texts):
    return tokenize(review_texts, tokenizer, max_review_len)

  raw_ds = Dataset.from_pandas(df)
  ds = raw_ds.map(
      encode_reviews,
      batched=True,
      batch_size=batch_size,
      input_columns='review_text',
      remove_columns=['review_text'],
      num_proc=num_cores,
      desc='tokenizing',
  )
  ds.set_format('torch')

  print("dataset ready")
  return ds


def item_indices(df):
  """Map every distinct value of df['item_id'] to a row index.

  Indices follow the order in which each id first appears in the column.
  This makes the mapping reproducible across runs, unlike iteration over a
  `set`, whose order can change between processes for string ids.

  Args:
    df: object whose 'item_id' entry is iterable (e.g. a DataFrame column).

  Returns:
    dict of item_id -> index in range(number of distinct ids).
  """
  # dict.fromkeys de-duplicates while keeping first-seen order
  distinct_items = dict.fromkeys(df['item_id'])
  return {item: idx for idx, item in enumerate(distinct_items)}


def bert_encode_items(df, tok_ds, model, device):
  """Build one embedding row per item by averaging its review embeddings.

  For every group of rows sharing an 'item_id', the matching entries of
  `tok_ds` are run through `model`, pooled per review with `sent2vec`, and
  averaged over the reviews of that item.

  Args:
    df: DataFrame with an 'item_id' column. Its index labels are used
      directly to index `tok_ds`, so they must match the dataset's row
      positions.
    tok_ds: tokenized dataset, indexable by `group.index`, returning a mapping
      with tensors under 'input_ids' / 'attention_mask' (and optionally
      'token_type_ids').
    model: encoder passed to `bert`.
    device: device the batch tensors are moved to before the forward pass.

  Returns:
    torch tensor of shape (n_items, hidden_size); the row of each item is
    given by `item_indices(df)`.
  """

  item2idx = item_indices(df)
  n_items = len(item2idx)

  # Read the embedding width from the model rather than assuming 768, so
  # models with another hidden size work too; 768 is kept as the fallback for
  # models that expose no `config.hidden_size`.
  hidden_size = getattr(getattr(model, 'config', None), 'hidden_size', 768)

  # encoding matrix of items
  enc = torch.zeros((n_items, hidden_size))

  # only these entries are forwarded to the model, when present
  model_keys = ('input_ids', 'token_type_ids', 'attention_mask')

  # reviews for each item
  groups = df.groupby(by='item_id')

  for item, group in tqdm(groups, desc='encoding items'):

      b_input = tok_ds[group.index]
      b_input = {k: b_input[k].to(device) for k in model_keys if k in b_input}

      out = bert(b_input, model)

      # average of all sentences
      v = sent2vec(out).mean(axis=0)

      enc[item2idx[item], :] = v

  return enc

In [13]:
# Runtime resources: compute device and number of CPU workers for tokenizing.
device = get_device()
num_cores = multiprocessing.cpu_count()
print(f'device: {device}, cores: {num_cores}')

# Encoder checkpoint and its tokenizer.
model_name = 'roberta-base' #'roberta-base', 'distilroberta-base', 'distilbert-base-uncased', 'bert-base-uncased'
model, tokenizer = init_model_tokenizer(model_name, device)
print(f'using {model_name} model')
print()

# Training reviews from Drive: drop rows with missing values, renumber the
# index from 0 and keep only the two columns used downstream.
df = (
    read_data('train', colab=True, dir_relative_path='ir-dataset')
    .dropna()
    .reset_index()
    [['item_id', 'review_text']]
)
ds = init_review_ds(df, tokenizer, num_cores)

device: cuda, cores: 2


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using roberta-base model

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


tokenizing (num_proc=2):   0%|          | 0/23037 [00:00<?, ? examples/s]

dataset ready


In [14]:
# Encode every item and write the resulting matrix to disk.
out = 'roberta_item_encodings.npy'
E = bert_encode_items(df, ds, model, device)
np.save(file=out, arr=E)

print()
print(f'output matrix saved to {out}')

encoding items: 100%|██████████| 733/733 [05:07<00:00,  2.38it/s]


output matrix saved to roberta_item_encodings.npy



