In [None]:
import gzip
import json
import os

In [None]:
import pandas as pd
import numpy as np

In [None]:
import torch
import tqdm
from transformers import T5ForConditionalGeneration, T5Tokenizer

In [None]:
# Print the installed PyTorch version so the environment is recorded with the run.
print(torch.__version__)

2.6.0+cu124


In [None]:
from sentence_transformers import util

In [None]:
# Load the previously downloaded papers (with abstracts) from parquet,
# report the frame's dimensions and preview three random rows.
papers_path = 'ml_research_assistant/data/papers_with_abstract.parquet'
df = pd.read_parquet(papers_path)
print(df.shape)
df.sample(3)

(20286, 5)


Unnamed: 0,paper_year,paper_url,paper_title,paper_author,paper_abstract
2730,2023,https://papers.nips.cc/paper_files/paper/2023/...,Federated Linear Bandits with Finite Adversari...,"Li Fan, Ruida Zhou, Chao Tian, Cong Shen","We study a federated linear bandits model, whe..."
19639,1992,https://papers.nips.cc/paper_files/paper/1992/...,Learning Fuzzy Rule-Based Neural Networks for ...,"Charles Higgins, Rodney Goodman",A three-step method for function approxima...
18915,1997,https://papers.nips.cc/paper_files/paper/1997/...,A Simple and Fast Neural Network Approach to S...,Rolf Henkel,A neural network approach to stereovision...


In [None]:
# Pull the abstract column out as a plain Python list; each abstract is one paragraph.
paragraphs = df.loc[:, 'paper_abstract'].to_list()
print(f"Paragraphs: {len(paragraphs)}")

Paragraphs: 20286


In [None]:
# Display the abstract at index 1 as a sample of the input text.
paragraphs[1]

"We present a new algorithm, Cross-Episodic Curriculum (CEC), to boost the learning efficiency and generalization of Transformer agents. Central to CEC is the placement of cross-episodic experiences into a Transformer’s context, which forms the basis of a curriculum. By sequentially structuring online learning trials and mixed-quality demonstrations, CEC constructs curricula that encapsulate learning progression and proficiency increase across episodes. Such synergy combined with the potent pattern recognition capabilities of Transformer models delivers a powerful cross-episodic attention mechanism. The effectiveness of CEC is demonstrated under two representative scenarios: one involving multi-task reinforcement learning with discrete control, such as in DeepMind Lab, where the curriculum captures the learning progression in both individual and progressively complex settings; and the other involving imitation learning with mixed-quality data for continuous control, as seen in RoboMimi

In [None]:
# Checkpoint of the model that generates queries for a given paragraph;
# the tokenizer and the model weights are both loaded from it.
query_gen_checkpoint = "BeIR/query-gen-msmarco-t5-large-v1"
tokenizer = T5Tokenizer.from_pretrained(query_gen_checkpoint)
model = T5ForConditionalGeneration.from_pretrained(query_gen_checkpoint)
# Put the model in evaluation mode; as the last expression, its repr is displayed.
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=1024, out_features=4096, bias=False)
              (wo): Linear(in_features=4096, out_features=1024, bias=False)
              (d

In [None]:
# Use the GPU when CUDA is available, otherwise stay on the CPU,
# then move the model to the chosen device.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=1024, out_features=4096, bias=False)
              (wo): Linear(in_features=4096, out_features=1024, bias=False)
              (d

In [None]:
# Check which device the model is on after the move above.
model.device

device(type='cuda', index=0)

In [None]:
# Settings for the query-generation loop.
# batch_size: number of paragraphs processed per batch.
# num_queries: number of queries to generate for every paragraph.
batch_size, num_queries = 8, 3
# max_length_paragraph: max length for an input paragraph.
# max_length_query: max length for an output query.
max_length_paragraph, max_length_query = 512, 64

In [None]:
# Show the tokenizer's `model_max_length` setting.
tokenizer.model_max_length

512

In [None]:
# For every paragraph in the corpus, generate `num_queries` queries and write one
# "query<TAB>paragraph" row per generated query to a TSV file.

# Sampling (do_sample=True) is stochastic; seed it so a re-run produces the same queries.
torch.manual_seed(42)

# Tabs and line breaks inside a field would corrupt the one-row-per-line TSV layout,
# so each of them is mapped to a single space before writing.
_tsv_unsafe = str.maketrans({"\t": " ", "\n": " ", "\r": " "})

# Explicit UTF-8: the abstracts contain non-ASCII characters, and the platform
# default encoding is not guaranteed to be able to encode them.
with open("ml_research_assistant/data/generated_queries_for_abstracts.tsv", "w", encoding="utf-8") as fOut:
    for start_idx in tqdm.trange(0, len(paragraphs), batch_size):
        sub_paragraphs = paragraphs[start_idx : start_idx + batch_size]
        # Calling the tokenizer directly replaces the deprecated `prepare_seq2seq_batch`;
        # padding="longest" keeps that method's default padding behaviour.
        inputs = tokenizer(
            sub_paragraphs,
            max_length=max_length_paragraph,
            truncation=True,
            padding="longest",
            return_tensors="pt",
        ).to(device)
        outputs = model.generate(
            **inputs, max_length=max_length_query, do_sample=True, top_p=0.95, num_return_sequences=num_queries
        )

        # `generate` yields `num_queries` consecutive sequences per input paragraph,
        # so integer division maps an output row back to its source paragraph.
        for idx, out in enumerate(outputs):
            query = tokenizer.decode(out, skip_special_tokens=True)
            para = sub_paragraphs[idx // num_queries]
            fOut.write("{}\t{}\n".format(query.translate(_tsv_unsafe).strip(), para.translate(_tsv_unsafe).strip()))

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

100%|██████████| 2536/2536 [1:17:47<00:00,  1.84s/it]
