In [1]:
pip install datasets


Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:0

In [2]:
from datasets import load_dataset

ds = load_dataset("gfissore/arxiv-abstracts-2021")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/6.75k [00:00<?, ?B/s]

arxiv-abstracts.jsonl.gz:   0%|          | 0.00/940M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1999486 [00:00<?, ? examples/s]

In [3]:
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi', 'abstract', 'report-no', 'categories', 'versions'],
        num_rows: 1999486
    })
})

In [4]:
print(ds.keys())

dict_keys(['train'])


In [5]:
train_dataset = ds['train']
print(train_dataset.column_names)

['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi', 'abstract', 'report-no', 'categories', 'versions']


In [6]:
title = train_dataset['title']
title[:10]

['Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies',
 'Sparsity-certifying Graph Decompositions',
 'The evolution of the Earth-Moon system based on the dark matter field\n  fluid model',
 'A determinant of Stirling cycle numbers counts unlabeled acyclic\n  single-source automata',
 'From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\alpha}$',
 'Bosonic characters of atomic Cooper pairs across resonance',
 'Polymer Quantum Mechanics and its Continuum Limit',
 'Numerical solution of shock and ramp compression for general material\n  properties',
 'The Spitzer c2d Survey of Large, Nearby, Insterstellar Clouds. IX. The\n  Serpens YSO Population As Observed With IRAC and MIPS',
 'Partial cubes: structures, characterizations, and constructions']

In [12]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


# Title generation

In [17]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_dataset
#from evaluate import load_metric
import nltk
from math import exp
from nltk.translate.bleu_score import sentence_bleu

nltk.download('punkt')

# Carregar o dataset do arXiv (descomente a linha abaixo para carregar seu dataset corretamente)
# ds = load_dataset("gfissore/arxiv-abstracts-2021")

# Carregar o modelo e tokenizer
model_name = 't5-base'  # Pode trocar para flan-t5, bart, distil-t5
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Limitar para as primeiras 100 amostras para teste (você pode aumentar conforme necessário)
abstracts = ds['train']['abstract'][:100]
titles_references = ds['train']['title'][:100]

# Função para calcular Perplexidade
def calculate_perplexity(log_probs):
    return exp(-log_probs.mean().item())

# Função para calcular as métricas BLEU e Perplexity
def evaluate_generation(abstract, title_reference):
    # Gerar título a partir do abstract
    input_text = "summarize: " + abstract
    inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(inputs, max_length=50, num_beams=4, early_stopping=True)
    generated_title = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Tokenização para BLEU
    reference_tokens = [nltk.word_tokenize(title_reference.lower())]
    generated_tokens = nltk.word_tokenize(generated_title.lower())

    # Métrica BLEU
    bleu_score = sentence_bleu(reference_tokens, generated_tokens)

    # Perplexidade (usando log-likelihoods)
    with torch.no_grad():
        input_ids = tokenizer(abstract, return_tensors="pt").input_ids
        decoder_input_ids = tokenizer(title_reference, return_tensors="pt").input_ids
        outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
        log_probs = outputs.logits
        perplexity = calculate_perplexity(log_probs)

    return generated_title, bleu_score, perplexity

# Avaliar a geração para as primeiras 100 amostras
for i, abstract in enumerate(abstracts):
    generated_title, bleu_score, perplexity = evaluate_generation(abstract, titles_references[i])
    print(f"Generated Title: {generated_title}")
#print(f"ROUGE Scores: {rouge_scores}")
    print(f"BLEU Score: {bleu_score}")
    print(f"Perplexity: {perplexity}")
#print("-" * 50)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Generated Title: the region of phase space is specified in which the calculation is most reliable. good agreement is demonstrated with data from the Fermilab Tevatron. predictions are made for distributions of diphoton pairs produced at the energy of the
BLEU Score: 1.0832677820940877e-231
Perplexity: 13944789105.687975
Generated Title: we describe a new algorithm, the $(k,ell)$-pebble game with colors. it obtains a characterization of the family of $(k,ell)$-sparse graph
BLEU Score: 7.437597952034396e-232
Perplexity: 17189687116.840607
Generated Title: the evolution of the Earth-Moon system is described by the dark matter field fluid model. the closest distance of the Moon to Earth was about 259000 km at 4.5 billion years ago. the model predicts that the Mars
BLEU Score: 0.22637483204056413
Perplexity: 29898059564.376015
Generated Title: a determinant of Stirling cycle numbers counts unlabeled acyclic single-source automata. proof involves a bijection from these automata to certain ma

KeyboardInterrupt: 

# Abstract generation

In [22]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_dataset
#from evaluate import load_metric
import nltk
from math import exp
from nltk.translate.bleu_score import sentence_bleu

nltk.download('punkt')

# Carregar o dataset do arXiv
#ds = load_dataset("gfissore/arxiv-abstracts-2021")

# Carregar o modelo e tokenizer
model_name = 't5-base'  # Pode trocar para flan-t5, bart, distil-t5
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Limitar para as primeiras 100 amostras para teste (você pode aumentar conforme necessário)
titles = ds['train']['title'][:100]
abstracts_references = ds['train']['abstract'][:100]

# Função para calcular Perplexidade
def calculate_perplexity(log_probs):
    return exp(-log_probs.mean().item())

# Função para calcular as métricas ROUGE, BLEU e Perplexity
def evaluate_abstract_generation(title, abstract_reference):
    # Gerar abstract a partir do título
    input_text = "summarize: " + title  # Podemos usar o mesmo prompt de sumarização
    inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(inputs, max_length=150, num_beams=4, early_stopping=True)
    generated_abstract = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Tokenização para BLEU
    reference_tokens = [nltk.word_tokenize(abstract_reference.lower())]
    generated_tokens = nltk.word_tokenize(generated_abstract.lower())

    # Métrica ROUGE
    #rouge = load_metric('rouge')
    #rouge_scores = rouge.compute(predictions=[generated_abstract], references=[abstract_reference])

    # Métrica BLEU
    bleu_score = sentence_bleu(reference_tokens, generated_tokens)

    # Perplexidade (usando log-likelihoods)
    with torch.no_grad():
        input_ids = tokenizer(title, return_tensors="pt").input_ids
        decoder_input_ids = tokenizer(abstract_reference, return_tensors="pt").input_ids
        outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
        log_probs = outputs.logits
        perplexity = calculate_perplexity(log_probs)

    #return generated_abstract, rouge_scores, bleu_score, perplexity
    return generated_abstract, bleu_score, perplexity

# Avaliar a geração de abstracts para as primeiras 100 amostras
for i, title in enumerate(titles):
#generated_abstract, rouge_scores, bleu_score, perplexity = evaluate_abstract_generation(title, abstracts_references[i])
    generated_abstract, bleu_score, perplexity = evaluate_abstract_generation(title, abstracts_references[i])
    print(f"Generated Abstract: {generated_abstract}")
#print(f"ROUGE Scores: {rouge_scores}")
    print(f"BLEU Score: {bleu_score}")
    print(f"Perplexity: {perplexity}")
#print("-" * 50)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Generated Abstract: Calculation of prompt diphoton production cross sections at Tevatron and LHC energies.
BLEU Score: 2.5700202427751015e-236
Perplexity: 7414159735207.794
Generated Abstract: Sparsity-certifying Graph Decompositions.
BLEU Score: 8.32517080038094e-247
Perplexity: 1031067069495.1296
Generated Abstract: evolution of the Earth-Moon system based on the dark matter field fluid model.
BLEU Score: 9.118643781368e-06
Perplexity: 1081351031453.9288
Generated Abstract: a determinant of Stirling cycle numbers counts unlabeled acyclic single-source automata.
BLEU Score: 0.12451447144412296
Perplexity: 139270403260.71976
Generated Abstract: From dyadic $Lambda_alpha$ to $Lambda_alpha$.
BLEU Score: 2.261083294816499e-233
Perplexity: 25543672513.787846
Generated Abstract: Bosonic characters of atomic Cooper pairs across resonance.
BLEU Score: 2.5299296275422946e-162
Perplexity: 3618711189393.051
Generated Abstract: Polymer Quantum Mechanics and its Continuum Limit.
BLEU Score: 9.6114

KeyboardInterrupt: 

# Category Generation

In [None]:
import torch
from transformers import pipeline
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from datasets import load_dataset
import numpy as np

# Carregar o dataset do arXiv
#ds = load_dataset("gfissore/arxiv-abstracts-2021")

# Pipeline de classificação
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Limitar para as primeiras 100 amostras para teste (você pode aumentar conforme necessário)
abstracts_titles = [title + ". " + abstract for title, abstract in zip(ds['train']['title'][:100], ds['train']['abstract'][:100])]
true_labels = ds['train']['categories'][:100]  # Categorias verdadeiras

# Labels para classificação (ajuste conforme as categorias do dataset)
candidate_labels = ["Computer Science", "Mathematics", "Physics", "Biology"]

# Função para realizar a classificação
def classify_paper(abstract_title):
    result = classifier(abstract_title, candidate_labels)
    predicted_label = result['labels'][0]
    return predicted_label

# Avaliar a classificação
predicted_labels = []
for abstract_title in abstracts_titles:
    predicted_label = classify_paper(abstract_title)
    predicted_labels.append(predicted_label)

# Calcular métricas de classificação
accuracy = accuracy_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels, average='weighted')
conf_matrix = confusion_matrix(true_labels, predicted_labels, labels=candidate_labels)

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Confusion Matrix:\n {conf_matrix}")

model.safetensors:  19%|#8        | 304M/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]