In [1]:
from transformers import (
    AutoTokenizer,
    BartForConditionalGeneration,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)
from datasets import load_from_disk, concatenate_datasets, Dataset

from evaluate import load

import os
import torch
import gc
import numpy as np

model_path = "/home/debian/develop/denis/Neuro-research/BART/bart-finetuned/final_model"
CHUNKS_DIR = "/home/debian/develop/denis/Neuro-research/BART/data"
BATCH_SIZE = 4
MAX_INPUT_LENGTH = 1024
MAX_TARGET_LENGTH = 256
SEED = 42

model = BartForConditionalGeneration.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from datasets import load_dataset, concatenate_datasets
import aiohttp
from pathlib import Path
from datasets import Dataset
import numpy as np

arxiv = load_dataset("scientific_papers", "arxiv", 
                     split="train", 
                     trust_remote_code=True, 
                     storage_options={'client_kwargs': {'timeout': aiohttp.ClientTimeout(total=3600)}},
                     cache_dir='/home/debian/.cache/huggingface/datasets')


arxiv = arxiv.remove_columns(['section_names'])
arxiv = arxiv.rename_column('abstract', 'summary')

def load_filtered_dataset(data_root="/home/debian/develop/denis/Neuro-research/BART/converted_data/data/test"):
    data_path = Path(data_root)
    return Dataset.from_json([
        str(p) for p in data_path.rglob("*.json")
    ])

patent_dataset = load_filtered_dataset()


patent_dataset = patent_dataset.remove_columns(['publication_number', 'application_number'])
patent_dataset = patent_dataset.rename_column('abstract', 'summary')
patent_dataset = patent_dataset.rename_column('description', 'article')



def split_and_combine_datasets(arxiv_ds: Dataset, 
                              patent_ds: Dataset, 
                              seed: int = 42,
                              train_ratio: float = 0.8,
                              val_ratio: float = 0.1) -> tuple[Dataset, Dataset, Dataset]:
    """
    Разделяет каждый датасет на train/val/test и объединяет соответствующие части
    
    Параметры:
    arxiv_ds: Датасет arXiv
    patent_ds: Датасент патентов
    seed: Сид для воспроизводимости
    train_ratio: Доля тренировочных данных (0.0-1.0)
    val_ratio: Доля валидационных данных (0.0-1.0)
    
    Возвращает:
    (train, val, test) - объединенные датасеты
    """
    
    assert np.isclose(train_ratio + val_ratio + (1 - train_ratio - val_ratio), 1.0), "Пропорции должны суммироваться к 1"
    
    def split_single(ds: Dataset) -> tuple[Dataset, Dataset, Dataset]:
        train_test = ds.train_test_split(
            test_size=1-train_ratio, 
            seed=seed,
            shuffle=True
        )
        
        val_test = train_test['test'].train_test_split(
            test_size=val_ratio/(val_ratio + (1 - train_ratio - val_ratio)), 
            seed=seed,
            shuffle=True
        )
        
        return train_test['train'], val_test['train'], val_test['test']
    
    arxiv_train, arxiv_val, arxiv_test = split_single(arxiv_ds)
    patent_train, patent_val, patent_test = split_single(patent_ds)
    
    combined_train = concatenate_datasets([arxiv_train, patent_train])
    combined_val = concatenate_datasets([arxiv_val, patent_val])
    combined_test = concatenate_datasets([arxiv_test, patent_test])
    
    return combined_train, combined_val, combined_test

train_ds, val_ds, test_ds = split_and_combine_datasets(
    arxiv_ds=arxiv,
    patent_ds=patent_dataset,
    seed=42,
    train_ratio=0.8,
    val_ratio=0.1
)

print(f"Размеры финальных датасетов:")
print(f"Train: {len(train_ds)} samples")
print(f"Val: {len(val_ds)} samples")
print(f"Test: {len(test_ds)} samples") 

Размеры финальных датасетов:
Train: 216086 samples
Val: 27010 samples
Test: 27013 samples


In [3]:
def load_chunks_optimized(chunk_dir):
    chunk_dirs = [
        os.path.join(chunk_dir, d)
        for d in sorted(os.listdir(chunk_dir))
        if d.startswith("chunk") and os.path.isdir(os.path.join(chunk_dir, d))
    ]

    chunk_files = []
    for d in chunk_dirs:
        filename = "data-00000-of-00001.arrow"
        file_path = os.path.join(d, filename)

        if os.path.exists(file_path):
            chunk_files.append(file_path)
        else:
            raise FileNotFoundError(f"File {file_path} not found in directory {d}")

    if not chunk_files:
        raise ValueError(f"No valid chunk files found in {chunk_dir}")

    print(f"Loading {len(chunk_files)} chunks from {chunk_dir}")
    return concatenate_datasets([
        Dataset.from_file(f) for f in chunk_files
    ])

PREDS

In [None]:
import torch
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

predictions = []
references = []

with open('/home/debian/develop/denis/Neuro-research/BART/page_content.txt', 'r', encoding='utf-8') as f:
    example = f.read()

for example in tqdm(test_ds, desc="Generating predictions"):
    inputs = tokenizer(
        example, 
        return_tensors="pt", 
        max_length=MAX_INPUT_LENGTH, 
        truncation=True
    ).to(device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=MAX_TARGET_LENGTH,
            num_beams=4,
            early_stopping=True
        )
    
    pred_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    predictions.append(pred_text)
    references.append(example["summary"])

In [8]:
references[0]

' the clustering properties of local , @xmath0  mjy , radio sources are investigated for a sample of 820 objects drawn from the joint use of the first and 2df galaxy redshift surveys . to this aim \n , we present 271 new @xmath1 spectroscopic counterparts of first radio sources to be added to those already introduced in magliocchetti et al . \n ( 2002 ) . \n the two - point correlation function for the local radio population is found to be entirely consistent with estimates obtained for the whole sample of 2dfgrs galaxies . from measurements of the redshift - space correlation function \n @xmath2 we derive a redshift - space clustering length @xmath3  mpc , while from the projected correlation function @xmath4 we estimate the parameters of the real - space correlation function @xmath5 , @xmath6  mpc and @xmath7 , where @xmath8 is assumed . \n different results are instead obtained if we only consider sources that present signatures of agn activity in their spectra . \n these objects ar

In [9]:
predictions[0]


' we investigate the clustering properties of @xmath0  mjy radio galaxies drawn from the joint use of the first and 2df galaxy redshift surveys as illustrated in magliocchetti et al . \n ( 2002 ) . by doing this \n , we not only extend the peacock & nicholson ( 1991 ) measurements to a statistically more significant sample involving less local objects , but we also probe much lower flux densities where the population contains radio - emitting sources that differ from typical agns ( such as galaxies undergoing intense star formation ) . in addition to the above analysis , we also estimate the two - point correlation function ( both in redshift space and real space ) for the homogeneous sample of radio galaxies ( sources that present signatures of agn activity in their optical spectra ) . '

In [10]:
from evaluate import load

rouge = load("rouge")
bleu = load("bleu")

rouge_results = rouge.compute(
    predictions=predictions,
    references=references,
    use_stemmer=True
)

bleu_results = bleu.compute(
    predictions=predictions,
    references=[[ref] for ref in references]
)

print("ROUGE Scores:")
print(f"ROUGE-1: {rouge_results['rouge1']:.4f}")
print(f"ROUGE-2: {rouge_results['rouge2']:.4f}")
print(f"ROUGE-L: {rouge_results['rougeL']:.4f}")

print("\nBLEU Score:")
print(f"BLEU: {bleu_results['bleu']:.4f}")

ROUGE Scores:
ROUGE-1: 0.3656
ROUGE-2: 0.1436
ROUGE-L: 0.2116

BLEU Score:
BLEU: 0.0076


In [4]:
from datasets import Dataset
from pathlib import Path

def create_test_dataset(text_file: str) -> Dataset:
    # Прочитать содержимое файла
    with open(text_file, "r", encoding="utf-8") as f:
        content = f.read()

    # Создать структуру для датасета
    data = {
        "article": [content],  # Весь текст в одной строке
        "summary": [""]        # Пустые строки для заполнения
    }

    return Dataset.from_dict(data)

test_dataset = create_test_dataset('/home/debian/develop/denis/Neuro-research/BART/example.txt')
print(test_dataset)

Dataset({
    features: ['article', 'summary'],
    num_rows: 1
})


In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

MAX_INPUT_LENGTH = 1024
MAX_TARGET_LENGTH = 256




inputs = tokenizer(
        example, 
        return_tensors="pt", 
        max_length=MAX_INPUT_LENGTH, 
        truncation=True
    ).to(device)
    
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_length=MAX_TARGET_LENGTH,
        num_beams=4,
        early_stopping=True
    )
    
    pred_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

NameError: name 'torch' is not defined

In [3]:
pred_text

NameError: name 'pred_text' is not defined

In [18]:
from transformers import BartForConditionalGeneration, BartTokenizer
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Используемое устройство: {device}")

model_path = "/home/debian/develop/denis/Neuro-research/BART/bart-finetuned/final_model"

model = BartForConditionalGeneration.from_pretrained(model_path).to(device)
tokenizer = BartTokenizer.from_pretrained(model_path)

def chunk_text(text, chunk_size=512, overlap=64):
    tokens = tokenizer.tokenize(text)
    chunks = []
    for i in range(0, len(tokens), chunk_size - overlap):
        chunk_tokens = tokens[i:i + chunk_size]
        chunk_text = tokenizer.convert_tokens_to_string(chunk_tokens)
        chunks.append(chunk_text)
        if len(chunks) == 512:
            return chunks
    return chunks

def generate_summaries(chunks):
    model.eval()
    summaries = []
    
    for chunk in chunks:
        inputs = tokenizer(
            chunk,
            max_length=1024,
            truncation=True,
            return_tensors="pt"
        )

        inputs = {key: value.to(device) for key, value in inputs.items()}

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=128,
                num_beams=4,
                early_stopping=True
            )
        
        summary = tokenizer.decode(outputs[0].cpu(), skip_special_tokens=True)
        summaries.append(summary)
    
    return summaries

with open('/home/debian/develop/denis/Neuro-research/BART/test.txt', 'r', encoding='utf-8') as f:
    example = f.read()

text = example
chunks = chunk_text(text, chunk_size=512, overlap=100)
summaries = generate_summaries(chunks)

print(summaries)

Используемое устройство: cuda


['A method to create a generic generic II chatbot (General AI chatbot). The method comprises the steps of: (a) generating a generic 2 chatbot; (b) creating a generic 3 chatbot from the generic 2 set of 2 chatbots; (c) constructing a generic 4 chatbot based on the generic 3 set of 4 chatbots, and (d) creating the generic 4 set of 3 chatbots based on a generic 5 chatbot, (e) generating an generic 3-set of 2-set 2-chats based on generic 3 sets of 2.5 An artificial neural network (ANT', ' we present the architectural design of the model architec-ture (GENMO) and elucidates how it unifies motion estimation and generation within a single model. The model is characterized by the following steps: (a) transforming a noisy motion se-quence xt with the conditions C and condtion masks M into a clean motion sequence x0 through a series of carefully de-signed components. The initial processing stage consists of an additive fusion block that converts xt into a sequence of motion tokens. This block ut

In [14]:
print(*summaries)

The GPT version of version 3.5, modified using the GPT model to follow the command model (InstructGPT model), was designed to create a special generic II chatbot (General AI chatbot).ChatGPT. Notes [to correct code] 12Vaswani A., Shazeer N., Parmar N.,USzkoreit B., Jones L., Gomez A. N., Kaiser . .,Polosukhin I.Attenation is All you Need,R. Advances in National Information Processing Systems 30I. In this paper , we present a method to create a generic generic II chatbot (General AI chatbot). The method comprises the steps of: (a) generating a generic 2 chatbot; (b) creating a generic 3 chatbot from the generic 2 set of 2 chatbots; (c) constructing a generic 4 chatbot based on the generic 3 set of 4 chatbots, and (d) creating the generic 4 set of 3 chatbots based on a generic 5 chatbot, (e) generating an generic 3-set of 2-set 2-chats based on generic 3 sets of 2.5 An artificial neural network (ANT) is a type of equipment used in mechanical translation, part in national language process

In [12]:
from transformers import (
    AutoTokenizer,
    BartForConditionalGeneration,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)
from datasets import load_from_disk, concatenate_datasets, Dataset

from evaluate import load

import os
import torch
import gc
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


model_path = "/home/debian/develop/denis/Neuro-research/BART/bart-finetuned/final_model"
CHUNKS_DIR = "/home/debian/develop/denis/Neuro-research/BART/data"
BATCH_SIZE = 4
MAX_INPUT_LENGTH = 1024
MAX_TARGET_LENGTH = 256
SEED = 42

model = BartForConditionalGeneration.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

model = model.to(device)
def chunk_text(text, chunk_size=512, overlap=64):
    tokens = tokenizer.tokenize(text)
    chunks = []
    for i in range(0, len(tokens), chunk_size - overlap):
        chunk_tokens = tokens[i:i + chunk_size]
        chunk_text = tokenizer.convert_tokens_to_string(chunk_tokens)
        chunks.append(chunk_text)
        if len(chunks) == 512:
            return chunks
    return chunks

def generate_summaries(chunks):
    model.eval()
    summaries = []
    
    for chunk in chunks:
        inputs = tokenizer(
            chunk,
            max_length=1024,
            truncation=True,
            return_tensors="pt"
        )

        inputs = {key: value.to(device) for key, value in inputs.items()}

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=128,
                num_beams=4,
                early_stopping=True
            )
        
        summary = tokenizer.decode(outputs[0].cpu(), skip_special_tokens=True)
        summaries.append(summary)
    
    return summaries

def get_summarisation(content_file_path: str, ):
    with open(content_file_path, 'r', encoding='utf-8') as f:
        example = f.read()

    text = example
    while len(text.split()) > 2048:
        print('-------------')
        print(len(text.split()))
        chunks = chunk_text(text, chunk_size=512, overlap=100)
        summaries = generate_summaries(chunks)
        text = ''
        for item in summaries:
            text += item
        print('-------------')
        print(len(text.split()))

    return summaries


content_file_path='/home/debian/develop/denis/Neuro-research/test/input.txt'
summaries = get_summarisation(content_file_path=content_file_path)

Token indices sequence length is longer than the specified maximum sequence length for this model (5933 > 1024). Running this sequence through the model will result in indexing errors


-------------
4082
-------------
1232


In [10]:
len(summaries)

text = ''
for i in summaries:
    text+=i
print(len(text.split()))

1232


In [8]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto").to("cuda" if torch.cuda.is_available() else "cpu")

def generate_structured_report(summaries):
    prompt = """<|user|>: Write a single continuous paragraph that seamlessly integrates these key points:
    {}
    
    Guidelines:
    1. Connect ideas using transitional phrases ("furthermore", "however", "this suggests")
    2. Maintain logical flow between sentences
    3. Avoid section headings or bullet points
    4. Use academic linking: "Building on this", "An important corollary"
    5. Keep technical terminology
    
    Output must be one cohesive paragraph, not thesises (3-5 complex sentences). <|assistant|>:""".format("\n".join(f"- {s}" for s in summaries))

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        temperature=0.4,
        top_k=50,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=False)
    generated_text = response.split("<|assistant|>:")[-1].strip()

    return generated_text


report = generate_structured_report(summaries)

print(report)

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


Transformers are a family of neural networks that have revolutionized natural language processing (NLP) and have become the backbone of many modern AI applications. Transformers are designed to process sequential data, most prominently associated with large language models (LLMs), which have achieved elite performance in other fields of AI, such as computer vision, speech recognition, and time series forecasting. Transformers are characterized by their self-attention mechanism, which allows for parallelization and the ability to generate query, key, and value vectors for each part of an input sequence simultaneously. This quality of transformers has enabled them to process long-range dependencies and make decisions about how and when to focus on specific time steps of that sequence. Transformers have also achieved elite performance in other fields of AI, such as computer vision, speech recognition, and time series forecasting. This paper discusses the transformer architecture and the a

In [20]:
print(report)

A method to create a generic generic II chatbot (General AI chatbot) is presented. The method comprises the steps of:
    1. Generating a generic 2 chatbot:
        a. Creating a generic 2 set of 2 chatbots:
            b. Generating a generic 3 chatbot:
                c. Creating a generic 4 chatbot:
                    d. Creating the generic 4 set of 3 chatbots:
                        e. Generating an generic 3-set of 2-set 2-chats based on generic 3 sets of 2.5
    2. Architectural design of the model architecture (GENMO)
        a. Elucidates how GENMO unifies motion estimation and generation within a single model
        b. Presents the steps of transforming a noisy motion sequence xt with the conditions C and condition masks M into a clean motion sequence x0 through a series of carefully de-signed components
        c. Presents a dual-mode training paradigm for a diffusion model trained with the standard DDPM objective to generate motion sequences that satisfy the condition se