In [2]:
import gzip
import json
import re
import shutil
from pathlib import Path

import evaluate
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from torch.utils.data import DataLoader, IterableDataset, get_worker_info
from tqdm.auto import tqdm
from transformers import AutoTokenizer
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

sns.set(palette='summer')

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def convert_to_jsonl(input_path, output_path):
    """Convert a line-oriented JSON file to JSONL (one JSON value per line).

    Every non-blank input line is parsed on its own. A line that holds a JSON
    array is expanded so that each element becomes a separate output line;
    any other JSON value is written back as a single line. Lines that are not
    valid JSON are reported on stdout and skipped.

    Args:
        input_path: Path of the UTF-8 text file to read.
        output_path: Path of the UTF-8 JSONL file to create or overwrite.
    """
    with open(input_path, 'r', encoding='utf-8') as f_in, \
         open(output_path, 'w', encoding='utf-8') as f_out:

        for line in f_in:
            line = line.strip()
            if not line:
                continue

            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                # Re-parsing the same text cannot succeed, so only report it.
                if line.startswith('['):
                    print(f"Failed to parse array in: {input_path}")
                else:
                    print(f"Invalid JSON line skipped in: {input_path}")
                continue

            # Flatten arrays here, on the successful parse, so that each
            # element ends up as its own JSONL record.
            items = data if isinstance(data, list) else [data]
            for item in items:
                json.dump(item, f_out, ensure_ascii=False)
                f_out.write('\n')

def process_gz_files(source_root=".", target_root="converted_data"):
    """Decompress every ``*.gz`` file under ``source_root`` and convert it to JSONL.

    The directory layout below ``source_root`` is mirrored under
    ``target_root``. Each archive is first decompressed into a scratch file
    and then passed to ``convert_to_jsonl``; the result is stored next to the
    scratch file with the trailing ``.gz`` replaced by ``.json``. Failures are
    reported on stdout per archive and do not stop the loop.

    Args:
        source_root: Directory that is searched recursively for ``*.gz`` files.
        target_root: Directory that receives the converted ``.json`` files.
    """
    source_path = Path(source_root)
    target_path = Path(target_root)

    for gz_file in source_path.rglob("*.gz"):
        temp_file = None
        try:
            relative_path = gz_file.relative_to(source_path)
            output_dir = target_path / relative_path.parent
            output_dir.mkdir(parents=True, exist_ok=True)

            # The scratch file must not end in ".gz": target_root may live
            # inside source_root, and rglob would then treat it as an input
            # (or, with identical roots, it would overwrite the archive).
            temp_file = output_dir / (gz_file.name + ".tmp")
            # with_suffix swaps only the trailing ".gz"; str.replace would
            # also rewrite a ".gz" that occurs earlier in the file name.
            final_file = output_dir / gz_file.with_suffix(".json").name

            with gzip.open(gz_file, 'rb') as f_in, open(temp_file, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

            convert_to_jsonl(temp_file, final_file)

        except Exception as e:
            print(f"Error processing {gz_file}: {str(e)}")
        finally:
            # Remove the scratch file on success and on failure alike.
            if temp_file is not None and temp_file.exists():
                temp_file.unlink()

# Run the conversion with the defaults: scan "." and write into "converted_data".
process_gz_files()

In [3]:
def filter_and_save_records(source_root="converted_data", target_root="filtered_data", min_words=200):
    """Copy JSONL records whose abstract is long enough into a mirrored tree.

    Every ``*.json`` file under ``source_root`` is read line by line. A record
    is kept when it is a JSON object whose ``abstract`` field is a string with
    at least ``min_words`` whitespace-separated words. Blank lines, invalid
    JSON, non-object records and records without a string abstract are
    skipped. Destination files that already exist are left untouched, so the
    function can be re-run to resume.

    Args:
        source_root: Directory that is searched recursively for ``*.json`` files.
        target_root: Directory that receives the filtered files.
        min_words: Minimum number of words the abstract must contain.
    """
    source_path = Path(source_root)
    target_path = Path(target_root)

    for src_file in source_path.rglob("*.json"):
        relative_path = src_file.relative_to(source_path)
        dst_file = target_path / relative_path

        # An existing destination means this file was handled by an earlier run.
        if dst_file.exists():
            continue

        dst_file.parent.mkdir(parents=True, exist_ok=True)

        try:
            with open(src_file, 'r', encoding='utf-8') as f_in, \
                 open(dst_file, 'w', encoding='utf-8') as f_out:

                for line in f_in:
                    line = line.strip()
                    if not line:
                        continue

                    try:
                        record = json.loads(line)
                    except json.JSONDecodeError:
                        continue

                    # "abstract" may be missing, null or not text at all;
                    # such records cannot satisfy the length requirement.
                    abstract = record.get('abstract') if isinstance(record, dict) else None
                    if not isinstance(abstract, str):
                        continue

                    if len(abstract.split()) >= min_words:
                        json.dump(record, f_out, ensure_ascii=False)
                        f_out.write('\n')

        except Exception as e:
            print(f"Error processing file {src_file}: {e}")
            # Drop the partial output so that a later run retries this file.
            if dst_file.exists():
                dst_file.unlink()

# Run the filter with the defaults: read "converted_data", write "filtered_data".
filter_and_save_records()

In [None]:
from datasets import load_dataset, concatenate_datasets
import aiohttp
from pathlib import Path
from datasets import Dataset
import numpy as np

# Load the train split of the "arxiv" configuration of scientific_papers.
# The long client timeout (one hour) is applied to the underlying HTTP client.
arxiv = load_dataset("scientific_papers", "arxiv",
                     split="train",
                     trust_remote_code=True,
                     storage_options={'client_kwargs': {'timeout': aiohttp.ClientTimeout(total=3600)}},
                     # Resolved from the current user's home directory instead
                     # of a hard-coded, user-specific absolute Windows path.
                     cache_dir=str(Path.home() / ".cache" / "huggingface" / "datasets"))


# Align the column layout with the patent data: drop the section names and
# expose the abstract under the name "summary".
arxiv = arxiv.remove_columns(['section_names'])
arxiv = arxiv.rename_column('abstract', 'summary')

def load_filtered_dataset(data_root="filtered_data"):
    """Build one dataset from every ``*.json`` file found under ``data_root``.

    Args:
        data_root: Directory that is searched recursively for ``*.json`` files.

    Returns:
        The result of ``Dataset.from_json`` applied to the sorted file list.

    Raises:
        FileNotFoundError: If no ``*.json`` file exists under ``data_root``.
    """
    data_path = Path(data_root)
    # rglob yields files in filesystem order; sorting fixes the row order so
    # that later seeded shuffles and splits are reproducible across machines.
    json_files = sorted(str(p) for p in data_path.rglob("*.json"))
    if not json_files:
        raise FileNotFoundError(f"No *.json files found under {data_path}")
    return Dataset.from_json(json_files)

# Load the filtered patent records and bring them to the article/summary
# layout: drop the two identifier columns, then rename the text columns.
patent_dataset = (
    load_filtered_dataset()
    .remove_columns(['publication_number', 'application_number'])
    .rename_column('abstract', 'summary')
    .rename_column('description', 'article')
)



def split_and_combine_datasets(arxiv_ds: Dataset, 
                              patent_ds: Dataset, 
                              seed: int = 42,
                              train_ratio: float = 0.8,
                              val_ratio: float = 0.1) -> tuple[Dataset, Dataset, Dataset]:
    """
    Split each dataset into train/val/test and concatenate the matching parts.

    The test share is whatever remains after the train and validation shares.
    Both datasets are split with the same seed and ratios, so each source
    keeps the same proportions inside every combined split.

    Args:
        arxiv_ds: The arXiv dataset.
        patent_ds: The patent dataset.
        seed: Seed passed to both shuffled splits for reproducibility.
        train_ratio: Share of training data, strictly between 0 and 1.
        val_ratio: Share of validation data, strictly between 0 and 1.

    Returns:
        (train, val, test) - the combined datasets.

    Raises:
        ValueError: If a ratio is outside (0, 1) or no data is left for the
            test split.
    """
    test_ratio = 1 - train_ratio - val_ratio

    # The three shares sum to 1 by construction, so what needs checking is
    # that each of them is actually positive.
    if not (0 < train_ratio < 1 and 0 < val_ratio < 1 and test_ratio > 0):
        raise ValueError(
            "train_ratio and val_ratio must each be in (0, 1) and sum to less than 1; "
            f"got train_ratio={train_ratio}, val_ratio={val_ratio}"
        )

    def split_single(ds: Dataset) -> tuple[Dataset, Dataset, Dataset]:
        # First cut: train versus everything else (validation + test).
        train_test = ds.train_test_split(
            test_size=1-train_ratio, 
            seed=seed,
            shuffle=True
        )

        # Second cut: divide the held-out part between validation and test.
        # Here "train" of the result is the validation set.
        val_test = train_test['test'].train_test_split(
            test_size=val_ratio/(val_ratio + test_ratio), 
            seed=seed,
            shuffle=True
        )

        return train_test['train'], val_test['train'], val_test['test']

    arxiv_train, arxiv_val, arxiv_test = split_single(arxiv_ds)
    patent_train, patent_val, patent_test = split_single(patent_ds)

    combined_train = concatenate_datasets([arxiv_train, patent_train])
    combined_val = concatenate_datasets([arxiv_val, patent_val])
    combined_test = concatenate_datasets([arxiv_test, patent_test])

    return combined_train, combined_val, combined_test

# Build the combined 80/10/10 splits from both sources.
train_ds, val_ds, test_ds = split_and_combine_datasets(
    arxiv,
    patent_dataset,
    seed=42,
    train_ratio=0.8,
    val_ratio=0.1,
)

# Report the size of every combined split.
print("Размеры финальных датасетов:")
for split_label, split_data in (("Train", train_ds), ("Val", val_ds), ("Test", test_ds)):
    print(f"{split_label}: {len(split_data)} samples")

Размеры финальных датасетов:
Train: 204180 samples
Val: 25521 samples
Test: 25525 samples


In [2]:
from datasets import Dataset
from pathlib import Path

def create_test_dataset(text_file: str) -> Dataset:
    """Wrap the complete contents of a UTF-8 text file in a one-row dataset.

    Args:
        text_file: Path of the text file to read.

    Returns:
        A dataset with a single row: ``article`` holds the whole file text and
        ``summary`` is an empty string left to be filled in.
    """
    article_text = Path(text_file).read_text(encoding="utf-8")

    # One row only: the entire file is the article, the summary stays empty.
    return Dataset.from_dict({"article": [article_text], "summary": [""]})

# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists.
test_dataset = create_test_dataset(r"D:\ethd\ml\Neuro-research\example.txt")
print(test_dataset)

Dataset({
    features: ['article', 'summary'],
    num_rows: 1
})


In [None]:
from datasets import Dataset
import gc
import os
from transformers import AutoTokenizer
from datasets import load_dataset, concatenate_datasets, Dataset
import re
import gc
import os

# Tokenizer of the facebook/bart-large-cnn checkpoint; read as a global by tokenize_in_chunks.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")


def tokenize_in_chunks(dataset: Dataset, chunk_size=1000, save_dir="processed"):
    """Tokenize ``dataset`` piece by piece and save each piece to disk.

    The dataset is cut into consecutive slices of at most ``chunk_size`` rows.
    Each slice is tokenized with the module-level ``tokenizer`` (``article``
    as input, ``summary`` as target, both truncated to 1024 tokens, no
    padding) and written to ``<save_dir>/chunk_<i>``. Only one slice is held
    at a time.

    Args:
        dataset: Dataset with ``article`` and ``summary`` columns.
        chunk_size: Maximum number of rows per saved chunk; must be positive.
        save_dir: Directory that receives the ``chunk_<i>`` sub-directories.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    # Create the output directory
    os.makedirs(save_dir, exist_ok=True)

    total_samples = len(dataset)
    # Ceiling division: "n // size + 1" would add an empty trailing chunk
    # whenever the row count is an exact multiple of chunk_size (or zero).
    num_chunks = (total_samples + chunk_size - 1) // chunk_size

    for i in range(num_chunks):
        # Select the rows that belong to this chunk
        chunk = dataset.select(range(
            i * chunk_size,
            min((i + 1) * chunk_size, total_samples)
        ))

        # Tokenization
        tokenized_chunk = chunk.map(
            lambda examples: tokenizer(
                examples["article"],
                text_target=examples["summary"],
                max_length=1024,
                truncation=True,
                padding=False
            ),
            batched=True,
            batch_size=32,
            remove_columns=["article", "summary"],
            load_from_cache_file=False
        )

        # Save the chunk
        tokenized_chunk.save_to_disk(
            os.path.join(save_dir, f"chunk_{i}"),
            max_shard_size="100MB"
        )

        # Free memory before the next chunk
        del chunk
        del tokenized_chunk
        gc.collect()

        print(f"Processed chunk {i+1}/{num_chunks}")

# Usage
# NOTE(review): this tokenizes patent_dataset into "processed/chunk_*", while the
# training cell reads "processed/train" — confirm the intended dataset and save_dir.
tokenize_in_chunks(patent_dataset, chunk_size=2000)

In [4]:
from torch.utils.data import DataLoader
from datasets import load_from_disk, concatenate_datasets
import os
from transformers import DataCollatorForSeq2Seq
import random
from transformers import AutoTokenizer
from datasets import load_dataset, concatenate_datasets, Dataset
import re
import gc
import os

# Tokenizer of the facebook/bart-large-cnn checkpoint; handed to the seq2seq collator in this cell.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

class ChunkedDataset(IterableDataset):
    """Iterable dataset that streams examples from on-disk chunk directories.

    Only one chunk is loaded at a time. The class derives from
    ``IterableDataset`` so that ``DataLoader`` consumes it through
    ``__iter__``; a plain class would be treated as a map-style dataset and
    fail because it defines neither ``__len__`` nor ``__getitem__``.
    """

    def __init__(self, chunk_dir, shuffle=True):
        """Collect the chunk paths.

        Args:
            chunk_dir: Directory whose entries starting with "chunk" are used.
            shuffle: Whether to visit the chunks in random order on each pass.
        """
        self.chunk_files = sorted([
            os.path.join(chunk_dir, f) 
            for f in os.listdir(chunk_dir) 
            if f.startswith("chunk")
        ])
        self.shuffle = shuffle
        self.current_chunk = None

    def __iter__(self):
        """Yield every example of every chunk assigned to this process."""
        # Work on a copy so the stored, sorted list is never reordered.
        chunk_files = list(self.chunk_files)

        # Inside a DataLoader worker, keep only this worker's share of the
        # chunks; otherwise every worker would yield the complete dataset.
        # Sharding happens before shuffling so the shares stay disjoint.
        worker = get_worker_info()
        if worker is not None:
            chunk_files = chunk_files[worker.id::worker.num_workers]

        if self.shuffle:
            random.shuffle(chunk_files)

        for chunk_file in chunk_files:
            # Load the chunk on demand
            self.current_chunk = load_from_disk(chunk_file)
            yield from self.current_chunk

    def get_dataloader(self, batch_size=8, collate_fn=None, num_workers=4):
        """Return a ``DataLoader`` over this dataset.

        Args:
            batch_size: Number of examples per batch.
            collate_fn: Optional function that merges examples into a batch.
            num_workers: Number of loader worker processes (0 = main process).
        """
        return DataLoader(
            self,
            batch_size=batch_size,
            collate_fn=collate_fn,
            num_workers=num_workers,
            pin_memory=True
        )

# Usage
chunk_dir = r"processed"
dataset = ChunkedDataset(chunk_dir)
dataloader = dataset.get_dataloader(
    batch_size=2,
    collate_fn=DataCollatorForSeq2Seq(tokenizer)
)

In [None]:
from transformers import (
    AutoTokenizer,
    BartForConditionalGeneration,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)
from datasets import load_from_disk, concatenate_datasets, Dataset
import os
import torch

# Checkpoints that can be selected by a short alias.
# NOTE(review): "tiny" and "distilled" map to the same checkpoint — confirm
# whether "tiny" was meant to point at a smaller model.
MODEL_CHOICES = {
    "tiny": "sshleifer/distilbart-cnn-12-6",
    "base": "facebook/bart-base",
    "distilled": "sshleifer/distilbart-cnn-12-6",
    "custom": "patrickvonplaten/bart-tiny-random"
}

# Example usage
MODEL_NAME = MODEL_CHOICES["tiny"]

CHUNKS_DIR = "processed"
OUTPUT_DIR = "bart-finetuned"
BATCH_SIZE = 4
MAX_INPUT_LENGTH = 1024
MAX_TARGET_LENGTH = 256
SEED = 42

# 1. Initialise the model with memory optimisations
# NOTE(review): the weights are loaded in float16 while training below requests
# bf16 mixed precision — confirm this combination is intended.
model = BartForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,        # half-precision weights
    low_cpu_mem_usage=True,
)
# Turn on gradient checkpointing through the model API; passing
# gradient_checkpointing=True to from_pretrained goes through the deprecated
# config path. Checkpointing trades extra compute for lower activation memory.
model.gradient_checkpointing_enable()

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=True,                   # use the fast tokenizer implementation
    model_max_length=MAX_INPUT_LENGTH
)

def load_chunks_optimized(chunk_dir):
    """Load every saved chunk under ``chunk_dir`` as one concatenated dataset.

    Sub-directories whose name starts with "chunk" are visited in sorted name
    order. All ``data-*.arrow`` shard files of a chunk are opened with
    ``Dataset.from_file`` and concatenated, so chunks that were written as
    several shards are supported as well as single-shard chunks.

    Args:
        chunk_dir: Directory that contains the ``chunk*`` sub-directories.

    Returns:
        The concatenation of all shard datasets.

    Raises:
        FileNotFoundError: If a chunk directory contains no Arrow shard file.
        ValueError: If ``chunk_dir`` contains no chunk directory at all.
    """
    chunk_dirs = [
        os.path.join(chunk_dir, d)
        for d in sorted(os.listdir(chunk_dir))
        if d.startswith("chunk") and os.path.isdir(os.path.join(chunk_dir, d))
    ]

    chunk_files = []
    for d in chunk_dirs:
        # Shard names are zero-padded ("data-00000-of-00003.arrow"), so a
        # plain sort keeps them in shard order.
        shard_files = sorted(
            os.path.join(d, name)
            for name in os.listdir(d)
            if name.startswith("data-") and name.endswith(".arrow")
        )
        if not shard_files:
            raise FileNotFoundError(f"No data-*.arrow shard files found in directory {d}")
        chunk_files.extend(shard_files)

    if not chunk_files:
        raise ValueError(f"No valid chunk files found in {chunk_dir}")

    print(f"Loading {len(chunk_dirs)} chunks from {chunk_dir}")
    return concatenate_datasets([
        Dataset.from_file(f) for f in chunk_files
    ])


def preprocess_function(examples):
    """Tokenize a batch of documents together with their reference summaries.

    Args:
        examples: Batch mapping with a "text" column (source documents) and a
            "summary" column (targets).

    Returns:
        The tokenizer output for the documents, truncated to
        MAX_INPUT_LENGTH, with an added "labels" entry holding the summary
        token ids truncated to MAX_TARGET_LENGTH.
    """
    documents = list(examples["text"])
    encoded = tokenizer(documents, max_length=MAX_INPUT_LENGTH, truncation=True)

    # Targets are tokenized separately because they use a shorter length limit.
    target_encoding = tokenizer(
        examples["summary"], max_length=MAX_TARGET_LENGTH, truncation=True
    )
    encoded["labels"] = target_encoding["input_ids"]
    return encoded

# train_dataset = load_chunks_optimized(os.path.join(CHUNKS_DIR, 'train'))
# train_dataset = train_dataset.map(preprocess_function, batched=True, num_proc=8)


# Seq2seq collator built from the tokenizer and the model; pads each batch to
# its longest sequence, rounded up to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    pad_to_multiple_of=8,            # improves performance on Tensor Cores
    padding='longest',
    max_length=MAX_INPUT_LENGTH
)
    
# Training configuration: evaluate and checkpoint every 500 steps and restore
# the checkpoint with the lowest eval_loss when training ends.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="steps",            # replaces the deprecated `evaluation_strategy` name
    eval_steps=500,
    save_steps=500,
    logging_steps=100,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE*4,  # larger batch for evaluation
    gradient_accumulation_steps=4,    # effective batch = BATCH_SIZE * 4 = 16 per device
    learning_rate=3e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    bf16=True,                        # bf16 mixed precision (needs hardware support)
    seed=SEED,
    warmup_steps=500,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    dataloader_num_workers=8,         # use more CPU cores for data loading
    dataloader_pin_memory=True,       # speeds up host-to-GPU transfers
    dataloader_prefetch_factor=2,     # batches prefetched per worker
    remove_unused_columns=True,       # drop columns the model does not accept
    optim="adamw_bnb_8bit",           # 8-bit optimizer
    report_to="none"                  # disable external experiment loggers
)

# Load the tokenized training chunks once and share the object below instead
# of reading the same directory twice.
train_dataset = load_chunks_optimized(os.path.join(CHUNKS_DIR, 'train'))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # NOTE(review): evaluation uses the training split, so eval_loss (and the
    # best-checkpoint choice based on it) does not measure generalisation —
    # TODO point this at a held-out split once one is tokenized.
    eval_dataset=train_dataset,
    data_collator=data_collator,
    # `tokenizer=` is deprecated for Trainer in favour of `processing_class=`.
    processing_class=tokenizer
)

# A keyboard/kernel interrupt stops training early; execution then continues
# with the save calls below. Any other exception propagates before them.
try:
    trainer.train()
except KeyboardInterrupt:
    print("\nTraining interrupted. Saving final model...")

# Save through the trainer into OUTPUT_DIR/final_model ...
trainer.save_model(os.path.join(OUTPUT_DIR, "final_model"))
# ... and additionally write the model into OUTPUT_DIR itself with safe_serialization enabled.
trainer.model.save_pretrained(OUTPUT_DIR, safe_serialization=True)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Error while downloading from https://cdn-lfs.hf.co/sshleifer/distilbart-cnn-12-6/3bac65d18c99463302d12ca75c2220ea714f9c81ce235f205fa818efe71df6ea?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27pytorch_model.bin%3B+filename%3D%22pytorch_model.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1745425783&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0NTQyNTc4M319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9zc2hsZWlmZXIvZGlzdGlsYmFydC1jbm4tMTItNi8zYmFjNjVkMThjOTk0NjMzMDJkMTJjYTc1YzIyMjBlYTcxNGY5YzgxY2UyMzVmMjA1ZmE4MThlZmU3MWRmNmVhP3Jlc3BvbnNlLWNvbnRlbnQtZGlzcG9zaXRpb249KiZyZXNwb25zZS1jb250ZW50LXR5cGU9KiJ9XX0_&Signature=Sdlz-jfGMGdSmjIi6B9MDw7ewCeGD2965OxOqQ

Loading 27 chunks from processed\train
Loading 27 chunks from processed\train


  trainer = Trainer(
Error while downloading from https://cdn-lfs.hf.co/sshleifer/distilbart-cnn-12-6/1e46814333b97dfa0f866f58fd15cd7b48ffbe7fd4c1a929caa5f95c7b2fa592?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1745425893&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0NTQyNTg5M319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9zc2hsZWlmZXIvZGlzdGlsYmFydC1jbm4tMTItNi8xZTQ2ODE0MzMzYjk3ZGZhMGY4NjZmNThmZDE1Y2Q3YjQ4ZmZiZTdmZDRjMWE5MjljYWE1Zjk1YzdiMmZhNTkyP3Jlc3BvbnNlLWNvbnRlbnQtZGlzcG9zaXRpb249KiJ9XX0_&Signature=IPIijnXglcdAgsyvqfnUvpQyBPUJdWCkiiZxI1lBKLILvIXCCYjC8zBRGuUG6ooubMs3gr%7EjeOOz2D7zL52S2qxWaFDi79M75W1IJKb6v2D2CgxylUdYhXkx0GDOI2yBk9NJ6qKFAhSPR%7EIMYGzxtJkq7ga9bHVc3bTWgTbvvEA51uh7g-YDFiubg12nK7WrVzv-6VoeR3HcGy4h74hToLwiL82Qx4x6Ibg1dikHukzmiaJreH5g7UiL0Rfw21GVX-tkw9lmE8ULW%7E-8o173A1cFaYlw2fckewvMhOhWLNSAJoR3tKK6va1T1h%7E5Hjmyi8TDLqAPlyHvyLPcxVULeg__&Key-Pa

Step,Training Loss,Validation Loss



Training interrupted. Saving final model...




In [2]:
import torch, transformers, accelerate
print(f"PyTorch: {torch.__version__}")        # expected: 2.3.0+
print(f"Transformers: {transformers.__version__}")  # 4.41.0+
print(f"Accelerate: {accelerate.__version__}")      # 0.29.3+

  from .autonotebook import tqdm as notebook_tqdm


PyTorch: 2.5.1+cu121
Transformers: 4.50.3
Accelerate: 1.6.0


In [1]:
import torch
print(torch.cuda.is_available())  # should be True
print(torch.version.cuda)

True
12.1
