In [6]:
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

sns.set(palette='summer')

import transformers
from datasets import load_dataset
import evaluate

from transformers import AutoTokenizer
import re

from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

import gzip
import json
from pathlib import Path
import shutil

In [None]:
def convert_to_jsonl(input_path, output_path):

    """Конвертирует файл в JSONL формат с обработкой ошибок"""
    
    with open(input_path, 'r', encoding='utf-8') as f_in, \
         open(output_path, 'w', encoding='utf-8') as f_out:

        for line in f_in:
            line = line.strip()
            if not line:
                continue

            try:
                data = json.loads(line)
                json.dump(data, f_out, ensure_ascii=False)
                f_out.write('\n')
            except json.JSONDecodeError:
                if line.startswith('['):
                    try:
                        for item in json.loads(line):
                            json.dump(item, f_out, ensure_ascii=False)
                            f_out.write('\n')
                    except:
                        print(f"Failed to parse array in: {input_path}")
                else:
                    print(f"Invalid JSON line skipped in: {input_path}")

def process_gz_files(source_root=".", target_root="converted_data"):
    source_path = Path(source_root)
    target_path = Path(target_root)

    for gz_file in source_path.rglob("*.gz"):
        try:
            relative_path = gz_file.relative_to(source_path)
            output_dir = target_path / relative_path.parent
            output_dir.mkdir(parents=True, exist_ok=True)

            temp_file = output_dir / gz_file.name
            final_file = output_dir / gz_file.name.replace(".gz", ".json")

            with gzip.open(gz_file, 'rb') as f_in:
                with open(temp_file, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)

            convert_to_jsonl(temp_file, final_file)
            temp_file.unlink()

        except Exception as e:
            print(f"Error processing {gz_file}: {str(e)}")

# process_gz_files()

Conversion complete!


In [None]:
def filter_and_save_records(source_root="converted_data", target_root="filtered_data"):

    '''Getting filtered data'''
    
    source_path = Path(source_root)
    target_path = Path(target_root)
    
    processed_files = set(target_path.rglob("*.json"))
    
    for src_file in source_path.rglob("*.json"):
        relative_path = src_file.relative_to(source_path)
        dst_file = target_path / relative_path
        
        if dst_file.exists():
            continue
            
        dst_file.parent.mkdir(parents=True, exist_ok=True)
        
        try:
            with open(src_file, 'r', encoding='utf-8') as f_in, \
                 open(dst_file, 'w', encoding='utf-8') as f_out:

                filtered_count = 0
                total_count = 0
                
                for line in f_in:
                    line = line.strip()
                    total_count += 1
                    if not line:
                        continue

                    try:
                        record = json.loads(line)
                        abstract = record.get('abstract', '')
                        
                        if len(abstract.split()) >= 200:
                            json.dump(record, f_out, ensure_ascii=False)
                            f_out.write('\n')
                            filtered_count += 1
                            
                    except json.JSONDecodeError:
                        continue
                    except Exception as e:
                        print(f"Error processing record: {e}")

                
        except Exception as e:
            print(f"Error processing file {src_file}: {e}")
            if dst_file.exists():
                dst_file.unlink()

# filter_and_save_records()

Filtering complete!


In [None]:
from datasets import load_dataset, concatenate_datasets
import aiohttp
from pathlib import Path
from datasets import Dataset

arxiv = load_dataset("scientific_papers", "arxiv", 
                     split="train", 
                     trust_remote_code=True, 
                     storage_options={'client_kwargs': {'timeout': aiohttp.ClientTimeout(total=3600)}},
                     cache_dir=r"C:\Users\denis\.cache\huggingface")


arxiv.remove_columns(['publication_number', 'application_number', 'section_names'])

def load_filtered_dataset(data_root="filtered_data"):
    data_path = Path(data_root)
    return Dataset.from_json([
        str(p) for p in data_path.rglob("*.json")
    ])

patent_dataset = load_filtered_dataset()


patent_dataset.remove_columns(['publication_number', 'application_number'])
patent_dataset = patent_dataset.rename_column('abstract', 'summary')
patent_dataset = patent_dataset.rename_column('description', 'article')

combined_dataset = concatenate_datasets([arxiv, patent_dataset])

Downloading data: 100%|██████████| 3.62G/3.62G [07:54<00:00, 7.64MB/s] 
Downloading data: 100%|██████████| 880M/880M [01:50<00:00, 7.99MB/s] 
Generating train split: 100%|██████████| 203037/203037 [01:48<00:00, 1874.25 examples/s]
Generating validation split: 100%|██████████| 6436/6436 [00:04<00:00, 1488.30 examples/s]
Generating test split: 100%|██████████| 6440/6440 [00:03<00:00, 1644.19 examples/s]


In [4]:
combined_dataset

Dataset({
    features: ['article', 'abstract', 'section_names', 'publication_number', 'summary', 'application_number'],
    num_rows: 255226
})

In [7]:
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

def preprocess_function(examples):
    inputs = [re.sub(r'<[^>]+>|http\S+', '', text) for text in examples["article"]]
    targets = [re.sub(r'[\U00010000-\U0010ffff]', '', text) for text in examples["summary"]]
    
    model_inputs = tokenizer(
        inputs,
        max_length=1024,
        truncation=True,
        add_special_tokens=True,
    )
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=256,
            truncation=True,
            add_special_tokens=True,
        )
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [None]:
data_full = preprocess_function(combined_dataset)

In [None]:

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    num_train_epochs=5,
    predict_with_generate=True,
    generation_max_length=128,  # Максимальная длина суммы
)

trainer = Seq2SeqTrainer(
    # model=model,
    args=training_args,
    # train_dataset=train_dataset,
    # eval_dataset=val_dataset,
)
trainer.train()