In [None]:
import pandas as pd
import json
import os
import ast
import numpy as np
# import matplotlib.pyplot as plt
from datasets import Dataset, load_dataset, concatenate_datasets 
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments, BertModel, DebertaV2Model, AutoModelForCausalLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch.nn.functional as F
import torch.nn as nn
import torch
import random
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from collections import Counter
from copy import deepcopy
from sklearn.metrics import precision_recall_curve, auc, roc_auc_score
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR
from sklearn.metrics.pairwise import cosine_similarity
# from peft import get_peft_config, get_peft_model, AdaLoraConfig, TaskType, LoraConfig
# from peft import PeftModel, PeftConfig

In [3]:
def set_seed(seed: int = 42) -> None:
    """Seed the random number generators used in this notebook.

    Seeds NumPy, Python's ``random`` module and PyTorch (CPU and every CUDA
    device), and switches cuDNN to deterministic, non-benchmarking mode.

    Args:
        seed: Value passed to each generator.
    """
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    # Seed every GPU: torch.cuda.manual_seed only covers the current device.
    # Safe to call on a machine without CUDA.
    torch.cuda.manual_seed_all(seed)
    # When running on the cuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # NOTE: str hashing of the running interpreter is fixed at start-up, so
    # this value only reaches processes spawned after this point.
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")
set_seed()

Random seed set as 42


In [4]:
# Load the 'IlyaGusev/gazeta' dataset with `datasets.load_dataset`.
dataset = load_dataset('IlyaGusev/gazeta')

No config specified, defaulting to: gazeta/default
Reusing dataset gazeta (/home/user/.cache/huggingface/datasets/IlyaGusev___gazeta/default/2.0.0/e2d171980aa248bc22e0af4f8485ad69071fc8e5f3d54a253c71eb434f6694bd)


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Show the available splits, their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title', 'date', 'url'],
        num_rows: 60964
    })
    test: Dataset({
        features: ['text', 'summary', 'title', 'date', 'url'],
        num_rows: 6793
    })
    validation: Dataset({
        features: ['text', 'summary', 'title', 'date', 'url'],
        num_rows: 6369
    })
})

In [6]:
# Checkpoint identifier shared by the tokenizer and the model.
model_name = 'ai-forever/rugpt3small_based_on_gpt2'

# Tokenizer with explicit BOS/EOS/PAD strings; padding is appended on the right.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    bos_token='<s>',
    eos_token='</s>',
    pad_token='<pad>',
    padding_side='right',
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
def prepare_features(samples, max_length=2048):
    """Tokenize a batch as ``<article> <s><summary></s>`` for causal-LM training.

    The article and the ``' <s>' + summary + '</s>'`` suffix are tokenized
    separately. When a pair is longer than ``max_length`` tokens, the article
    is shortened so that the summary and its closing ``</s>`` are kept.
    Right-truncating the joined string would instead cut off the summary,
    which is the part the model is supposed to learn to generate.

    No padding is added here: sequences keep their own length, and padding is
    left to the data collator that assembles each mini-batch.

    Args:
        samples: Batch dict with parallel lists under 'text' and 'summary'.
        max_length: Maximum number of tokens per encoded example.

    Returns:
        Dict with 'input_ids' and 'attention_mask', one list per example.
    """
    suffixes = [' <s>' + summary + '</s>' for summary in samples['summary']]
    # add_special_tokens=False so that concatenating the two encodings cannot
    # duplicate any tokens the tokenizer might otherwise add automatically.
    article_ids = tokenizer(samples['text'], add_special_tokens=False)['input_ids']
    suffix_ids = tokenizer(suffixes, add_special_tokens=False)['input_ids']

    input_ids = []
    for body, tail in zip(article_ids, suffix_ids):
        # A suffix longer than the whole budget is itself cut from the right.
        tail = tail[:max_length]
        budget = max_length - len(tail)
        input_ids.append(body[:budget] + tail)

    return {
        'input_ids': input_ids,
        'attention_mask': [[1] * len(ids) for ids in input_ids],
    }

In [None]:
# Tokenize the dataset by calling prepare_features on batches of rows.
# NOTE: this rebinds `dataset`, so re-running the cell maps the already
# mapped data again.
dataset = dataset.map(prepare_features, batched=True)
# Display the result to inspect the columns after mapping.
dataset

In [11]:
# DEVICE_MAP is not defined in any visible cell, so on a fresh kernel this
# cell raised NameError. Keep a value defined elsewhere if there is one;
# otherwise fall back to None, which is from_pretrained's own default.
DEVICE_MAP = globals().get('DEVICE_MAP')

# Load the causal-LM weights for the checkpoint named in `model_name`.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map=DEVICE_MAP)

# The tokenizer was created with explicit special tokens. If that made it
# larger than the embedding matrix, grow the embeddings so every token id is
# valid; this is a no-op when the tokens already exist in the checkpoint.
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))

In [12]:
# Hyper-parameters for fine-tuning; checkpoints go to models/rugpt3small.
training_args = Seq2SeqTrainingArguments(
    output_dir="models/rugpt3small",
    # Optimisation
    optim='adamw_torch',
    learning_rate=5e-5,
    weight_decay=0.001,
    num_train_epochs=5,
    fp16=False,
    # Batching: 2 samples per device x 16 accumulation steps per update
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=16,
    # Evaluate, log and checkpoint once per epoch; keep a single checkpoint
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=1,
    load_best_model_at_end=False,
    # No external experiment tracker, no generation during evaluation
    report_to='none',
    predict_with_generate=False,
)

In [13]:
import transformers

# Wire the model, arguments and tokenized splits into a trainer.
# mlm=False makes the collator build causal (next-token) language-modeling
# batches rather than masked-LM ones.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    data_collator=transformers.DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    ),
    eval_dataset=dataset['validation'],
    train_dataset=dataset['train'],
)

In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()