In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split

In [3]:
# Read the dataset CSV from the Colab /content directory; later cells use its
# "Text" and "headline" columns.
df = pd.read_csv("/content/kg_dataset (1).csv")

In [4]:
# Drop rows containing any missing value, then drop exact duplicate rows.
df = df.dropna().drop_duplicates()

In [5]:
def clean_text(text):
    """Return `text` with each whitespace run collapsed to one space and both ends trimmed."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

# Normalize whitespace in the source-text and headline columns.
for column_name in ("Text", "headline"):
    df[column_name] = df[column_name].apply(clean_text)

In [6]:
# Keep only the rows whose "Text" is shorter than MAX_LEN characters.
MAX_LEN = 512
is_short_enough = df["Text"].str.len() < MAX_LEN
df = df[is_short_enough]

In [7]:
# Hold out 10% of the (text, headline) pairs for validation; the fixed
# random_state keeps the split reproducible across runs.
all_texts = df["Text"].tolist()
all_headlines = df["headline"].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts, all_headlines, test_size=0.1, random_state=42
)

print("Train size:", len(train_texts))
print("Validation size:", len(val_texts))

Train size: 1555
Validation size: 173


In [8]:
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
import torch

# Model selection: checkpoint identifier used to load both the tokenizer and the model.
model_name = "google/mt5-small"

# Load the tokenizer and the conditional-generation model for `model_name`.
tokenizer = MT5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'MT5Tokenizer'.
You are using the default legacy behaviour of the <class 'transformers.models.mt5.tokenization_mt5.MT5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only 

In [9]:
# Length budgets passed to the tokenizer as max_length (counted in tokens, not
# characters) for the source texts and for the target headlines.
MAX_INPUT_LENGTH = 512
MAX_TARGET_LENGTH = 64


def _encode(texts, max_length):
    """Tokenize a list of strings with truncation to `max_length` and padding enabled."""
    return tokenizer(texts, truncation=True, padding=True, max_length=max_length)


# Training split
train_encodings = _encode(train_texts, MAX_INPUT_LENGTH)
train_targets = _encode(train_labels, MAX_TARGET_LENGTH)

# Validation split
val_encodings = _encode(val_texts, MAX_INPUT_LENGTH)
val_targets = _encode(val_labels, MAX_TARGET_LENGTH)

In [10]:
class KyrgyzHeadlineDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenized source texts with tokenized target headlines.

    Args:
        encodings: Mapping with "input_ids" and "attention_mask", each indexable
            by example and holding that example's list of integers.
        targets: Mapping with "input_ids" for the target sequences and,
            optionally, an "attention_mask" whose zeros mark padded positions.
    """

    # Label value that the loss skips, so padded target positions do not
    # contribute to training (PyTorch cross-entropy's default ignore_index).
    IGNORE_INDEX = -100

    def __init__(self, encodings, targets):
        self.encodings = encodings
        self.targets = targets

    def __len__(self):
        """Return the number of examples (one per entry of the input ids)."""
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        """Return the tensors for example `idx`.

        Returns:
            dict with "input_ids", "attention_mask" and "labels" tensors. When the
            targets carry an attention mask, label positions where that mask is 0
            (padding) are replaced by IGNORE_INDEX; otherwise labels are returned
            unchanged.
        """
        labels = torch.tensor(self.targets["input_ids"][idx])
        if "attention_mask" in self.targets:
            target_mask = torch.tensor(self.targets["attention_mask"][idx])
            # Without this, the model is trained to predict pad tokens after the headline.
            labels = labels.masked_fill(target_mask == 0, self.IGNORE_INDEX)

        return {
            "input_ids": torch.tensor(self.encodings["input_ids"][idx]),
            "attention_mask": torch.tensor(self.encodings["attention_mask"][idx]),
            "labels": labels,
        }

# Wrap the tokenized train and validation splits as datasets (passed to the Trainer later).
train_dataset = KyrgyzHeadlineDataset(train_encodings, train_targets)
val_dataset = KyrgyzHeadlineDataset(val_encodings, val_targets)

In [12]:
from transformers import TrainingArguments

# Training configuration; output (including checkpoints) goes under ./results.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=5e-5,
    logging_dir="./logs",
    save_steps=500,              # save a checkpoint every 500 steps
    logging_steps=100,           # log every 100 steps
    do_train=True,
    do_eval=True,
    # NOTE(review): no evaluation strategy is set in this call, so eval_steps may
    # have no effect during training — confirm against the installed transformers version.
    eval_steps=500,              # intended: evaluate every 500 steps
)

In [13]:
import evaluate
import numpy as np

# Load the ROUGE and BLEU metrics through the `evaluate` library; both are used by compute_metrics.
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

def compute_metrics(eval_preds):
    """Score predicted headlines against the references with ROUGE-1, ROUGE-L and BLEU.

    Args:
        eval_preds: Pair ``(preds, labels)``. ``preds`` may be token ids, raw
            logits with a trailing vocabulary axis, or a tuple whose first
            element is one of those. ``labels`` are reference token ids and may
            contain negative "ignore" values such as -100.

    Returns:
        dict with the keys "rouge1", "rougeL" and "bleu".
    """
    preds, labels = eval_preds

    # A plain Trainer hands over the model outputs as-is: possibly a tuple with
    # the logits first, and possibly full logits rather than token ids.
    if isinstance(preds, tuple):
        preds = preds[0]
    preds = np.asarray(preds)
    if preds.ndim == 3:
        preds = preds.argmax(axis=-1)

    # Negative ids (-100 ignore/padding markers) cannot be decoded; map them to
    # the pad token, which skip_special_tokens then removes.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds < 0, pad_id, preds)
    labels = np.asarray(labels)
    labels = np.where(labels < 0, pad_id, labels)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Strip surrounding whitespace
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    # ROUGE: the metric's default tokenizer keeps only ASCII letters and digits,
    # which erases Cyrillic text entirely, so use lower-cased whitespace tokens.
    rouge_result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        tokenizer=lambda text: text.lower().split(),
    )

    # BLEU: the `evaluate` metric expects raw strings (it tokenizes internally),
    # with a list of reference strings per prediction. It divides by the total
    # prediction length, so report 0.0 when every prediction is empty.
    if any(pred.split() for pred in decoded_preds):
        bleu_result = bleu.compute(
            predictions=decoded_preds,
            references=[[label] for label in decoded_labels],
        )
        bleu_score = bleu_result["bleu"]
    else:
        bleu_score = 0.0

    return {
        "rouge1": rouge_result["rouge1"],
        "rougeL": rouge_result["rougeL"],
        "bleu": bleu_score
    }

In [15]:
from transformers import Trainer, TrainingArguments

def _logits_to_token_ids(logits, labels):
    """Reduce evaluation logits to greedy token ids before the Trainer stores them.

    Without this hook the Trainer keeps the full-vocabulary logits of every
    evaluation example in memory and passes them to compute_metrics, which
    needs token ids to decode.

    Args:
        logits: Model outputs for one evaluation batch; either the logits tensor
            or a tuple whose first element is the logits tensor.
        labels: Labels of the batch (unused; required by the hook signature).

    Returns:
        Tensor of argmax token ids over the last (vocabulary) axis. These are
        teacher-forced predictions, not free-running generation.
    """
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=_logits_to_token_ids,
)

trainer.train()

  trainer = Trainer(


<IPython.core.display.Javascript object>

KeyboardInterrupt: 

In [27]:
import os
# Directory scanned for saved checkpoints (same path as output_dir in training_args).
checkpoint_dir = "./results"

In [28]:
# Collect "checkpoint-<step>" entries and order them by step number.
# Entries without a numeric step suffix are skipped so int() cannot fail, and a
# missing checkpoint directory is treated the same as an empty one.
_checkpoint_name = re.compile(r"checkpoint-(\d+)")
if os.path.isdir(checkpoint_dir):
    checkpoints = [d for d in os.listdir(checkpoint_dir) if _checkpoint_name.fullmatch(d)]
else:
    checkpoints = []
checkpoints.sort(key=lambda name: int(name.rsplit("-", 1)[-1]))
if checkpoints:
    print(f"Последний чекпойнт: {checkpoints[-1]}")
else:
    print("Чекпойнты не найдены.")

Чекпойнты не найдены.


In [None]:
# Run evaluation with the trainer and print the returned metrics.
metrics = trainer.evaluate()
print(metrics)

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


In [None]:
# Save the model and the tokenizer into the same local directory.
model.save_pretrained("my_kyrgyz_headline_model")
tokenizer.save_pretrained("my_kyrgyz_headline_model")

In [None]:
def generate_headline(text, max_input_length=512, max_length=30, num_beams=4):
    """Generate a headline for `text` using beam search.

    Args:
        text: Source text to summarize into a headline.
        max_input_length: Maximum number of input tokens; longer inputs are
            truncated. Defaults to 512, the input limit used when tokenizing
            the training data.
        max_length: Maximum length passed to `model.generate` for the output.
        num_beams: Number of beams used by the search.

    Returns:
        The first generated sequence decoded to a string, special tokens removed.
    """
    # An explicit max_length makes truncation effective regardless of the
    # tokenizer's own configured limit.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_input_length,
    ).to(model.device)
    # Forward the attention mask too, so padded positions (if any) are not
    # attended to during generation.
    summary_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Example: generate and print a headline for one sample text.
sample_text = "Ысык-Көл облусуна караштуу Тоң районунун Кажы-Сай айылындагы Пушкин атындагы орто мектебинде Тоң раойнунун баш имам-хатиби Таалайбек ажы Акунов ата-энелер менен жолугушуу өткөрдү. Жыйында баш имам-хатиб салттуу ислам, терроризм, экстремизм, окуучулар арасында рэкетчилик, бала тарбиясы туурасында лекция окуду."
print(generate_headline(sample_text))
