In [47]:
import tensorflow as tf
from datasets import load_dataset
from transformers import TFMT5ForConditionalGeneration, MT5Tokenizer, DataCollatorForSeq2Seq
from tensorflow.keras.optimizers import Adam


In [48]:
!pip install datasets



In [49]:
tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")

In [50]:
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd

# Hugging Face'ten veri setini y√ºkleme
dataset = load_dataset("batubayk/TR-News")  # Kendi veri setinizin adƒ±nƒ± yazƒ±n

# Train ve validation setlerini alƒ±n
train_dataset = dataset['train']
val_dataset = dataset['validation']
test_dataset = dataset['test']
# Train dataset'in yarƒ±sƒ±nƒ± alma
train_half = train_dataset.select(range(len(train_dataset) // 2))


# T√ºm veri setlerini bir DatasetDict'e d√∂n√º≈üt√ºrme
combined_datasets = DatasetDict({
    'train': train_half,       # Yarƒ±ya indirgenmi≈ü train set
    'validation': val_dataset, # Validation set
    'test': test_dataset       # Test set
})

# Birle≈ütirilmi≈ü veri setini g√∂r√ºnt√ºleme
print(combined_datasets)


DatasetDict({
    train: Dataset({
        features: ['abstract', 'author', 'content', 'date', 'source', 'tags', 'title', 'topic', 'url'],
        num_rows: 138786
    })
    validation: Dataset({
        features: ['abstract', 'author', 'content', 'date', 'source', 'tags', 'title', 'topic', 'url'],
        num_rows: 14610
    })
    test: Dataset({
        features: ['abstract', 'author', 'content', 'date', 'source', 'tags', 'title', 'topic', 'url'],
        num_rows: 15379
    })
})


In [51]:
def tokenize_sample_data(data):

    input_feature = tokenizer(data['content'], truncation=True, max_length=1024)
    label = tokenizer(data['abstract'], truncation=True, max_length=128)
    return {
        "input_ids" : input_feature['input_ids'],
        "attention_mask" : input_feature['attention_mask'],
        "labels" : label['input_ids'],
    }

tokenized_ds = combined_datasets.map(
    tokenize_sample_data,
    remove_columns= ['abstract','author','content','date','source','tags','title','topic','url'],
    batched=True,
    batch_size= 512
)
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 138786
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14610
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 15379
    })
})

In [52]:
'''

from transformers import MT5Tokenizer, MT5ForConditionalGeneration

# 1. Tokenizer ve modelin y√ºklenmesi
model_name = "google/mt5-small"
tokenizer = MT5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

# 2. √ñzetlenmek istenen T√ºrk√ße metin
turkish_text = """
ƒ∞klim deƒüi≈üikliƒüi, k√ºresel sƒ±caklƒ±klarƒ±n artƒ±≈üƒ±yla birlikte ortaya √ßƒ±kan √ßevresel, sosyal ve ekonomik sorunlara neden olmaktadƒ±r.
√ñzellikle kuraklƒ±k, sel ve orman yangƒ±nlarƒ± gibi doƒüal afetlerin sƒ±klƒ±ƒüƒ± artarken, tarƒ±msal √ºretimde de ciddi d√º≈ü√º≈üler ya≈üanmaktadƒ±r.
Bu durum, gƒ±da g√ºvenliƒüini tehdit etmekte ve toplumlarƒ± olumsuz y√∂nde etkilemektedir.
"""

# 3. √ñzetleme i√ßin giri≈ü formatƒ±: 'summarize:' √∂n ekini ekliyoruz
input_text = f"summarize: {turkish_text}"

# 4. Tokenizer ile metni encode etme
input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)

# 5. Modeli kullanarak √∂zetleme yapma
summary_ids = model.generate(
    input_ids,
    max_length=150,
    min_length=30,
    length_penalty=2.0,
    num_beams=4,
    early_stopping=True
)

# 6. √ñzetlenen metni decode etme
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# 7. Sonucu yazdƒ±rma
print("√ñzetlenen Metin:")
print(summary)
'''

'\n\nfrom transformers import MT5Tokenizer, MT5ForConditionalGeneration\n\n# 1. Tokenizer ve modelin y√ºklenmesi\nmodel_name = "google/mt5-small"\ntokenizer = MT5Tokenizer.from_pretrained(model_name)\nmodel = MT5ForConditionalGeneration.from_pretrained(model_name)\n\n# 2. √ñzetlenmek istenen T√ºrk√ße metin\nturkish_text = """\nƒ∞klim deƒüi≈üikliƒüi, k√ºresel sƒ±caklƒ±klarƒ±n artƒ±≈üƒ±yla birlikte ortaya √ßƒ±kan √ßevresel, sosyal ve ekonomik sorunlara neden olmaktadƒ±r.\n√ñzellikle kuraklƒ±k, sel ve orman yangƒ±nlarƒ± gibi doƒüal afetlerin sƒ±klƒ±ƒüƒ± artarken, tarƒ±msal √ºretimde de ciddi d√º≈ü√º≈üler ya≈üanmaktadƒ±r.\nBu durum, gƒ±da g√ºvenliƒüini tehdit etmekte ve toplumlarƒ± olumsuz y√∂nde etkilemektedir.\n"""\n\n# 3. √ñzetleme i√ßin giri≈ü formatƒ±: \'summarize:\' √∂n ekini ekliyoruz\ninput_text = f"summarize: {turkish_text}"\n\n# 4. Tokenizer ile metni encode etme\ninput_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)\n\n# 5. Modeli kullana

In [53]:
import torch
from transformers import AutoConfig, AutoModelForSeq2SeqLM

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model konfig√ºrasyonu
mt5_config = AutoConfig.from_pretrained(
    "google/mt5-small",
    max_length=128,
    length_penalty=0.6,
    no_repeat_ngram_size=2,
    num_beams=15,
)

# PyTorch tabanlƒ± modeli y√ºkleme ve GPU/CPU'ya ta≈üƒ±ma
model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small", config=mt5_config).to(device)


In [54]:
data_collator = DataCollatorForSeq2Seq(
  tokenizer,
  model=model,
  return_tensors="pt")

In [55]:
import evaluate
import numpy as np
from nltk.tokenize import RegexpTokenizer

rouge_metric = evaluate.load("rouge")

# define function for custom tokenization
def tokenize_sentence(arg):
  encoded_arg = tokenizer(arg)
  return tokenizer.convert_ids_to_tokens(encoded_arg.input_ids)

# define function to get ROUGE scores with custom tokenization
def metrics_func(eval_arg):
  preds, labels = eval_arg
  # Replace -100
  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
  # Convert id tokens to text
  text_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
  text_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
  # Insert a line break (\n) in each sentence for ROUGE scoring
  # (Note : Please change this code, when you perform on other languages except for Japanese)
  text_preds = [(p if p.endswith(("!", "ÔºÅ", "?", "Ôºü", "„ÄÇ")) else p + "„ÄÇ") for p in text_preds]
  text_labels = [(l if l.endswith(("!", "ÔºÅ", "?", "Ôºü", "„ÄÇ")) else l + "„ÄÇ") for l in text_labels]
  sent_tokenizer_jp = RegexpTokenizer(u'[^!ÔºÅ?Ôºü„ÄÇ]*[!ÔºÅ?Ôºü„ÄÇ]')
  text_preds = ["\n".join(np.char.strip(sent_tokenizer_jp.tokenize(p))) for p in text_preds]
  text_labels = ["\n".join(np.char.strip(sent_tokenizer_jp.tokenize(l))) for l in text_labels]
  # compute ROUGE score with custom tokenization
  return rouge_metric.compute(
    predictions=text_preds,
    references=text_labels,
    tokenizer=tokenize_sentence
  )

In [56]:
!pip install evaluate
!pip install rouge_score



In [57]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="mt5-summarize-ja",
    log_level="error",
    num_train_epochs=5,  # Eƒüitim s√ºresini kƒ±saltmak gerekebilir
    learning_rate=3e-5,  # Daha d√º≈ü√ºk √∂ƒürenme oranƒ±, b√ºy√ºk veri setlerinde genellikle daha iyidir
    lr_scheduler_type="linear",
    warmup_steps=1000,  # Daha fazla warmup adƒ±mƒ±, b√ºy√ºk veri setleri i√ßin faydalƒ± olabilir
    optim="adafactor",
    weight_decay=0.01,
    per_device_train_batch_size=8,  # Batch boyutunu artƒ±rmak daha iyi sonu√ß verebilir
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,  # B√ºy√ºk bir batch boyutu elde etmek i√ßin gradient accumulation kullanƒ±mƒ±
    evaluation_strategy="steps",
    eval_steps=500,  # Deƒüerlendirme adƒ±mlarƒ±nƒ± artƒ±rma
    predict_with_generate=True,
    generation_max_length=128,
    save_steps=1000,  # Daha sƒ±k model kaydetme
    logging_steps=50,  # Daha az sƒ±k loglama
    push_to_hub=False,
)




In [58]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=metrics_func,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"].select(range(20)),
)

trainer.train()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


{'loss': 24.5112, 'grad_norm': 16774.7421875, 'learning_rate': 1.5e-06, 'epoch': 0.023056083924145484}
{'loss': 23.6268, 'grad_norm': 7427.5166015625, 'learning_rate': 3e-06, 'epoch': 0.04611216784829097}
{'loss': 22.1511, 'grad_norm': 11378.0224609375, 'learning_rate': 4.5e-06, 'epoch': 0.06916825177243645}
{'loss': 20.1197, 'grad_norm': 19247.185546875, 'learning_rate': 6e-06, 'epoch': 0.09222433569658194}
{'loss': 17.8633, 'grad_norm': 14236.763671875, 'learning_rate': 7.5e-06, 'epoch': 0.11528041962072742}
{'loss': 15.1001, 'grad_norm': 3186.00537109375, 'learning_rate': 9e-06, 'epoch': 0.1383365035448729}
{'loss': 12.5646, 'grad_norm': 2478.00537109375, 'learning_rate': 1.05e-05, 'epoch': 0.16139258746901838}
{'loss': 10.0196, 'grad_norm': 1175.4456787109375, 'learning_rate': 1.2e-05, 'epoch': 0.18444867139316387}
{'loss': 7.8663, 'grad_norm': 529.7356567382812, 'learning_rate': 1.3500000000000001e-05, 'epoch': 0.20750475531730936}
{'loss': 6.6431, 'grad_norm': 198.30833435058594,



{'eval_loss': 3.2425925731658936, 'eval_rouge1': 0.27868188998239907, 'eval_rouge2': 0.1480487905727444, 'eval_rougeL': 0.21579057046004535, 'eval_rougeLsum': 0.21626552718006642, 'eval_runtime': 33.5535, 'eval_samples_per_second': 0.596, 'eval_steps_per_second': 0.149, 'epoch': 0.23056083924145485}
{'loss': 5.8539, 'grad_norm': 146.40818786621094, 'learning_rate': 1.65e-05, 'epoch': 0.2536169231656003}
{'loss': 5.1889, 'grad_norm': 111.25946044921875, 'learning_rate': 1.8e-05, 'epoch': 0.2766730070897458}
{'loss': 4.7205, 'grad_norm': 1226.7098388671875, 'learning_rate': 1.95e-05, 'epoch': 0.2997290910138913}
{'loss': 4.4035, 'grad_norm': 27.96617317199707, 'learning_rate': 2.1e-05, 'epoch': 0.32278517493803677}
{'loss': 4.227, 'grad_norm': 98.8541259765625, 'learning_rate': 2.25e-05, 'epoch': 0.34584125886218225}
{'loss': 3.9979, 'grad_norm': 18.958141326904297, 'learning_rate': 2.4e-05, 'epoch': 0.36889734278632774}
{'loss': 3.901, 'grad_norm': 17.45825958251953, 'learning_rate': 2.



{'loss': 3.5721, 'grad_norm': 17.894153594970703, 'learning_rate': 2.9847560975609756e-05, 'epoch': 0.4841777624070552}
{'loss': 3.5084, 'grad_norm': 16.740772247314453, 'learning_rate': 2.9695121951219515e-05, 'epoch': 0.5072338463312006}
{'loss': 3.4453, 'grad_norm': 14.236841201782227, 'learning_rate': 2.954268292682927e-05, 'epoch': 0.5302899302553461}
{'loss': 3.3784, 'grad_norm': 14.119400978088379, 'learning_rate': 2.9390243902439022e-05, 'epoch': 0.5533460141794916}
{'loss': 3.3471, 'grad_norm': 14.839422225952148, 'learning_rate': 2.923780487804878e-05, 'epoch': 0.5764020981036371}
{'loss': 3.3048, 'grad_norm': 18.75555992126465, 'learning_rate': 2.9085365853658536e-05, 'epoch': 0.5994581820277826}
{'loss': 3.2665, 'grad_norm': 14.308640480041504, 'learning_rate': 2.8932926829268295e-05, 'epoch': 0.622514265951928}
{'loss': 3.191, 'grad_norm': 12.731827735900879, 'learning_rate': 2.878048780487805e-05, 'epoch': 0.6455703498760735}
{'loss': 3.1721, 'grad_norm': 12.8121213912963

TrainOutput(global_step=10840, training_loss=3.2928555238730794, metrics={'train_runtime': 43452.6706, 'train_samples_per_second': 15.97, 'train_steps_per_second': 0.249, 'train_loss': 3.2928555238730794, 'epoch': 4.999250677272466})

In [59]:
# Eƒüitilen modeli kaydet
output_dir = "./trained_model"  # Kaydedilecek dizin
trainer.save_model(output_dir)  # Modeli ve tokenizer'ƒ± kaydeder

# Tokenizer'ƒ± ayrƒ±ca kaydetmek isterseniz
tokenizer.save_pretrained(output_dir)

print(f"Model ve tokenizer {output_dir} dizinine kaydedildi.")

Model ve tokenizer ./trained_model dizinine kaydedildi.


In [60]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Eƒüitilen modeli ve tokenizer'ƒ± y√ºkle
model = AutoModelForSeq2SeqLM.from_pretrained("./trained_model")
tokenizer = AutoTokenizer.from_pretrained("./trained_model")




In [68]:
test_text = "D√ºnyamƒ±zda 1900‚Äôl√º yƒ±llardan g√ºn√ºm√ºze kadar dil √∂ƒüretiminde √ße≈üitli metinler kullanƒ±lmƒ±≈ütƒ±r. Bunlar ‚Äú edebi metinler, √ºretilmi≈ü metinler, √∂zg√ºn ve √∂zel metinler ‚Äù ba≈ülƒ±klarƒ± altƒ±nda toplanmƒ±≈ütƒ±r. Metinlerin se√ßimi dil √∂ƒüretim yakla≈üƒ±m ve y√∂ntemlerine g√∂re deƒüi≈ümektedir. Her yakla≈üƒ±m kendine √∂zg√º metin kullanmƒ±≈ütƒ±r. Geleneksel yakla≈üƒ±mda dil bilgisi kurallarƒ±, atas√∂zleri, edebiyat, genel k√ºlt√ºr gibi konularƒ±n √∂ƒüretimine aƒüƒ±rlƒ±k verildiƒüinden edebi metinler kullanƒ±lmƒ±≈ütƒ±r. Davranƒ±≈ü√ßƒ± yakla≈üƒ±mda dil davranƒ±≈ü olarak ele alƒ±nmƒ±≈ü, tekrar, taklit ve ezberleme yoluyla √∂ƒüretilmi≈ütir. Bu yakla≈üƒ±mda edebi metinler yerine √ºretilmi≈ü metinler kullanƒ±lmƒ±≈ütƒ±r. Bili≈üsel yakla≈üƒ±mda ‚Äúdil ileti≈üim aracƒ±dƒ±r‚Äù g√∂r√º≈ü√º yayƒ±lmƒ±≈ü ve √∂zg√ºn metinler kullanƒ±lmaya ba≈ülanmƒ±≈ütƒ±r. "


In [69]:
inputs = tokenizer(
    test_text,
    return_tensors="pt",  # PyTorch tens√∂rleri olarak d√∂ner
    max_length=1024,      # Modelin giri≈ü sƒ±nƒ±rƒ±
    truncation=True       # √áok uzun metinleri keser
)


In [70]:
# √ñzetleme
output_ids = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=128,  # √ñzet uzunluƒüu sƒ±nƒ±rƒ±
    num_beams=4,     # Beam search kullanƒ±mƒ±
    no_repeat_ngram_size=2,  # Tekrar eden n-gramlarƒ± engeller
    length_penalty=0.6       # Kƒ±sa √∂zetlere √∂ncelik
)

# √ñzet √ß√∂z√ºmleme
summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("√ñzet:", summary)


√ñzet: D√ºnyamƒ±zda 1900‚Äôl√º yƒ±llardan g√ºn√ºm√ºze kadar dil √∂ƒüretiminde √ße≈üitli metinler kullanƒ±lmƒ±≈ütƒ±r.


In [71]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
