<a href="https://colab.research.google.com/github/buketugurlu/LangTutorAI/blob/main/langtutorai2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install datasets


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [4]:
import os
import torch
from datasets import load_dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW, get_scheduler
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

# 1. Cihaz Seçimi
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Kullanılan cihaz: {device}")

# 2. Model ve Tokenizer Yükleme
model_name = "modfiededition/t5-base-fine-tuned-on-jfleg"
model = T5ForConditionalGeneration.from_pretrained(model_name, from_tf=True).to(device)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# 3. Veri Setini Yükleme ve İşleme
dataset = load_dataset("jfleg", split="validation")
data = [{"input": f"grammar: {x['sentence']}", "target": x["corrections"][0]} for x in dataset]

# Eğitim ve doğrulama setlerini ayır
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Padding optimizasyonu
class GrammarCorrectionDataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        source = self.data[idx]["input"]
        target = self.data[idx]["target"]

        source_enc = self.tokenizer(
            source, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt"
        )
        target_enc = self.tokenizer(
            target, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt"
        )

        return {
            "input_ids": source_enc["input_ids"].squeeze(0),
            "attention_mask": source_enc["attention_mask"].squeeze(0),
            "labels": target_enc["input_ids"].squeeze(0)
        }

def collate_fn(batch):
    input_ids = pad_sequence([b["input_ids"] for b in batch], batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = pad_sequence([b["attention_mask"] for b in batch], batch_first=True, padding_value=0)
    labels = pad_sequence([b["labels"] for b in batch], batch_first=True, padding_value=-100)
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

train_dataset = GrammarCorrectionDataset(train_data, tokenizer)
val_dataset = GrammarCorrectionDataset(val_data, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=16, collate_fn=collate_fn)

# 4. Eğitim Ayarları
optimizer = AdamW(model.parameters(), lr=3e-5)
num_training_steps = len(train_loader) * 3  # epoch sayısı
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
epochs = 3

# 5. Eğitim Fonksiyonları
def train_epoch(model, data_loader, optimizer, device, lr_scheduler):
    model.train()
    total_loss = 0
    loop = tqdm(data_loader, desc="Eğitim")
    for batch in loop:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        loop.set_postfix(loss=loss.item())
    return total_loss / len(data_loader)

def eval_epoch(model, data_loader, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        loop = tqdm(data_loader, desc="Doğrulama")
        for batch in loop:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            total_loss += loss.item()
            loop.set_postfix(loss=loss.item())
    return total_loss / len(data_loader)

# 6. Eğitim Döngüsü
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    train_loss = train_epoch(model, train_loader, optimizer, device, lr_scheduler)
    val_loss = eval_epoch(model, val_loader, device)
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Validation Loss: {val_loss:.4f}")

# 7. Model Kaydetme
model_dir = "t5_fine_tuned_grammar_optimized"
os.makedirs(model_dir, exist_ok=True)
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

# 8. Model Testi
def test_sentence(sentence):
    input_text = f"grammar: {sentence}"
    input_ids = tokenizer(input_text, return_tensors="pt", max_length=128, truncation=True).input_ids.to(device)
    outputs = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

example_sentence = "She go to school every day and like to learn new things."
corrected_sentence = test_sentence(example_sentence)

print(f"Orijinal Cümle: {example_sentence}")
print(f"Düzeltilmiş Cümle: {corrected_sentence}")

Kullanılan cihaz: cuda


tf_model.h5:   0%|          | 0.00/892M [00:00<?, ?B/s]

All TF 2.0 model weights were used when initializing T5ForConditionalGeneration.

Some weights of T5ForConditionalGeneration were not initialized from the TF 2.0 model and are newly initialized: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/1.95k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


README.md:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/141k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/755 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/748 [00:00<?, ? examples/s]



Epoch 1/3


Eğitim:   0%|          | 0/38 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Eğitim: 100%|██████████| 38/38 [00:32<00:00,  1.15it/s, loss=3.76]
Doğrulama: 100%|██████████| 10/10 [00:02<00:00,  3.79it/s, loss=3.55]


Train Loss: 10.3929
Validation Loss: 3.6107
Epoch 2/3


Eğitim: 100%|██████████| 38/38 [00:32<00:00,  1.16it/s, loss=1.06]
Doğrulama: 100%|██████████| 10/10 [00:02<00:00,  3.57it/s, loss=0.278]


Train Loss: 1.9710
Validation Loss: 0.3088
Epoch 3/3


Eğitim: 100%|██████████| 38/38 [00:34<00:00,  1.12it/s, loss=0.872]
Doğrulama: 100%|██████████| 10/10 [00:02<00:00,  3.45it/s, loss=0.235]


Train Loss: 0.9306
Validation Loss: 0.2440
Orijinal Cümle: She go to school every day and like to learn new things.
Düzeltilmiş Cümle: She goes to school every day and likes to learn new things.


In [26]:
# 8. Model Testi
def test_sentence(sentence):
    input_text = f"grammar: {sentence}"
    input_ids = tokenizer(input_text, return_tensors="pt", max_length=128, truncation=True).input_ids.to(device)
    outputs = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

example_sentence = "exampl "
corrected_sentence = test_sentence(example_sentence)

print(f"Orijinal Cümle: {example_sentence}")
print(f"Düzeltilmiş Cümle: {corrected_sentence}")

Orijinal Cümle: exampl 
Düzeltilmiş Cümle: Exampln'tiews: exampln'tiews.


In [18]:
from nltk.translate.bleu_score import SmoothingFunction

def calculate_bleu(data):
    bleu_scores = []
    smoothie = SmoothingFunction().method4  # Düzgünleştirme fonksiyonu
    for item in data:
        input_sentence = item["input"].replace("grammar: ", "")
        target_sentence = item["target"]
        predicted_sentence = test_sentence(input_sentence)

        # BLEU hesaplama
        reference = [target_sentence.split()]
        candidate = predicted_sentence.split()
        bleu_score = sentence_bleu(reference, candidate, smoothing_function=smoothie)
        bleu_scores.append(bleu_score)

    return sum(bleu_scores) / len(bleu_scores)

bleu_score = calculate_bleu(data)
print(f"BLEU Skoru (Düzgünleştirilmiş): {bleu_score:.4f}")


BLEU Skoru (Düzgünleştirilmiş): 0.4024
