In [None]:
from google.colab import drive
drive.mount('/content/drive')

!pip install torch transformers accelerate tqdm

In [None]:
# Block 1: Data loading and tokenizer setup
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, GPT2LMHeadModel, AdamW, get_scheduler
from tqdm.auto import tqdm
import pandas as pd

# Load data
data_path1 = '/content/drive/MyDrive/AraProje/QuestionPrediction/training_data.csv'
data_path2 = '/content/drive/MyDrive/AraProje/QuestionPrediction/validation_data.csv'
data1 = pd.read_csv(data_path1)
data2 = pd.read_csv(data_path2)
data1 = data1.iloc[:2000]
data2 = data2.iloc[:500]

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained('ytu-ce-cosmos/turkish-gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Block 2: Data preparation function
def prepare_sequence(input_text, question_text):
    # For input: reverse the tokens and convert back to text
    input_tokens = tokenizer.encode(input_text, add_special_tokens=True)
    reversed_input_tokens = input_tokens[::-1]
    reversed_input_text = tokenizer.decode(reversed_input_tokens)

    # Tokenize both for model input
    input_encoding = tokenizer(reversed_input_text,
                             max_length=128,
                             padding='max_length',
                             truncation=True,
                             return_tensors='pt')

    return input_encoding, input_encoding

# Block 3: Prepare datasets
train_data = data1
val_data = data2

# Process training data
train_inputs = []
train_labels = []
train_masks = []

for _, row in tqdm(train_data.iterrows(), desc="Processing training data"):
    input_encoding, label_encoding = prepare_sequence(row['input'], row['label'])

    train_inputs.append(input_encoding['input_ids'].squeeze())
    train_labels.append(label_encoding['input_ids'].squeeze())
    train_masks.append(input_encoding['attention_mask'].squeeze())

# Convert to tensors
train_inputs = torch.stack(train_inputs)
train_labels = torch.stack(train_labels)
train_masks = torch.stack(train_masks)

# Process validation data similarly
val_inputs = []
val_labels = []
val_masks = []

for _, row in tqdm(val_data.iterrows(), desc="Processing validation data"):
    input_encoding, label_encoding = prepare_sequence(row['input'], row['label'])

    val_inputs.append(input_encoding['input_ids'].squeeze())
    val_labels.append(label_encoding['input_ids'].squeeze())
    val_masks.append(input_encoding['attention_mask'].squeeze())

val_inputs = torch.stack(val_inputs)
val_labels = torch.stack(val_labels)
val_masks = torch.stack(val_masks)

# Block 4: Create dataloaders
train_dataset = TensorDataset(train_inputs, train_labels, train_masks)
val_dataset = TensorDataset(val_inputs, val_labels, val_masks)

batch_size = 16
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)


In [None]:
model_path = '/content/drive/MyDrive/AraProje/PreviousTokenPrediction/TersGpt2Large'
model = GPT2LMHeadModel.from_pretrained(model_path, from_tf = True)
model.config.pad_token_id = tokenizer.pad_token_id
model.to('cuda')

In [None]:
# Optimizasyon ve scheduler tanımlama
optimizer = AdamW(model.parameters(), lr=1e-4)
num_training_steps = len(train_dataloader) * 2  #epoch
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=500, num_training_steps=num_training_steps)

# Loss fonksiyonu tanımlama
from torch.nn import CrossEntropyLoss
loss_fn = CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

In [None]:
from tqdm import tqdm

num_epochs = 3
best_val_loss = float('inf')
patience = 4
early_stop_counter = 0

for epoch in range(num_epochs):
    # ---- Eğitim döngüsü ----
    model.train()
    total_train_loss = 0
    train_loop = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}")

    for batch in train_loop:
        optimizer.zero_grad()

        input_ids = batch[0].to('cuda')
        labels = batch[1].to('cuda')
        attention_mask = batch[2].to('cuda')

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping
        optimizer.step()
        scheduler.step()

        total_train_loss += loss.item()
        train_loop.set_postfix(loss=loss.item())

    avg_train_loss = total_train_loss / len(train_dataloader)

    # ---- Validation döngüsü ----
    model.eval()
    total_val_loss = 0

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch[0].to('cuda')
            labels = batch[1].to('cuda')
            attention_mask = batch[2].to('cuda')

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(val_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")

    # Early stopping ve model checkpointing
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        early_stop_counter = 0
        model.save_pretrained('/content/drive/MyDrive/AraProje/QuestionPrediction/Test')
        tokenizer.save_pretrained('/content/drive/MyDrive/AraProje/QuestionPrediction/Test')
    else:
        early_stop_counter += 1
        if early_stop_counter >= patience:
            print("Early stopping triggered")
            break

print("Eğitim tamamlandı ve model kaydedildi.")

Epoch 1/3:   0%|          | 0/125 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Epoch 1/3: 100%|██████████| 125/125 [01:25<00:00,  1.46it/s, loss=1.1]


Epoch 1/3, Train Loss: 1.5323, Validation Loss: 1.4725


Epoch 2/3: 100%|██████████| 125/125 [01:24<00:00,  1.47it/s, loss=1.17]


Epoch 2/3, Train Loss: 1.2519, Validation Loss: 1.4942


Epoch 3/3: 100%|██████████| 125/125 [01:24<00:00,  1.47it/s, loss=1.02]


Epoch 3/3, Train Loss: 0.8907, Validation Loss: 1.5950
Eğitim tamamlandı ve model kaydedildi.


In [None]:
from transformers import AutoTokenizer, GPT2LMHeadModel

# Tokenizer'ı yükleme
tokenizer = AutoTokenizer.from_pretrained('/content/drive/MyDrive/AraProje/QuestionPrediction/Model2')

# Modeli yükleme
model = GPT2LMHeadModel.from_pretrained('/content/drive/MyDrive/AraProje/QuestionPrediction/Model2')
model.to('cuda')  # Eğer GPU kullanıyorsanız modeli CUDA'ya taşıyın

In [None]:
from transformers import pipeline
text = """Türkiyenin başkenti Ankara'dır"""
text = " Cevap: " + text
d = tokenizer.decode(tokenizer.encode(text)[::-1], skip_special_tokens = True)


text_generator = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    #top_k=40,               # En olası 50 token arasından seçim yap
    #top_p=0.90,              # Kümülatif olasılığı %90 olan tokenlar arasından seçim yap
    #temperature=0.5,        # Daha tutarlı sonuçlar için düşük sıcaklık değeri
    #repetition_penalty=2.0  # Tekrar eden kelimelere ceza uygula
)
r = text_generator(d, max_length=100,truncation=True)[0]['generated_text']
tokenizer.decode(tokenizer.encode(r)[::-1])

Device set to use cuda:0


"Soru: Hangi ülkenin başkent hangileridir? Cevap: Türkiyenin başkenti Ankara'dır"