# ***Örnek-1***

In [1]:
import torch
import os
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer, DataCollatorWithPadding
from torch.utils.data import Dataset

In [2]:
# Check if a GPU is available and if not, use a CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

Using device: cpu


In [3]:
# Load pre-trained GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)

# Set padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


OSError: Can't load tokenizer for 'gpt2'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'gpt2' is the correct path to a directory containing all relevant files for a GPT2Tokenizer tokenizer.

In [None]:
class CustomDataset(Dataset):
    def __init__(self, tokenizer, data, block_size):
        self.tokenizer = tokenizer
        self.data = data
        self.block_size = block_size

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        tokenized_inputs = self.tokenizer(
            item["question"],
            item["answer"],
            truncation=True,
            padding="max_length",
            max_length=self.block_size,
            return_tensors="pt"
        )
        tokenized_inputs["labels"] = tokenized_inputs["input_ids"].clone()
        return tokenized_inputs

In [None]:
data = [
    {"question": "What is the capital of France?", "answer": "Paris"},
    {"question": "Who wrote 'Romeo and Juliet'?", "answer": "William Shakespeare"},
    # Diğer sorular ve cevaplar...
]

data = CustomDataset(tokenizer, data, 128)

# Create a data collator that will dynamically pad the sequences
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Training arguments and Trainer
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    num_train_epochs=2,
    learning_rate=2e-5,
    output_dir='./results'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data,
    eval_dataset=None,
    data_collator=data_collator,
)

In [None]:
trainer.train()


In [None]:
model.eval()


In [None]:
prompt_text = "Who is Igor Kolokov?"

# Tokenize the prompt text and convert to tensor
input_ids = tokenizer(prompt_text, return_tensors="pt").input_ids
attention_mask = tokenizer(prompt_text, return_tensors="pt").attention_mask

# Move input_ids and attention_mask tensor to GPU
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)



In [None]:
# Generate text from the model
output = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    pad_token_id=tokenizer.pad_token_id,
    max_length=100,
    num_beams=5,
    temperature=1.5,
    top_k=50,
    do_sample=True
)

# Decode the generated text back to string
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_text)

# ***Örnek-2***

In [None]:
!pip install datasets

In [None]:
import torch
from torch.utils.data import Dataset
from datasets import load_dataset
from transformers import GPT2Tokenizer
from torch.utils.data import DataLoader
from transformers import AdamW
import transformers
import torch.nn as nn
transformers.logging.set_verbosity_error()

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Using {device} device")

Using cpu device


In [None]:
train_dataset = load_dataset("squad")["train"].select(range(500))
test_dataset = load_dataset("squad")["validation"].select(range(100))


example = train_dataset[0]

print(" Title: ", example['title'])
print("\n Context: ", example['context'])
print("\n Question: ", example['question'])
print("\n Answers: ", example['answers'])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.83k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

 Title:  University_of_Notre_Dame

 Context:  Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.

 Question:  To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?

 Answers:  {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}


In [None]:
class SquadDataset(Dataset):
    def __init__(self, dataset, tokenizer):
        self.tokenizer = tokenizer
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        example = self.dataset[idx]
        context = example['context']
        question = example['question']
        answer = example['answers']['text'][0]

        encoding = self.tokenizer.encode_plus(
            question,
            context,
            add_special_tokens=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            padding='max_length',
            max_length=384,
            truncation=True
        )

        input_ids = encoding['input_ids']
        answer_start = example['answers']['answer_start'][0]
        answer_end = answer_start + len(answer)

        start_positions = []
        end_positions = []
        for i, token_id in enumerate(input_ids):
            if i == answer_start:
                start_positions.append(i)
            else:
                start_positions.append(-100)

            if i == answer_end:
                end_positions.append(i)
            else:
                end_positions.append(-100)

        inputs = {
            'input_ids': torch.tensor(encoding['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(encoding['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(encoding['token_type_ids'], dtype=torch.long),
            'start_positions': torch.tensor(start_positions, dtype=torch.float),  # start and end positions should be float
            'end_positions': torch.tensor(end_positions, dtype=torch.float)
        }

        return inputs, answer

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

train_dataloader = DataLoader(
    SquadDataset(train_dataset, tokenizer),
    batch_size=32,
    shuffle=True
)
test_dataloader = DataLoader(
    SquadDataset(test_dataset, tokenizer),
    batch_size=32,
    shuffle=False
)


In [None]:
from transformers import GPT2ForQuestionAnswering

model = GPT2ForQuestionAnswering.from_pretrained("gpt2").to(device)

print(model)


GPT2ForQuestionAnswering(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (qa_outputs): Linear(in_features=768, out_features=2, bias=True)
)


In [None]:
learning_rate = 5e-5
epochs = 2
optimizer = AdamW(model.parameters(), lr=learning_rate)




In [None]:
def train_loop(dataloader, model, optimizer):

    model.train()

    for batch in dataloader:
        optimizer.zero_grad()

        input_ids = batch[0]['input_ids'].to(device)
        attention_mask = batch[0]['attention_mask'].to(device)
        token_type_ids = batch[0]['token_type_ids'].to(device)
        start_positions = batch[0]['start_positions'].to(device)
        end_positions = batch[0]['end_positions'].to(device)

        labels = {
            'start_positions': start_positions,
            'end_positions': end_positions
        }

        outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        loss_start = nn.CrossEntropyLoss()(outputs.start_logits, start_positions)
        loss_end = nn.CrossEntropyLoss()(outputs.end_logits, end_positions)
        loss = (loss_start + loss_end) / 2  # average loss for start and end positions

        loss.backward()
        optimizer.step()

In [None]:
def test_loop(dataloader, model):
    model.eval()
    val_loss = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch[0]['input_ids'].to(device)
            attention_mask = batch[0]['attention_mask'].to(device)
            token_type_ids = batch[0]['token_type_ids'].to(device)
            start_positions = batch[0]['start_positions'].to(device)
            end_positions = batch[0]['end_positions'].to(device)

            labels = {
                'start_positions': start_positions,
                'end_positions': end_positions
            }

            outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

            loss_start = nn.CrossEntropyLoss()(outputs.start_logits, start_positions)
            loss_end = nn.CrossEntropyLoss()(outputs.end_logits, end_positions)
            loss = (loss_start + loss_end) / 2  # average loss for start and end positions

            val_loss += loss.item()

    print(f"Validation Loss: {val_loss/len(dataloader)}")

In [None]:
for t in range(epochs):
    print(f"Epoch {t+1}\n ---------------------------")
    train_loop(train_dataloader, model, optimizer)
    test_loop(test_dataloader, model)

print("Done!")

model.save_pretrained("fine_tuned_QA")

In [None]:
from transformers import GPT2ForQuestionAnswering

model_name = "fine_tuned_QA"
model = GPT2ForQuestionAnswering.from_pretrained(model_name).to(device)

In [None]:
question = "What is the capital of India?"
context = "India is a country located in South Asia. Its captial is New Delhi."

inputs = tokenizer.encode_plus(question, context, add_special_tokens=True, return_tensors="pt").to(device)

In [None]:
output = model(**inputs)
start_logits = output.start_logits.squeeze(-1).tolist()
end_logits = output.end_logits.squeeze(-1).tolist()

start_index = int(torch.argmax(torch.tensor(start_logits)))
end_index = int(torch.argmax(torch.tensor(end_logits)))

In [None]:
answer = tokenizer.decode(inputs["input_ids"][0][start_index:end_index+1])
print("Answer is: ", answer)

Answer is:  . Its captial is New Delhi.
