In [73]:
from datasets import load_dataset
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoConfig
from transformers import AutoModelForCausalLM
import torch
import evaluate
import re

torch.cuda.empty_cache()

In [74]:
def clean_text(text):
    text = text.replace("\n", " ")
    text = text.replace("?", ".")
    text = re.sub(r"[^A-Za-z0-9\s\.]", "", text)

    text = " ".join(text.split())

    return text

In [75]:
with open("AROG-ENG-SUBTITLE.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
    raw_text_c = clean_text(raw_text)
with open("Gora-ENG.txt", "r", encoding="utf-8") as f:
    raw_text_2 = f.read()
    raw_text_c_2 = clean_text(raw_text_2)
with open("yahsi_bati_eng_sub-utf.txt", "r", encoding="utf-8") as f:
    raw_text_3 = f.read()
    raw_text_c_3 = clean_text(raw_text_3)
raw_text_c = raw_text_c + raw_text_c_2 + raw_text_c_3
print(len(raw_text_c))

177826


In [81]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
raw_data = {"train": raw_text_c.split("."), "test": raw_text_c[130000:].split(".")}

add_spe_tokens = ["<|ANS|>"]

special_tokens_dict = {
    "eos_token": "<|EOS|>",
    "bos_token": "<|SOS|>",
    "pad_token": "<|PAD|>",
    "additional_special_tokens": add_spe_tokens,
}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

config = AutoConfig.from_pretrained(
    "gpt2",
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    output_hidden_states=False,
)
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", config=config)


# print(raw_data["train"])
def preprocess_function(examples, tokenizer=tokenizer):
    tokenize_data = []
    for example in examples:
        token = tokenizer(example)
        token["input_ids"].append(13)
        token["attention_mask"].append(1)
        tokenize_data.append(token)

    return tokenize_data


tokenize_data_train = preprocess_function(raw_data["train"])
tokenize_data_test = preprocess_function(raw_data["test"])

In [82]:
def convert_to_dict(data):
    return {key: [d[key] for d in data] for key in data[0]}


train_dict = convert_to_dict(tokenize_data_train)
test_dict = convert_to_dict(tokenize_data_test)

# 'Dataset' nesnelerini oluşturma
train_dataset = Dataset.from_dict(train_dict)
test_dataset = Dataset.from_dict(test_dict)

# 'DatasetDict' nesnesini oluşturma
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

# Dataset özniteliklerini yazdırma
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 7473
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1995
    })
})


In [83]:
def group_texts(examples, block_size=128):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result


lm_dataset = dataset.map(group_texts, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/7473 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1995 [00:00<?, ? examples/s]

In [84]:
print(lm_dataset)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 352
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 95
    })
})


In [85]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

In [86]:
training_args = TrainingArguments(
    output_dir="my_awesome_cmylmz_clm-model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=True,
    num_train_epochs=200.0,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,4.172878
2,No log,3.94915
3,No log,3.807129
4,No log,3.700149
5,No log,3.604308
6,No log,3.525833
7,No log,3.456946
8,No log,3.394712
9,No log,3.320366
10,No log,3.262115


TrainOutput(global_step=8800, training_loss=0.9143495806780728, metrics={'train_runtime': 2747.3651, 'train_samples_per_second': 25.625, 'train_steps_per_second': 3.203, 'total_flos': 4598739763200000.0, 'train_loss': 0.9143495806780728, 'epoch': 200.0})

In [87]:
import math

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 1.90


In [110]:
from transformers import pipeline

prompt = "Garavel"
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
generator = pipeline(
    "text-generation",
    model="baloglu321/my_awesome_cmylmz_clm-model",
    tokenizer=tokenizer,
)
text = generator(prompt)
print(text[0]["generated_text"])

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Garavel gets his wish granted. Youre all gonna die. Were not living anyway. What are you gonna do. Garavel give me his blessing. What are you gonna do. Throw away the stoneployees. Were not living anyway
