In [None]:
# Install the Hugging Face libraries used below (quiet mode).
# NOTE(review): versions are unpinned; consider `%pip install -q pkg==x.y.z`
# so the install targets the running kernel and the run is reproducible.
!pip install transformers -q
!pip install datasets -q

In [None]:
# Mount Google Drive at /content/drive (requires the Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Copy the poem CSV from the mounted Drive into the local /content directory.
!cp /content/drive/MyDrive/turkish_poems.csv /content/

In [None]:
import pandas as pd

def clean_text(text):
  """Flatten a poem into a single line of text.

  Periods and commas are removed, every line is stripped of surrounding
  whitespace, blank lines are dropped, and the remaining lines are joined
  with a single space. Joining with a space (rather than deleting the line
  break) keeps the last word of one line from fusing with the first word of
  the next.

  Args:
    text: The raw poem. Anything that is not a `str` (for example the NaN
      that pandas uses for an empty CSV cell) is treated as an empty poem.

  Returns:
    The cleaned single-line string; `''` for empty or non-string input.
  """
  if not isinstance(text, str):
    return ''

  text = text.replace('.', '').replace(',', '')

  # splitlines() handles \n, \r\n and the other Unicode line separators.
  stripped_lines = (line.strip() for line in text.splitlines())

  return ' '.join(line for line in stripped_lines if line)

# Read the CSV, clean the 'content' column, drop duplicate poems and keep
# only texts of at most 1024 characters. The cell displays how many remain.
data_path = '/content/turkish_poems.csv'
poems_frame = pd.read_csv(data_path)
unique_poems = poems_frame['content'].apply(clean_text).unique()
data = [poem for poem in unique_poems if len(poem) <= 1024]
len(data)

17334

In [None]:
# Positional 80/20 split: the first 80% of `data` trains, the rest tests.
# NOTE(review): nothing is shuffled here, so the split follows the order of
# the CSV rows - confirm that order is not grouped (e.g. by poet).
train_size = int(len(data) * 0.8)
train_data, test_data = data[:train_size], data[train_size:]
len(train_data), len(test_data)

(13867, 3467)

In [None]:
from transformers import DataCollatorForLanguageModeling, DataCollatorWithPadding, AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, AutoConfig
from datasets import Dataset

# Hub checkpoint shared by the tokenizer and the model.
# Note: `model` is created again in a later cell with an updated config.
pretrained_name = 'gorkemgoknar/gpt2-small-turkish'
tokenizer = AutoTokenizer.from_pretrained(pretrained_name)
model = AutoModelForCausalLM.from_pretrained(pretrained_name)

In [None]:
# Marker strings for the start, end and padding of each training text.
bos = '<|endoftext|>'
eos = '<|EOS|>'
pad = '<|pad|>'

# Register the markers with the tokenizer first, so the ids read below
# already reflect them.
special_tokens_dict = dict(eos_token=eos, bos_token=bos, pad_token=pad)
num_add_tokens = tokenizer.add_special_tokens(special_tokens_dict)

# Config values that tell the model which ids are BOS / EOS / PAD.
config_overrides = dict(
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    output_hidden_states=False,
)
config = AutoConfig.from_pretrained('gorkemgoknar/gpt2-small-turkish', **config_overrides)

# Reload the model with that config, then size the embedding matrix to the
# tokenizer's vocabulary (the recorded output shows 50259 rows).
model = AutoModelForCausalLM.from_pretrained('gorkemgoknar/gpt2-small-turkish', config=config)
model.resize_token_embeddings(len(tokenizer))

Embedding(50259, 768)

In [None]:
def prepare_data(data):
    """Wrap every text in the module-level `bos` / `eos` marker strings.

    Each item is converted with `str()` and turned into
    ``bos + ' ' + text + ' ' + eos``. Input order is preserved and a new
    list is returned; `data` itself is not modified.
    """
    return [' '.join((bos, str(text), eos)) for text in data]

In [None]:
# Add the BOS/EOS markers to both splits and show one sample from each.
# Caution: this rebinds the same names, so re-running the cell wraps the
# texts a second time.
train_data, test_data = prepare_data(train_data), prepare_data(test_data)
test_data[0], train_data[0]

('<|endoftext|> Yokluğun mehtapsız bir kış gecesindeKaranlık bir yolda yürümek kadar zor Karanlık yolda yürümeyi bilmiyorsan Mehtapsız bir kış gecesi olgel bana sor <|EOS|>',
 '<|endoftext|> Acı acı günlerimŞamatacı günlerimGüneş yüzü görmedi (vay) Hacı bacı günlerimDeğirmenin çarkı yokBu bahçenin parkı yokAli gider Veli gelir külhana (vay) Birbirinden farkı yokAcı acı günlerimGöz boyacı günlerimAhı vahınan geçti gittiHacı bacı günlerimBir ipte iki cambazZor olur oynayamazDili tatlı içi zehir güzelim (vay) Böyle kazan kaynamazAcı acı günlerimŞamatacı günlerimAhı vahınan geçti gittiHacı bacı günlerimMahzuni deli gönlümYine yollara düştüHacı vurdu bacı durdu acımadı alemeKaygu kullara düştüAcı acı günlerimŞamatacı günlerimAhı vahınan geçti gittiHacı bacı günlerim <|EOS|>')

In [None]:
# Turn each list of texts into a one-column DataFrame named 'content'.
train_data, test_data = pd.DataFrame(train_data), pd.DataFrame(test_data)
for frame in (train_data, test_data):
    frame.columns = ['content']

In [None]:
# Build one `Dataset` per split from the 'content' column and display both.
train_dataset, test_dataset = (
    Dataset.from_pandas(frame[['content']]) for frame in (train_data, test_data)
)
train_dataset, test_dataset

(Dataset({
     features: ['content'],
     num_rows: 13867
 }), Dataset({
     features: ['content'],
     num_rows: 3467
 }))

In [None]:
def tokenize_func(example, max_length = 1024):
    """Tokenize a batch of texts for causal-LM fine-tuning.

    No padding is applied here. Padding at map time would pad every text to
    the longest text of its whole map batch and store all that padding in the
    dataset; padding is instead left to the data collator handed to the
    Trainer, which works on one training batch at a time.

    Args:
        example: Mapping with a 'content' entry holding the text(s), as
            supplied by `Dataset.map`.
        max_length: Maximum number of tokens kept per text; longer texts are
            truncated. The default of 1024 equals the `n_positions` value in
            the model config printed in this notebook.

    Returns:
        Whatever the module-level `tokenizer` returns for the given texts.
    """
    return tokenizer(example['content'], truncation = True, max_length = max_length)

# Both splits are tokenized with the same settings: batched calls, five
# worker processes, and the raw 'content' column dropped from the result.
map_options = dict(batched = True, num_proc = 5, remove_columns = ['content'])

tokenized_train_dataset = train_dataset.map(tokenize_func, **map_options)
tokenized_test_dataset = test_dataset.map(tokenize_func, **map_options)

        

#0:   0%|          | 0/3 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/3 [00:00<?, ?ba/s]

#1:   0%|          | 0/3 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/3 [00:00<?, ?ba/s]

#4:   0%|          | 0/3 [00:00<?, ?ba/s]

          

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# One directory receives checkpoints, logs and the final exported model.
model_save_path = '/content/fine_tuned_model'

training_args = TrainingArguments(
    # Output locations (logging shares the checkpoint folder).
    output_dir=model_save_path,
    logging_dir=model_save_path,
    # Schedule and batch sizes.
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    # Optimiser settings.
    warmup_steps=200,
    weight_decay=0.01,
    # Evaluation reports the loss only; a checkpoint is written every 10000 steps.
    prediction_loss_only=True,
    save_steps=10000,
    # report_to = 'wandb' is left commented out.
)

# mlm=False: no masked-language-model masking is applied by the collator.
data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm = False)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_train_dataset,
    eval_dataset = tokenized_test_dataset,
    data_collator = data_collator,
)

In [None]:
import torch
import gc

# Release PyTorch's cached GPU memory, then run a garbage-collection pass
# before training starts. The cell displays gc.collect()'s return value.
torch.cuda.empty_cache()
gc.collect()

46

In [None]:
# Fine-tune the model with the Trainer configured in the earlier cell.
trainer.train()

***** Running training *****
  Num examples = 13867
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 17335
  Number of trainable parameters = 124441344
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,6.1157
1000,5.7168
1500,5.5344
2000,5.4396
2500,5.3456
3000,5.3006
3500,5.2833
4000,5.0413
4500,5.0353
5000,5.0263


Step,Training Loss
500,6.1157
1000,5.7168
1500,5.5344
2000,5.4396
2500,5.3456
3000,5.3006
3500,5.2833
4000,5.0413
4500,5.0353
5000,5.0263


Saving model checkpoint to /content/fine_tuned_model/checkpoint-10000
Configuration saved in /content/fine_tuned_model/checkpoint-10000/config.json
Configuration saved in /content/fine_tuned_model/checkpoint-10000/generation_config.json
Model weights saved in /content/fine_tuned_model/checkpoint-10000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=17335, training_loss=4.914100955493312, metrics={'train_runtime': 13016.5292, 'train_samples_per_second': 5.327, 'train_steps_per_second': 1.332, 'total_flos': 2.1384799763328e+16, 'train_loss': 4.914100955493312, 'epoch': 5.0})

In [None]:
# Save the final model; no path is passed, and the recorded log shows it is
# written to /content/fine_tuned_model. The tokenizer files (including the
# added special tokens) are saved into the same directory.
trainer.save_model()
tokenizer.save_pretrained(model_save_path)

Saving model checkpoint to /content/fine_tuned_model
Configuration saved in /content/fine_tuned_model/config.json
Configuration saved in /content/fine_tuned_model/generation_config.json
Model weights saved in /content/fine_tuned_model/pytorch_model.bin
tokenizer config file saved in /content/fine_tuned_model/tokenizer_config.json
Special tokens file saved in /content/fine_tuned_model/special_tokens_map.json


('/content/fine_tuned_model/tokenizer_config.json',
 '/content/fine_tuned_model/special_tokens_map.json',
 '/content/fine_tuned_model/vocab.json',
 '/content/fine_tuned_model/merges.txt',
 '/content/fine_tuned_model/added_tokens.json',
 '/content/fine_tuned_model/tokenizer.json')

In [None]:
# Evaluate on the eval_dataset given to the Trainer; the metrics dict is displayed.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 3467
  Batch size = 2


{'eval_loss': 5.125611782073975,
 'eval_runtime': 190.3152,
 'eval_samples_per_second': 18.217,
 'eval_steps_per_second': 9.111,
 'epoch': 5.0}

In [None]:
# Reload the fine-tuned model and its tokenizer from disk, so that this cell
# depends only on `model_save_path` and the saved files.
my_model = AutoModelForCausalLM.from_pretrained(model_save_path)
my_tokenizer = AutoTokenizer.from_pretrained(model_save_path)

# Prompt with the BOS token alone, so the poem is generated from scratch.
input_text = my_tokenizer.bos_token
input_ids = my_tokenizer.encode(input_text, return_tensors = 'pt')
output = my_model.generate(
    input_ids,
    min_length = 100,
    max_length = 250,
    num_beams = 5,
    do_sample = True,
    top_k = 100,
    top_p = 0.8,
    no_repeat_ngram_size = 2,
    temperature = 0.8,
)
# Decode with the tokenizer loaded next to the model. The training-session
# `tokenizer` used previously is not defined when only this cell's
# artefacts are loaded on a fresh kernel.
print(my_tokenizer.decode(output[0], skip_special_tokens = True))

loading configuration file /content/fine_tuned_model/config.json
Model config GPT2Config {
  "_name_or_path": "/content/fine_tuned_model",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "embd_pdrop": 0.1,
  "eos_token_id": 50257,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 50258,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "trans

In [None]:
# Zip the whole model directory and move the archive to Google Drive.
# NOTE(review): per the recorded output the directory also contains
# checkpoint-10000 (including optimizer.pt) and TensorBoard event files, so
# they are archived too - confirm that is intended.
!zip -r model.zip /content/fine_tuned_model/
!mv /content/model.zip /content/drive/MyDrive/

  adding: content/fine_tuned_model/ (stored 0%)
  adding: content/fine_tuned_model/added_tokens.json (deflated 23%)
  adding: content/fine_tuned_model/merges.txt (deflated 60%)
  adding: content/fine_tuned_model/events.out.tfevents.1674925567.394a7ff8e316.1793.0 (deflated 62%)
  adding: content/fine_tuned_model/vocab.json (deflated 62%)
  adding: content/fine_tuned_model/generation_config.json (deflated 28%)
  adding: content/fine_tuned_model/1674925567.1558592/ (stored 0%)
  adding: content/fine_tuned_model/1674925567.1558592/events.out.tfevents.1674925567.394a7ff8e316.1793.1 (deflated 63%)
  adding: content/fine_tuned_model/checkpoint-10000/ (stored 0%)
  adding: content/fine_tuned_model/checkpoint-10000/optimizer.pt (deflated 8%)
  adding: content/fine_tuned_model/checkpoint-10000/generation_config.json (deflated 28%)
  adding: content/fine_tuned_model/checkpoint-10000/trainer_state.json (deflated 75%)
  adding: content/fine_tuned_model/checkpoint-10000/training_args.bin (deflated 4