In [None]:
# All credit goes to Andrej Karpathy. This is just a small adaptation for Turkish Ekşi Sözlük entries.
# Please check out his repo - https://github.com/karpathy/minGPT

In [None]:
# You need to download the Ekşi Sözlük dataset prepared by Hasan Ocak:
# https://www.kaggle.com/ocakhsn/eksi-sozluk-entries/version/3

In [None]:
!unzip *.zip 

Archive:  803190_1394039_bundle_archive.zip
  inflating: real.csv                


In [None]:
import pandas

In [None]:
# Load only the column at position 3 of the extracted CSV (the entry text).
df = pandas.read_csv(
    'real.csv',
    usecols=[3],
    skipinitialspace=True,
)

In [None]:
# Put every entry on a single line by deleting embedded newline characters.
# `df` is rebound to the cleaned Series because the following cells read it under
# that name. The isinstance guard keeps the cell re-runnable: on a second run
# `df` is already a Series, and `.iloc[:, 0]` would fail on a 1-D object.
entries = df.iloc[:, 0] if isinstance(df, pandas.DataFrame) else df
df = entries.str.replace("\n", "", regex=False)

In [None]:
# Preview the first few cleaned entries.
df.head()

0    hala kimsenin entry girmediği başlık. daha ne ...
1                           şu an 1$=6,87 tl olan kur.
2                     (bkz:yatırım tavsiyesi değildir)
3    sabah 10-11 gibi bi atak yapmasını beklediğim ...
4    berat albayrak yumruk atmadığı için 7 tl'ye do...
Name: text, dtype: object

In [None]:
# Export the cleaned entries, one per row, as the plain-text training corpus.
# mode='w' replaces the previous append mode so the cell is idempotent:
# re-running it no longer duplicates the whole corpus inside eksi.txt.
df.to_csv('eksi.txt', header=False, index=False, mode='w', encoding='utf-8')

In [None]:
!git clone https://github.com/karpathy/minGPT.git

Cloning into 'minGPT'...
remote: Enumerating objects: 21, done.[K
remote: Total 21 (delta 0), reused 0 (delta 0), pack-reused 21[K
Unpacking objects: 100% (21/21), done.


In [None]:
%cd minGPT/

/content/minGPT


## Train a character-level GPT on some text data

The inputs here are simple text files, which we chop up into individual characters and then train GPT on. So you could say this is a char-transformer instead of a char-rnn. Doesn't quite roll off the tongue as well. In this example we will feed it Turkish Ekşi Sözlük entries (the original minGPT demo used Shakespeare), which we'll get it to predict at the character level.

In [None]:
# set up logging
# Configure the root logger so INFO-level messages show up in the notebook output.
import logging
logging.basicConfig(
        # layout: timestamp - level - logger name - message
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",  # month/day/year hour:minute:second
        level=logging.INFO,
)

In [None]:
# make deterministic
# Seed the random number generators before any data sampling or model creation.
# NOTE(review): set_seed is a minGPT helper; which generators it seeds is not
# visible from this notebook - confirm against mingpt/utils.py.
from mingpt.utils import set_seed
set_seed(42)

In [None]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F

In [None]:
import math
from torch.utils.data import Dataset

class CharDataset(Dataset):
    """Character-level dataset that serves random fixed-length windows of a string.

    Every item is a pair ``(x, y)`` of ``torch.long`` tensors of length
    ``block_size``; ``y`` is ``x`` shifted one character to the right, i.e. the
    next-character target for every position of ``x``.

    Args:
        data: the full training text as a single ``str``.
        block_size: number of characters in each input window.

    Attributes set in ``__init__``: ``stoi`` (char -> id), ``itos`` (id -> char),
    ``vocab_size``, ``block_size`` and ``data``.
    """

    def __init__(self, data, block_size):
        # Sort the vocabulary. Iterating a bare set of strings yields an order
        # that can differ between interpreter sessions, which would silently
        # change the char <-> id mapping and make a saved model unusable after
        # a kernel restart.
        chars = sorted(set(data))
        data_size, vocab_size = len(data), len(chars)
        print('data has %d characters, %d unique.' % (data_size, vocab_size))

        self.stoi = {ch: i for i, ch in enumerate(chars)}
        self.itos = {i: ch for i, ch in enumerate(chars)}
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data

    def __len__(self):
        """Return the nominal number of items per epoch.

        This is the number of non-overlapping chunks of ``block_size + 1``
        characters that cover the text, rounded up.
        """
        return math.ceil(len(self.data) / (self.block_size + 1))

    def __getitem__(self, idx):
        """Return a randomly positioned ``(x, y)`` training pair.

        ``idx`` is ignored on purpose: instead of mapping indices to fixed
        chunks, every call draws a fresh start position.

        Raises:
            ValueError: if the text is shorter than ``block_size + 1`` characters.
        """
        # Number of valid start positions for a window of block_size + 1 chars.
        n_starts = len(self.data) - self.block_size
        if n_starts <= 0:
            raise ValueError(
                'data has %d characters but at least %d are required for block_size=%d'
                % (len(self.data), self.block_size + 1, self.block_size)
            )
        # Draw from torch's generator rather than NumPy's global one: DataLoader
        # worker processes each receive their own torch seed, whereas NumPy's
        # global state can be duplicated across workers and produce identical
        # samples. The exclusive upper bound n_starts also makes the final
        # window reachable.
        i = int(torch.randint(n_starts, (1,)).item())
        chunk = self.data[i:i + self.block_size + 1]
        dix = [self.stoi[s] for s in chunk]
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        return x, y


In [None]:
# Context length in characters; passed to CharDataset, whose block_size attribute
# is in turn handed to GPTConfig when the model is built.
block_size = 128 # spatial extent of the model for its context

In [None]:
# Read the Ekşi Sözlük corpus exported earlier in this notebook.
# The encoding is stated explicitly because the text contains Turkish characters
# and the platform default is not guaranteed to be UTF-8; the context manager
# closes the file handle as soon as the text has been read.
with open('/content/eksi.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read().rstrip()
train_dataset = CharDataset(text, block_size) # serves random windows of block_size + 1 characters


data has 2641167 characters, 94 unique.


In [None]:
# Build the GPT model; vocabulary size and context length are taken from the dataset.
from mingpt.model import GPT, GPTConfig
mconf = GPTConfig(train_dataset.vocab_size, train_dataset.block_size,
                  n_layer=32, n_head=32, n_embd=512)  # logged below as ~1.01e8 parameters
model = GPT(mconf)

08/21/2020 06:10:11 - INFO - mingpt.model -   number of parameters: 1.010391e+08


In [None]:
from mingpt.trainer import Trainer, TrainerConfig

# initialize a trainer instance and kick off training
# The epoch count is named once and reused: final_tokens was previously
# hard-coded for 200 epochs while max_epochs was 50, so the two settings
# disagreed about how long the run is.
# NOTE(review): assumes final_tokens marks the end of the learning-rate decay
# schedule in minGPT's TrainerConfig - confirm against mingpt/trainer.py.
max_epochs = 50
tokens_per_epoch = len(train_dataset) * block_size
tconf = TrainerConfig(max_epochs=max_epochs, batch_size=32, learning_rate=6e-4,
                      lr_decay=True, warmup_tokens=512*20,
                      final_tokens=max_epochs*tokens_per_epoch,
                      num_workers=4)
# The third positional argument is None - presumably "no test dataset"; confirm.
trainer = Trainer(model, train_dataset, None, tconf)
trainer.train()

epoch 1 iter 639: train loss 2.20949. lr 5.999633e-04: 100%|██████████| 640/640 [07:33<00:00,  1.41it/s]
epoch 2 iter 639: train loss 1.87224. lr 5.998525e-04: 100%|██████████| 640/640 [07:33<00:00,  1.41it/s]
epoch 3 iter 639: train loss 1.68608. lr 5.996678e-04: 100%|██████████| 640/640 [07:34<00:00,  1.41it/s]
epoch 4 iter 639: train loss 1.56334. lr 5.994092e-04: 100%|██████████| 640/640 [07:35<00:00,  1.41it/s]
epoch 5 iter 639: train loss 1.45197. lr 5.990766e-04: 100%|██████████| 640/640 [07:34<00:00,  1.41it/s]
epoch 6 iter 639: train loss 1.34232. lr 5.986703e-04: 100%|██████████| 640/640 [07:34<00:00,  1.41it/s]
epoch 7 iter 639: train loss 1.22549. lr 5.981902e-04: 100%|██████████| 640/640 [07:34<00:00,  1.41it/s]
epoch 8 iter 639: train loss 1.09058. lr 5.976366e-04: 100%|██████████| 640/640 [07:35<00:00,  1.40it/s]
epoch 9 iter 639: train loss 0.92832. lr 5.970096e-04: 100%|██████████| 640/640 [07:35<00:00,  1.41it/s]
epoch 10 iter 639: train loss 0.78630. lr 5.963092e-04:

KeyboardInterrupt: ignored

In [None]:
import joblib
# Serialise the model object, wrapped in a dict under the 'model' key, to
# gpt_model.joblib in the current working directory.
# NOTE(review): this stores the whole object rather than a state_dict, so loading
# it presumably requires the mingpt package to be importable - confirm.
joblib.dump({'model' : model}, 'gpt_model.joblib', compress=0, protocol=None, cache_size=None)

['gpt_model.joblib']

In [None]:
# Generate 100 more characters after a short prompt and print the result.
from mingpt.utils import sample

context = """sus pus"""
# Encode the prompt with the dataset's char -> id table and add a batch dimension.
prompt_ids = [train_dataset.stoi[ch] for ch in context]
x = torch.tensor(prompt_ids, dtype=torch.long).unsqueeze(0).to(trainer.device)
y = sample(model, x, 100, temperature=0.9, sample=True, top_k=5)[0]
# Decode the generated ids back into text.
completion = ''.join(train_dataset.itos[int(token)] for token in y)
print(completion)

sus pus olduğunun salamasına göre onlara olan olabilir fak ve sosyal başkalarıla 10-5 senedir lüsana kadar 


In [None]:
# Mount Google Drive at /content/drive (only available inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Reload the saved checkpoint dict from Google Drive.
# NOTE(review): joblib.load unpickles arbitrary objects - only load files you trust.
# NOTE(review): the joblib.dump cell writes gpt_model.joblib to the working
# directory, so this Drive path presumably refers to a manually copied file - confirm.
model_dict = joblib.load('/content/drive/My Drive/models/gpt_model.joblib')

In [None]:
# Take the model back out of the loaded checkpoint dict (stored under 'model').
# The previous code indexed an undefined name `a`, which raised NameError.
model = model_dict['model']