In [16]:
import torch
from transformers import T5Tokenizer, AutoModelForCausalLM
from torch.utils.data import Dataset, DataLoader
import glob

tokenizer = T5Tokenizer.from_pretrained("rinna/japanese-gpt2-small")
tokenizer.do_lower_case = True  # due to some bug of tokenizer config loading

model = AutoModelForCausalLM.from_pretrained("rinna/japanese-gpt2-small")

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [17]:
novel_path = glob.glob("C:\\Python\\Pytorch\\Transformer related\\Fine tuning GPT 2\\GPT2 japanese\\*.txt")[0]
novel_path

'C:\\Python\\Pytorch\\Transformer related\\Fine tuning GPT 2\\GPT2 japanese\\色彩を持たない多崎つくると、彼の巡礼の年 (村上春樹).txt'

In [18]:
sentlist = []

with open(novel_path, "r", encoding="utf-8") as file:
    for sent in file.readlines():
        sent = sent.split("。")
        for i in sent:
            if len(i) > len('\n'):
                sentlist.append(i+"。")

sentlist[:10]

['大学二年生の七月から、翌年の一月にかけて、多崎たざきつくるはほとんど死ぬことだけを考えて生きていた。',
 'その間に二十歳の誕生日を迎えたが、その刻み目はとくに何の意味も持たなかった。',
 'それらの日々、自らの命を絶つことは彼にとって、何より自然で筋の通ったことに思えた。',
 'なぜそこで最後の一歩を踏み出さなかったのか、理由は今でもよくわからない。',
 'そのときなら生死を隔てる敷居をまたぐのは、生卵をひとつ呑のむより簡単なことだったのに。',
 'つくるが実際に自殺を試みなかったのはあるいは、死への想いがあまりにも純粋で強烈すぎて、それに見合う死の手段が、具体的な像を心中に結べなかったからかもしれない。',
 '具体性はそこではむしろ副次的な問題だった。',
 'もしそのとき手の届くところに死につながる扉があったなら、彼は迷わず押し開けていたはずだ。',
 '深く考えるまでもなく、いわば日常の続きとして。',
 'しかし幸か不幸か、そのような扉を手近な場所に見つけることが彼にはできなかった。']

In [19]:
len(sentlist)

7789

In [20]:
text_sample =sentlist[:10]
encoding = tokenizer(text_sample, truncation=True, max_length=1024, padding=True)
encoding


{'input_ids': [[9, 183, 259, 10967, 1050, 928, 7, 8158, 92, 7519, 7, 617, 1105, 40, 14596, 3546, 56, 3562, 10770, 229, 926, 18, 15942, 3320, 124, 8, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], [9, 17234, 11488, 25540, 5548, 12, 7, 65, 28547, 303, 11, 9549, 1059, 3717, 30, 1535, 40, 706, 8, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], [9, 4274, 7668, 7, 3081, 14996, 2348, 192, 1450, 2189, 1522, 7, 1059, 94, 1129, 19, 1722, 10, 15858, 2270, 2988, 5094, 8, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], [9, 11133, 3641, 1240, 16465, 18, 8652, 353, 14741, 1974, 7, 8019, 17920, 1600, 17619, 8, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], [9, 18550, 1598, 196, 837, 18, 8399, 8467, 3980, 1197, 18, 240, 1477, 10, 11, 7, 196, 2471, 18, 8818, 21944, 10, 561, 94, 14695, 14717, 10, 17, 8, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], [9, 3546, 3996, 2473, 5638, 18, 4332, 706, 10, 11, 615, 7, 837, 105, 13078, 12, 18803, 16306, 19, 

In [21]:
decoding = []
for i in encoding["input_ids"]:
    a = tokenizer.decode(i)
    decoding.append(a)
decoding

['大学二年生の七月から、翌年の一月にかけて、多崎たざきつくるはほとんど死ぬことだけを考えて生きていた。</s> [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 'その間に二十歳の誕生日を迎えたが、その刻み目はとくに何の意味も持たなかった。</s> [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 'それらの日々、自らの命を絶つことは彼にとって、何より自然で筋の通ったことに思えた。</s> [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 'なぜそこで最後の一歩を踏み出さなかったのか、理由は今でもよくわからない。</s> [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 'そのときなら生死を隔てる敷居をまたぐのは、生卵をひとつ呑のむより簡単なことだったのに。</s> [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 'つくるが実際に自殺を試みなかったのはあるいは、死への想いがあまりにも純粋で強烈すぎて、それに見合う死の手段が、具体的な像を心中に結べなかったからかもしれない。</s>',
 '具体性はそこではむしろ副次的な問題だった。</s> [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [

In [22]:
text_sample2 = "具体性はそこではむしろ副次的な問題だった。"
tokens = tokenizer.tokenize(text_sample2)
tokens

['▁', '具', '体', '性', 'は', 'そこで', 'は', 'むしろ', '副', '次', '的な', '問題', 'だった', '。']

In [23]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(sentlist, train_size=0.9)

print(len(train_set))
print(len(test_set))

7010
779


In [24]:
class HarukiDataset(Dataset):
    def __init__(self, sents):
        self.sents = sents
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []

        encoding = tokenizer(sents, truncation=True, max_length=1024, padding=True)
        self.input_ids = torch.tensor(encoding["input_ids"])
        self.attn_masks = torch.tensor(encoding['attention_mask'])

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]
            


In [25]:
train_dataset = HarukiDataset(train_set)
test_dataset = HarukiDataset(test_set)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [26]:
len(train_dataset)

7010

In [27]:
for i, embed in enumerate(train_loader):
    if i ==0:
        print(embed[0])


tensor([[14287,   964,    11,  ...,     3,     3,     3],
        [  235,    65,  1665,  ...,     3,     3,     3],
        [  602, 14626, 13314,  ...,     3,     3,     3],
        ...,
        [  235,   220,  6884,  ...,     3,     3,     3],
        [    9, 26275, 10685,  ...,     3,     3,     3],
        [    9, 28396,  1450,  ...,     3,     3,     3]])


In [28]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

epochs = 10
learning_rate = 3e-5
warmup_steps = 500

In [29]:
def train(model, gpt2_type="gpt2"):
    model = model.to(device)
    model.train()
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=-1)

    loss = 0

    for epoch in range(epochs):

        for i, input in enumerate(train_loader):
            input_tensor = input[0].to(device)
            labels = input[0].to(device)            
            masks = input[1].to(device)
            output = model(input_tensor, labels=labels, attention_mask = masks)
            loss = output[0]
            loss.backward()
            loss_value = loss.item()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            model.zero_grad()
            
            if (i+1) % 200 == 0:
                print("epoch: {}, step: {}, loss: {}".format(epoch+1, i+1, loss_value))
    
    torch.save(model.state_dict(), "C:\\Python\\Pytorch\\Transformer related\\Fine tuning GPT 2\\GPT2 japanese\\weights\\weight1.pt")

    return model

In [30]:
train(model)

RuntimeError: CUDA out of memory. Tried to allocate 94.00 MiB (GPU 0; 8.00 GiB total capacity; 7.25 GiB already allocated; 0 bytes free; 7.30 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF