In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset 
import random
import time
import datetime
import random
from transformers import GPT2LMHeadModel, GPT2Config
import numpy as np
from torch.utils.data import random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [2]:
import requests

master = "https://raw.githubusercontent.com/mcelikkaya/medium_articles/main/japan_wiki.txt"
req = requests.get(master)
req = req.text

In [3]:
all_sentences = req.split("\n")
all_sentences = [s.replace("\r","") for s in all_sentences]

In [4]:
print("sample size : ",len(all_sentences))
print("samples     : " )
all_sentences[0:10]

sample size :  40389
samples     : 


['Hokkaido was formerly known as Ezo  Yezo  Yeso  or Yesso.',
 'According to Matsuura  the name was thought up because the Ainu called the region Kai.',
 'In contrast to the island of Honshu  Hokkaido saw an absence of conflict during this time period.',
 'From the Middle Ages  the people in Hokkaido began to be called Ezo.',
 'Hokkaido subsequently became known as Ezochi  蝦夷地  lit.',
 'The disputes eventually developed into war.',
 'Takeda Nobuhiro killed the Ainu leader  Koshamain  and defeated the opposition in 1457.',
 'The Matsumae family s economy relied upon trade with the Ainu.',
 'They held authority over the south of Ezochi until the end of the Edo period.',
 'There were numerous revolts by the Ainu against the feudal rule.']

In [5]:
from transformers import GPT2Tokenizer
#get pretrained tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='', eos_token='', pad_token='')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
#tokenizer some samples
print( tokenizer.encode("Japan Tokyo") )
print( tokenizer.encode("Japan") )
print( tokenizer.encode("japan tokyo") )
print( tokenizer.encode("japan") )
print( tokenizer.encode("tokyo") )

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[16504, 11790]
[16504]
[73, 2674, 284, 2584, 78]
[73, 2674]
[83, 482, 8226]


In [6]:
max_len = max([len(tokenizer.encode(s)) for s in all_sentences])

max_len

85

In [7]:
#since we will be feeding with sentences from wikipedia
#we can mark beginning and end of sentences with with sos and eos
def tokenize_seq(sent,tokenizer,max_length):
  return tokenizer(''+ sent + '', truncation=True, max_length=max_length, padding="max_length")

class JapanDataset(Dataset):

  def __init__(self, sentences, tokenizer, gpt2_type="gpt2", max_length=max_len):

    self.tokenizer = tokenizer 
    self.input_ids = []
    self.attn_masks = []

    for sentence in sentences:      
      encodings = tokenize_seq(sentence,tokenizer,max_length)
            
      self.input_ids.append(torch.tensor(encodings['input_ids']))
      self.attn_masks.append(torch.tensor(encodings['attention_mask']))
    
  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self, idx):
    return self.input_ids[idx], self.attn_masks[idx]   

def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed))))) 

In [8]:
import gc
gc.collect() 

4

In [9]:
#create an instance of Dataset
dataset = JapanDataset(all_sentences, tokenizer, max_length=max_len)

# Split into training and validation sets
train_size = int(0.9*len(dataset))
val_size = len(dataset) - train_size

train_set, val_set = random_split(dataset, [train_size, val_size])
print("train_size :",train_size)
print("val_size   :",val_size)
gc.collect() 

train_size : 36350
val_size   : 4039


0

In [10]:
#lets check a sample from dataset 
#50257 beginning of sentence token
#50258 end of sentence token
#50259 pad token
dataset[0]

(tensor([   39,   482,    74, 44354,   373, 15734,  1900,   355,   412, 10872,
           220,   575,  8471,    78,   220,  3363,    78,   220,   393,   575,
           408,    78,    13, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))

In [11]:
#define dataloaders
train_dataloader = DataLoader(train_set, sampler=RandomSampler(train_set), batch_size=16)
validation_dataloader = DataLoader(val_set, sampler = SequentialSampler(val_set), batch_size = 16)

In [12]:
# Create default config
configuration = GPT2Config.from_pretrained("gpt2", output_hidden_states=False)
# Load pretrained gpt2
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)
model.resize_token_embeddings(len(tokenizer))

# Create device
device = torch.device("cuda")
model.cuda()


optimizer = torch.optim.Adam(model.parameters(),lr = 0.0005)
model = model.to(device)

In [13]:
#at every step i want to check if generations are getting better.
def eval_keywords(keywords):
    model.eval()
    for keyword in keywords:
        input_seq = " " + keyword
        generated = torch.tensor(tokenizer.encode(input_seq)).unsqueeze(0)
        generated = generated.to(device)
        sample_outputs = model.generate(generated, do_sample=True, top_k=30, max_length=50, top_p=0.9, num_return_sequences=2)

        for i, sample_output in enumerate(sample_outputs):
            print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

keywords = ["Osaka","Japan","Kyoto","Yokohama","Kanto","Nikko","Japan has","Tokyo is the","Osaka is the","Kyoto is the"]

In [14]:
#call model with a batch of input
def process_one_batch(batch):
    b_input_ids = batch[0].to(device)
    b_labels = batch[0].to(device)
    b_masks = batch[1].to(device)
    outputs = model(b_input_ids, attention_mask = b_masks, labels=b_labels)
    return outputs

#do one epoch for training
def train_epoch():
    t0 = time.time()
    total_train_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):

        model.zero_grad()
        outputs = process_one_batch(batch)
        loss = outputs[0]
        batch_loss = loss.item()
        total_train_loss+=batch_loss

        loss.backward()
        optimizer.step()

    avg_train_loss = total_train_loss / len(train_dataloader)  
    print("avg_train_loss",avg_train_loss)  
    elapsed_time = format_time(time.time() - t0)
    print("elapsed time for 1 training epoch : ",elapsed_time)

#do one epoch for eval
def eval_epoch():
    t0 = time.time()
    total_eval_loss = 0
    nb_eval_steps = 0
    # Evaluate data for one epoch
    for batch in validation_dataloader:

        with torch.no_grad():
            outputs = process_one_batch( batch)
            loss = outputs[0]              
            batch_loss = loss.item()
            total_eval_loss += batch_loss         

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    print("avg_val_loss",avg_val_loss) 
    elapsed_time = format_time(time.time() - t0)
    print("elapsed time for 1 eval epoch : ",elapsed_time)

In [15]:
#train eval 1 cycle
#then create sample sentences
train_epoch()
eval_epoch()
eval_keywords( keywords )

avg_train_loss 0.8565802537954189
elapsed time for 1 training epoch :  0:08:15


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


avg_val_loss 0.7495614122967476
elapsed time for 1 eval epoch :  0:00:17


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  Osakayo has a population of around 1.12 million  2018.
1:  Osaka has been ranked 9th in the world in quality of food  food  and waste water.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  Japan has the most female male births with over 1.1 billion babies.
1:  Japan is also an important trading partner of China.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  Kyoto       Tokyo has two major highways  a one way traffic and a three way traffic lane.
1:  Kyoto  Japan is a major exporter of food  fish  vegetables  and meat.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  Yokohama is the capital of the prefecture.
1:  Yokohama was also a major transportation port for Japan.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  Kanto and Ōsaka were the three largest cities of Japan at 1 479 km.
1:  Kanto  匈  the center of the koto dance  is one of the most popular  and has become popular in Japan.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  Nikko was born on April 12  1943  in Tokyo.
1:  Nikko      In a referendum on 23 December 1991  the President was replaced by a constitutional monarch.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  Japan has the second highest average household wealth in the world at  0.5.
1:  Japan has a low cost of living  with over 11 000 homes and more than 2 000 shops.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  Tokyo is the largest exporter of chicken meat in the world.
1:  Tokyo is the largest exporter of sake in the world.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  Osaka is the second largest and most famous of these sports.
1:  Osaka is the capital of the prefecture.
0:  Kyoto is the second largest manufacturing area of any country in the world.
1:  Kyoto is the country s largest importer of rice  with over 4 000 exports.
