In [None]:
#Connecting with google drive
from google.colab import drive
drive.mount('/content/gdrive')

# Standard imports
import random
import torch
import pandas as pd
import numpy as np

# Setting seed value, we need this for recreateability purpose
RANDOM_SEED = 13

# Change the location where we will work
%cd gdrive/MyDrive/Interviews/Neo_humans

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive/Interviews/Neo_humans


In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m56.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m52.9 MB/s[0m eta [36m0:00:00[0m
Col

In [None]:
# Tokenizer: converting words to numbers that can be understood my machine

from transformers import GPT2Tokenizer

# Importing the tokenizer of GPT2
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Adding special tokens like start:
# 1. BOS-> start of sequence (first word)
# 2. EOS-> end of sequence (last word)
# 3. pad-> padding tokens that will help to give same context length for models

special_tokens_dict = {'bos_token': '<BOS>', 'eos_token': '<EOS>', 'pad_token': '<PAD>'}
num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [None]:
import torch
from torch.utils.data import Dataset

# https://pytorch.org/tutorials/beginner/basics/data_tutorial.html

# Dataset subclass deal with loading each datapoint, batch size, etc.
class poem_dataset(Dataset):
  def __init__(self,data,tokenizer,max_length=1024):
    self.input_ids=[]
    self.attention_mask=[]

    for text in data:
      encoding_dict=tokenizer('<BOS>' + text + '<EOS>',truncation=True,max_length=max_length,padding='max_length')
      self.input_ids.append(torch.tensor(encoding_dict["input_ids"]))
      self.attention_mask.append(torch.tensor(encoding_dict["attention_mask"]))

  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self,idx):
    return self.input_ids[idx], self.attention_mask[idx]


In [None]:
# Setting context length
MAX_LEN = 1024 #As per GPT2

# Loading data and making a dataset
df_poem_stanza=pd.read_csv("poetry/poem_stanza.csv")
poem_stanza_data=poem_dataset(df_poem_stanza["stanza"].values,tokenizer,max_length=MAX_LEN)

In [None]:
from torch.utils.data import random_split


# def train_val_split_dps(split, dataset):
#   train_size = int(split * len(dataset))
#   val_size = len(dataset) - train_size
#   return train_size, val_size


# poem_stanza_train_size, poem_stanza_val_size = train_val_split_dps(0.8, poem_stanza_data)

# random split imported from troch.utils
# We can use stratified split if we have more understanding on data
poem_stanza_train_data, poem_stanza_val_data = random_split(poem_stanza_data, [poem_stanza_train_size, poem_stanza_val_size])

In [None]:
# seeding everything
import random
torch.cuda.manual_seed_all(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7f12a43d10f0>

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
# Setting a smaller batchsize because of smaller GPU in free version
BATCH_SIZE = 2

# Initializing dataloader
poem_stanza_train_dataloader = DataLoader(poem_stanza_train_data,
                              sampler=RandomSampler(poem_stanza_train_data),
                              batch_size=BATCH_SIZE)

poem_stanza_val_dataloader = DataLoader(poem_stanza_val_data,
                            sampler=SequentialSampler(poem_stanza_val_data),
                            batch_size=BATCH_SIZE)

In [None]:
import datetime
def format_time(elapsed):
  return str(datetime.timedelta(seconds=int(round((elapsed)))))

# hyperparameters
learning_rate = 1e-4
eps = 1e-8
warmup_steps = 50

# setting device to load GPU--if available else CPU. I would suggest use GPU only.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
from transformers import GPT2Config, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup

# Loading basic configuration
configuration = GPT2Config(vocab_size=len(tokenizer), n_positions=MAX_LEN).from_pretrained('gpt2', output_hidden_states=True)

# Loading pretrained model
poem_stanza_model = GPT2LMHeadModel.from_pretrained('gpt2', config=configuration)
poem_stanza_model.resize_token_embeddings(len(tokenizer))

# Load data in model if possible
if torch.cuda.is_available():
  poem_stanza_model.cuda()

# Initializing model training tools like optimizer, schedular, etc.
optimizer = AdamW(poem_stanza_model.parameters(), lr=learning_rate, eps=eps)

EPOCHS = 8
total_steps = len(poem_stanza_train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warmup_steps,
                                            num_training_steps=total_steps)

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



In [None]:
!mkdir models

In [None]:
# import time

# start_time = time.time()
# poem_stanza_model.load_state_dict(torch.load("models/poem_stanza_model3.pth"),strict=False)
# poem_stanza_model = poem_stanza_model.to(device)

In [None]:
import time

# EPOCHS = 8
start_time = time.time()

# Load model on device that will be mostly GPU
poem_stanza_model = poem_stanza_model.to(device)

# Model training
for epoch_i in range(0, EPOCHS):

    print(f'Epoch {epoch_i + 1} of {EPOCHS}')

    t0 = time.time()
    total_train_loss = 0
    poem_stanza_model.train()

    for step, batch in enumerate(poem_stanza_train_dataloader):

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        poem_stanza_model.zero_grad()

        outputs = poem_stanza_model(b_input_ids,
                                    labels=b_labels,
                                    attention_mask=b_masks,
                                    token_type_ids=None)

        loss = outputs[0]

        batch_loss = loss.item()
        total_train_loss += batch_loss

        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(poem_stanza_train_dataloader)
    training_time = format_time(time.time() - t0)

    print(f'Average Training Loss: {avg_train_loss}. Epoch Training Time: {training_time}')

    t0 = time.time()

    poem_stanza_model.eval()

    total_eval_loss = 0
    nb_eval_steps = 0

    for batch in poem_stanza_val_dataloader:
        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        with torch.no_grad():

            outputs  = poem_stanza_model(b_input_ids,
                                         attention_mask=b_masks,
                                         labels=b_labels)

            loss = outputs[0]

        batch_loss = loss.item()
        total_eval_loss += batch_loss

    avg_val_loss = total_eval_loss / len(poem_stanza_val_dataloader)


    print(f'Average Validation Loss: {avg_val_loss}')
    torch.save(poem_stanza_model.state_dict(), "models/poem_stanza_model"+str(epoch_i)+".pth")

print(f'Total Training Time: {format_time(time.time()-start_time)}')

Epoch 1 of 8
Average Training Loss: 0.210162496516954. Epoch Training Time: 0:43:27
Average Validation Loss: 0.18575276780744523
Epoch 2 of 8


In [None]:
# For model infernece
poem_stanza_model.eval()


prompt = "retirement, a thing of beauty"
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(device)

# https://huggingface.co/transformers/v2.9.1/main_classes/model.html

sample_outputs = poem_stanza_model.generate(
                                generated,
                                do_sample=True,
                                top_k=50,
                                max_length=MAX_LEN,
                                top_p=0.96,
                                num_return_sequences=5,
                                )

for i, sample_output in enumerate(sample_outputs):
    print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: retirement, a thing of beauty
as you the very sky do o'er
what will ye leave behind
like a dream upon the dewy mountains
that o'er you you may float
in the dew or on the cloud--
you wander into a world you find
like a dream upon the dew
but i tell you that the ocean holds
in this room i never could see


1: retirement, a thing of beauty gone
and now to my own eyes is like a star
and now to my heart it seems
since i am old and dead
i love but you alone


2: retirement, a thing of beauty
and the heart was as a flower that is
dost thou know that i was dead
so shalt thou tell me why
you were in life but i were
and the heart the soul


3: retirement, a thing of beauty
which through the eyes of some was hidden
and where she shone at the ears of others
the cheeks and shoulders where she was not seen
which through their veils she did like an infant white
when some saw her coming
some felt her look as her in nature
but when she came one by one


4: retirement, a thing of beauty born of good 

In [None]:
prompt = "here we go my love"
# generated = torch.tensor().unsqueeze(0)
# generated = generated.to(device)

In [None]:
encoding_dict=tokenizer('<BOS>' + prompt + '<EOS>',truncation=True,max_length=max_length,padding='max_length')
encoded_prompt=torch.tensor(encoding_dict["input_ids"])
attention_prompt=torch.tensor(encoding_dict["attention_mask"])

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/IPython/core/interactiveshell.py", line 3326, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-33-90c8533325b8>", line 1, in <module>
    encoding_dict=tokenizer('<BOS>' + prompt + '<EOS>',truncation=True,max_length=max_length,padding='max_length')
NameError: name 'max_length' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/IPython/core/interactiveshell.py", line 2040, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'NameError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/IPython/core/ultratb.py", line 1101, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of

NameError: ignored