# Text Generation using GPT (Using Huggingface)

## Project Setup

In [None]:
!pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import torch
import shutil
from torch.utils.data import Dataset, random_split
from transformers import Trainer, TrainingArguments, GPTNeoForCausalLM, GPT2Tokenizer
import csv
import re
from google.colab import drive


In [None]:
!wget https://raw.githubusercontent.com/casonshep/NLP_Module_TaylorSwift_Lyrics/main/TSwift_Discography.csv

--2023-05-08 08:01:11--  https://raw.githubusercontent.com/casonshep/NLP_Module_TaylorSwift_Lyrics/main/TSwift_Discography.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 409751 (400K) [text/plain]
Saving to: ‘TSwift_Discography.csv’


2023-05-08 08:01:11 (79.8 MB/s) - ‘TSwift_Discography.csv’ saved [409751/409751]



## Data Preparation

In [None]:
# Connects colab to google drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
file_path = "TSwift_Discography.csv"

lyrics = []
remove_empty = ["[", "]", "Pre-Chorus", "Verse", "1", "2", "3", "Bridge", "Outro", "Chorus", "(", ")", 
                "\"", ":", "&", "Taylor", "Swift", "\'"]

with open(file_path, newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',')
    for row in spamreader:
      for const in remove_empty:
        row[2] = row[2].replace(const, "");
      row[2] = row[2].replace("\n", " ");
      res = re.sub(' +', ' ', row[2])
      lyrics.append(res)

sonnets = lyrics
sonnets.pop(52) #song skew results with repeated I's

datas = sonnets[2:-1]
for data in datas:
  if len(data)<1:
    datas.remove(data)
print(len(datas))

print(sonnets[52])


208
Its AM in your car Windows down, you pass my street, the memories start You say its in the past, you drive straight ahead Youre thinking that I hate you now Cause you still dont know what I never said I wish you would come back Wish I never hung up the phone like I did, I Wish you knew that Id never forget you as long as I live, and I Wish you were right here, right now, its all good I wish you would Its AM in my room Headlights pass the window pane I think of you Were a crooked love in a straight line down Makes you want to run and hide But it makes you turn right back around I wish you would come back Wish I never hung up the phone like I did, I Wish you knew that Id never forget you as long as I live, and I Wish you were right here, right now, its all good I wish you would I wish we could go back And remember what we were fighting for, and I Wish you knew that I miss you too much to be mad anymore, and I Wish you were right here, right now, its all good I wish you would Post- I,

In [None]:
# Custome dataset class to load dataset
class ShakespeareDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            # Encode the descriptions using the GPT-Neo tokenizer
            encodings_dict = tokenizer('<|startoftext|>' 
                                        + txt +    
                                        '<|endoftext|>',
                                        truncation=True,
                                        max_length=max_length, 
                                            padding="max_length")
            input_ids = torch.tensor(encodings_dict['input_ids'])    
            self.input_ids.append(input_ids)
            mask = torch.tensor(encodings_dict['attention_mask'])
            self.attn_masks.append(mask)
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

## Initialize tokenizer, model

In [None]:
# Set the random seed to a fixed value to get reproducible results 
torch.manual_seed(42)

# Download the pre-trained GPT-Neo model's tokenizer
# Add the custom tokens denoting the beginning and the end 
# of the sequence and a special token for padding
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-125M",    
                            bos_token='<|startoftext|>',
                            eos_token='<|endoftext|>',
                            pad_token='<|pad|>')

# Download the pre-trained GPT-Neo model and transfer it to the GPU
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M").cuda()

# Resize the token embeddings because we've just added 3 new tokens 
model.resize_token_embeddings(len(tokenizer))

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/560 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading pytorch_model.bin:   0%|          | 0.00/526M [00:00<?, ?B/s]

Embedding(50259, 768)

## Train/Test Split data

In [None]:
max_length = max([len(tokenizer.encode(sonnet)) for sonnet in datas])

# Load dataset
dataset = ShakespeareDataset(sonnets, tokenizer, max_length)

# Split data into train/val
train_size = int(0.85 * len(dataset))

train_data, val_data = random_split(dataset, [train_size, len(dataset) - train_size])

print(max_length)
print(sonnets[52])

1116
Its AM in your car Windows down, you pass my street, the memories start You say its in the past, you drive straight ahead Youre thinking that I hate you now Cause you still dont know what I never said I wish you would come back Wish I never hung up the phone like I did, I Wish you knew that Id never forget you as long as I live, and I Wish you were right here, right now, its all good I wish you would Its AM in my room Headlights pass the window pane I think of you Were a crooked love in a straight line down Makes you want to run and hide But it makes you turn right back around I wish you would come back Wish I never hung up the phone like I did, I Wish you knew that Id never forget you as long as I live, and I Wish you were right here, right now, its all good I wish you would I wish we could go back And remember what we were fighting for, and I Wish you knew that I miss you too much to be mad anymore, and I Wish you were right here, right now, its all good I wish you would Post- I

In [None]:
tokenizer.batch_decode(val_data[10])

['<|startoftext|> Midnight You come and pick me up, no headlights A long drive Could end in burning flames or paradise Fade into view, oh Its been a while since I have even heard from you Heard from you And I should just tell you to leave Cause I know exactly where it leads But I watch us go round and round each time You got that James Dean daydream look in your eye And I got that red lip classic thing that you like And when we go crashing down, we come back every time Cause we never go out of style, we never go out of style You got that long hair, slicked back, white t-shirt And I got that good girl faith and a tight little skirt And when we go crashing down, we come back every time Cause we never go out of style, we never go out of style So it goes He cant keep his wild eyes on the road Mmm Takes me home The lights are off, hes taking off his coat Mmm, yeah I say, I heard, oh That youve been out and about with some other girl Some other girl He says, What you heard is true But I cant

## Train Model

In [None]:
# Here I will pass the output directory where 
# the model predictions and checkpoints will be stored, 
# batch sizes for the training and validation steps, 
# and warmup_steps to gradually increase the learning rate
# learning_rates = [5e-5, 3e-5, 1e-5]

# for learning_rate in learning_rates:

#     training_args = TrainingArguments(output_dir=f'./results_{learning_rate}',
#                                       num_train_epochs=5,
#                                       logging_steps=1000,
#                                       save_steps=1000,
#                                       evaluation_strategy='steps',
#                                       eval_steps=1000,                               
#                                       per_device_train_batch_size=2,
#                                       per_device_eval_batch_size=2,
#                                       warmup_steps=100,
#                                       learning_rate=learning_rate,
#                                       weight_decay=0.01,  
#                                       logging_dir=f'./logs_{learning_rate}')

#     trainer = Trainer(model=model, args=training_args,  
#                       train_dataset=train_data,
#                       eval_dataset=val_data, 
#                       # This custom collate function is necessary 
#                       # to built batches of data
#                       data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),       
#                   'attention_mask': torch.stack([f[1] for f in data]),
#                   'labels': torch.stack([f[0] for f in data])})

#     # Start training process!
#     print(f"Training result for learning rate: {learning_rate}")
#     trainer.train()
#     print("\n\n")

Based on the results above, it looks like model trained with learning rate = 5e-5 is more promising than others.

In [None]:
training_args = TrainingArguments(output_dir=f'./results',
                                      num_train_epochs=5,
                                      logging_steps=1000,
                                      save_steps=5000,
                                      evaluation_strategy='steps',
                                      eval_steps=1000,                               
                                      per_device_train_batch_size=2,
                                      per_device_eval_batch_size=2,
                                      warmup_steps=100,
                                      learning_rate=5e-5,
                                      weight_decay=0.01,  
                                      logging_dir=f'./logs')

trainer = Trainer(model=model, args=training_args,  
                  train_dataset=train_data,
                  eval_dataset=val_data, 
                  # This custom collate function is necessary 
                  # to built batches of data
                  data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),       
              'attention_mask': torch.stack([f[1] for f in data]),
              'labels': torch.stack([f[0] for f in data])})

# Start training process!
trainer.train()




Step,Training Loss,Validation Loss


TrainOutput(global_step=450, training_loss=0.9785207790798611, metrics={'train_runtime': 336.5681, 'train_samples_per_second': 2.659, 'train_steps_per_second': 1.337, 'total_flos': 509568111267840.0, 'train_loss': 0.9785207790798611, 'epoch': 5.0})

In [None]:
# Save model in the specified file path
trainer.save_model("drive/MyDrive/AICamp/nlp/TS")

In [None]:
tokenizer.save_pretrained("drive/MyDrive/AICamp/nlp/TS")

('drive/MyDrive/AICamp/nlp/TS/tokenizer_config.json',
 'drive/MyDrive/AICamp/nlp/TS/special_tokens_map.json',
 'drive/MyDrive/AICamp/nlp/TS/vocab.json',
 'drive/MyDrive/AICamp/nlp/TS/merges.txt',
 'drive/MyDrive/AICamp/nlp/TS/added_tokens.json')

## Checking Model Output

In [None]:
generated = tokenizer("<|startoftext|>", return_tensors="pt").input_ids.cuda()
sample_outputs = model.generate(generated, do_sample=True, top_k=50,
                                # bos_token='<|startoftext|>',
                                # eos_token='<|endoftext|>', pad_token='<|pad|>',
                                max_length=50, top_p=0.90, temperature=1.8, num_return_sequences=20)
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: You really cant resist the cool things I think when you sit around my kitchen
1: He thinks like he knows who a day wears away He loves himself to a great dark time It doesnt need to. But oh, I know where it goes again Id like he thinks like I know him Now Im here alone Id like my best friend
2: Once I saw a white rose blooming, its always blue and soft after all these flowers have comeosing past the tree, let thereby I fell straight up under one another like ghosts In their old beds and old moles as if none
3: In the garden it seemed hard to be at lunch, waiting on the street while Im standing, I wanted a beer Im sippin back again Dont ask But I always got on if shes still here But where in New Zealand would I
4: Im so happy you called last week She asked about how it would work and you said it would save its name on their face But yeah If we were friends this fall, all around the corner Would a crowd dance on your window? They dont walk
5: I remember those times with our arms locke

## Upload model to huggingface

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from huggingface_hub import HfApi

api = HfApi()

In [None]:
# Create your repo first to upload the model
api.create_repo(repo_id="NLP-TaylorSwift")

RepoUrl('https://huggingface.co/casonshep/NLP-TaylorSwift', endpoint='https://huggingface.co', repo_type='model', repo_id='casonshep/NLP-TaylorSwift')

In [None]:
# Upload your model to huggingface. You can clone the repo anytime to use the model.
import os

model_pth = "drive/MyDrive/AICamp/nlp/TS"

files = os.listdir(model_pth)
for fi in files:
    print(os.path.join(model_pth, fi))

    api.upload_file(
        path_or_fileobj=os.path.join(model_pth, fi),
        path_in_repo=fi,
        repo_id="casonshep/NLP-TaylorSwift",
        repo_type="model",
    )

drive/MyDrive/AICamp/nlp/TS/config.json
drive/MyDrive/AICamp/nlp/TS/generation_config.json
drive/MyDrive/AICamp/nlp/TS/pytorch_model.bin


Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/551M [00:00<?, ?B/s]

drive/MyDrive/AICamp/nlp/TS/training_args.bin


Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/3.52k [00:00<?, ?B/s]

drive/MyDrive/AICamp/nlp/TS/tokenizer_config.json
drive/MyDrive/AICamp/nlp/TS/special_tokens_map.json
drive/MyDrive/AICamp/nlp/TS/added_tokens.json
drive/MyDrive/AICamp/nlp/TS/vocab.json
drive/MyDrive/AICamp/nlp/TS/merges.txt
