<a href="https://colab.research.google.com/github/aditeyabaral/gpt2-implementation/blob/main/huggingface_transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transformers Implementation

# Installing and Setting up Environment

In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/19/22/aff234f4a841f8999e68a7a94bdd4b60b4cebcfeca5d67d61cd08c9179de/transformers-3.3.1-py3-none-any.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 3.5MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 17.9MB/s 
[?25hCollecting tokenizers==0.8.1.rc2
[?25l  Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 26.4MB/s 
[?25hCollecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl 

In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments, pipeline
from transformers import GPT2Config, GPT2LMHeadModel

# Loading Training Data

In [None]:
!curl -L -O https://raw.githubusercontent.com/aditeyabaral/gpt2-implementation/main/Simpsons.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 8998k  100 8998k    0     0  14.4M      0 --:--:-- --:--:-- --:--:-- 14.4M


In [None]:
df = pd.read_csv("Simpsons.csv")
df.dropna(inplace = True)
df.drop_duplicates(inplace = True)
df.reset_index(inplace = True)
df.drop(columns = "index", inplace = True)
print(df.shape)
df.head()

(126646, 2)


Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [None]:
def character_slice(character):
  if character in np.unique(df[["raw_character_text"]]):
    return df[df["raw_character_text"] == character]

In [None]:
CHARACTER = "Miss Hoover"

character_df = character_slice(CHARACTER)
print(character_df.shape)
character_df.head()

(154, 2)


Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
4426,Miss Hoover,"Thank you, Ralph. Very graphic. Lisa Simpson? ..."
4428,Miss Hoover,Dear God!
4785,Miss Hoover,I question the educational value of this assem...


## Adding delimiters

This is optional - allows for retention of training data structure

In [None]:
train_text = "\n".join(character_df["spoken_words"].values)

In [None]:
with open("corpus.txt","w") as f:
  f.write(train_text)

# Training

In [None]:
tokenizer = AutoTokenizer.from_pretrained('gpt2')
train_path = "/content/corpus.txt"
train_data = TextDataset(tokenizer=tokenizer,file_path=train_path,block_size=4)
collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=665.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




In [None]:
config = GPT2Config(
  vocab_size=tokenizer.vocab_size,
  bos_token_id=tokenizer.bos_token_id,
  eos_token_id=tokenizer.eos_token_id
)

In [None]:
model = GPT2LMHeadModel(config)
training_args=TrainingArguments(output_dir="/content/output",overwrite_output_dir=True,num_train_epochs=10,
                                per_device_train_batch_size=1,per_device_eval_batch_size=1,eval_steps=100,save_steps=5000,warmup_steps=100,gradient_accumulation_steps=1)
torch.cuda.empty_cache()
trainer = Trainer(model=model, args=training_args, data_collator=collator, train_dataset=train_data, prediction_loss_only=True)
trainer.train()
trainer.save_model()



HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=566.0, style=ProgressStyle(description_wi…

{'loss': 8.0275615234375, 'learning_rate': 4.640287769784173e-05, 'epoch': 0.8833922261484098, 'total_flos': 1493277696000, 'step': 500}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=566.0, style=ProgressStyle(description_wi…

{'loss': 5.6583193359375, 'learning_rate': 4.1906474820143885e-05, 'epoch': 1.76678445229682, 'total_flos': 2986555392000, 'step': 1000}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=566.0, style=ProgressStyle(description_wi…

{'loss': 5.124650390625, 'learning_rate': 3.741007194244605e-05, 'epoch': 2.65017667844523, 'total_flos': 4479833088000, 'step': 1500}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=566.0, style=ProgressStyle(description_wi…

{'loss': 4.664802734375, 'learning_rate': 3.2913669064748206e-05, 'epoch': 3.53356890459364, 'total_flos': 5973110784000, 'step': 2000}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=566.0, style=ProgressStyle(description_wi…

{'loss': 4.138349609375, 'learning_rate': 2.841726618705036e-05, 'epoch': 4.41696113074205, 'total_flos': 7466388480000, 'step': 2500}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=566.0, style=ProgressStyle(description_wi…

{'loss': 3.626490234375, 'learning_rate': 2.392086330935252e-05, 'epoch': 5.30035335689046, 'total_flos': 8959666176000, 'step': 3000}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=566.0, style=ProgressStyle(description_wi…

{'loss': 3.177654296875, 'learning_rate': 1.942446043165468e-05, 'epoch': 6.18374558303887, 'total_flos': 10452943872000, 'step': 3500}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=566.0, style=ProgressStyle(description_wi…

{'loss': 2.79109765625, 'learning_rate': 1.4928057553956835e-05, 'epoch': 7.067137809187279, 'total_flos': 11946221568000, 'step': 4000}
{'loss': 2.3026875, 'learning_rate': 1.0431654676258994e-05, 'epoch': 7.950530035335689, 'total_flos': 13439499264000, 'step': 4500}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=566.0, style=ProgressStyle(description_wi…

{'loss': 1.9343203125, 'learning_rate': 5.935251798561151e-06, 'epoch': 8.8339222614841, 'total_flos': 14932776960000, 'step': 5000}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=566.0, style=ProgressStyle(description_wi…

{'loss': 1.72496875, 'learning_rate': 1.4388489208633094e-06, 'epoch': 9.717314487632509, 'total_flos': 16426054656000, 'step': 5500}




# Generation

In [19]:
generator = pipeline("text-generation",model="/content/output",tokenizer="gpt2",config={'max-length':1024})
generated_text = generator("Ralph")[0]['generated_text']
print(generated_text)

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


Ralph, this.
II don don't don't't take't take take't take


In [22]:
!zip -r /content/output . output.zip

  adding: .config/ (stored 0%)
  adding: .config/configurations/ (stored 0%)
  adding: .config/configurations/config_default (deflated 15%)
  adding: .config/active_config (stored 0%)
  adding: .config/.last_opt_in_prompt.yaml (stored 0%)
  adding: .config/logs/ (stored 0%)
  adding: .config/logs/2020.10.14/ (stored 0%)
  adding: .config/logs/2020.10.14/16.30.50.869240.log (deflated 92%)
  adding: .config/logs/2020.10.14/16.31.31.333159.log (deflated 54%)
  adding: .config/logs/2020.10.14/16.31.46.824031.log (deflated 54%)
  adding: .config/logs/2020.10.14/16.31.46.119103.log (deflated 55%)
  adding: .config/logs/2020.10.14/16.31.25.706256.log (deflated 87%)
  adding: .config/logs/2020.10.14/16.31.10.843101.log (deflated 54%)
  adding: .config/config_sentinel (stored 0%)
  adding: .config/.last_update_check.json (deflated 23%)
  adding: .config/.last_survey_prompt.yaml (stored 0%)
  adding: .config/gce (stored 0%)
  adding: .config/.metricsUUID (stored 0%)
  adding: cached_lm_GPT2Token