In [1]:
#!pip install torch
#!pip install transformers
#!pip install flash-attn --no-build-isolation

In [None]:
import torch
import warnings
warnings.filterwarnings('ignore')

from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Device the model and the input tensors are moved to. Use the GPU when PyTorch
# can see one; otherwise fall back to the CPU so this cell does not fail at
# `model.to(device)` on a CPU-only runtime.
device = "cuda" if torch.cuda.is_available() else "cpu"

# float16 weights on the GPU; float32 on the CPU, where half-precision kernels
# may be missing or slow depending on the installed PyTorch version.
model_dtype = torch.float16 if device == "cuda" else torch.float32

# initialize tokenizer and model from pretrained GPT2 model from Huggingface
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')
model = GPT2LMHeadModel.from_pretrained(
    'gpt2-large',
    pad_token_id=tokenizer.eos_token_id,  # the EOS id doubles as the padding id
    torch_dtype=model_dtype,
)  # optional extra argument: attn_implementation="flash_attention_2"

# Move the weights to the selected device. The result is bound to a name,
# presumably so the notebook does not echo the model repr as cell output.
out = model.to(device)

### generate text - autoregressive next word prediction
### note that `encode` and `decode` here do not refer to the encoder-decoder architecture; they mean converting text into numeric token IDs and converting those IDs back into text (tokens)


In [11]:
# Prompt that the model continues.
sequence = "the first president of the US was"

# Alternative multi-line prompt to experiment with:
#sequence = \
#"All the lonely people \
#Where do they all come from? "

# encoding sentence for model to process
inputs = tokenizer.encode(sequence, return_tensors='pt').to(device)

# The prompt is a single, unpadded sequence, so every position is a real token.
# Passing the mask explicitly avoids relying on `generate` to infer it, which it
# cannot do when the pad id and the EOS id are the same.
attention_mask = torch.ones_like(inputs)

# generating text and decoding, try sampling...
# Deterministic beam search (no sampling); `max_length` is the total length
# budget passed to `generate`.
outputs = model.generate(
    inputs,
    attention_mask=attention_mask,
    max_length=40,
    do_sample=False,
    num_beams=5,
    no_repeat_ngram_size=2,
    early_stopping=True,
)
text = tokenizer.decode(outputs[0], skip_special_tokens=True)
# printing output
print('\n\n'+text)



the first president of the US was not born in the United States. He was born on April 4, 1805, in New York City, the son of John Quincy Adams, a lawyer, and


### define useful functions for loading and processing data

In [8]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

def load_dataset(file_path, tokenizer, block_size = 128):
    """Build a ``TextDataset`` from a text file.

    Args:
        file_path: Path of the text file handed to ``TextDataset``.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Forwarded to ``TextDataset`` unchanged (default 128);
            presumably the number of tokens per training example -- confirm
            against the Transformers documentation.

    Returns:
        The constructed ``TextDataset`` instance.
    """
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )

def load_data_collator(tokenizer, mlm = False):
    """Create the batch collator used during training.

    Args:
        tokenizer: Tokenizer handed to ``DataCollatorForLanguageModeling``.
        mlm: Forwarded to the collator unchanged (default ``False``);
            presumably toggles masked-language-modeling -- confirm against the
            Transformers documentation.

    Returns:
        The constructed ``DataCollatorForLanguageModeling`` instance.
    """
    return DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=mlm,
    )

### domain adaptation of the pretrained GPT2 model

In [15]:
def train(train_file_path, model_name, output_dir, overwrite_output_dir, per_device_train_batch_size, num_train_epochs, save_steps):
    """Continue training a pretrained GPT-2 checkpoint on a plain-text corpus.

    The tokenizer and the starting weights are written to ``output_dir`` before
    training, and the trained model is written there again afterwards, so the
    directory can be loaded with ``from_pretrained``.

    Args:
        train_file_path: Path of the text file passed to ``load_dataset``.
        model_name: Name or path given to ``from_pretrained`` for both the
            tokenizer and the model.
        output_dir: Directory that receives the tokenizer, the model and
            whatever the ``Trainer`` writes.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments``. This argument used to
            be accepted but ignored, so the library default was always applied.

    Returns:
        None.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # Store the tokenizer next to the model so that `output_dir` alone is enough
    # to reload both later.
    tokenizer.save_pretrained(output_dir)
    model = GPT2LMHeadModel.from_pretrained(model_name)
    # Snapshot of the starting weights; `trainer.save_model()` below writes the
    # trained model to the same directory.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps,  # honor the caller's checkpoint interval
        logging_steps=50,       # report the training loss every 50 steps
        report_to="none",       # do not send logs to external trackers
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [16]:
# --- Configuration of the domain-adaptation run ---

# Checkpoint to start from.
model_name = "gpt2"

# Training corpus. An alternative corpus is kept here for quick switching:
# train_file_path = "/content/fine-tuning-data/reddit.multiple.subreddits.10K.dat"
train_file_path = "/content/domain-adaptation-data/shakespeare-5K.dat"

# Directory the adapted model is written to.
output_dir = "/content/adapted-models"

# Values handed to the training function.
num_train_epochs = 3
per_device_train_batch_size = 8
save_steps = 500
overwrite_output_dir = False

In [17]:
import warnings

# Suppress Python warnings from this point on so the training log stays readable.
warnings.filterwarnings("ignore")

# Start the training run; keyword arguments are listed in the order of the
# `train` signature.
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps,
)

Step,Training Loss
50,3.9905
100,3.6102


### generate text with the freshly domain-adapted model

In [18]:
# Directory holding the adapted tokenizer and model (same path as `output_dir`
# in the training configuration).
model_path = '/content/adapted-models'

# initialize tokenizer and model from tuned GPT2 model
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(
    model_path,
    pad_token_id=tokenizer.eos_token_id,  # the EOS id doubles as the padding id
    # Half precision only when the target device is a GPU; float32 otherwise,
    # because half-precision CPU kernels may be missing or slow.
    torch_dtype=torch.float16 if str(device).startswith("cuda") else torch.float32,
)

# Move the weights to the target device. The result is bound to a name,
# presumably so the notebook does not echo the model repr as cell output.
out = model.to(device)

In [19]:
# Use the EOS id as the padding id for generation as well.
model.generation_config.pad_token_id = tokenizer.eos_token_id

# Prompt that the adapted model continues.
sequence = "the first president of the US was"

# encoding sentence for model to process
inputs = tokenizer.encode(sequence, return_tensors='pt').to(device)

# The prompt is a single, unpadded sequence, so every position is a real token.
# Passing the mask explicitly avoids relying on `generate` to infer it, which it
# cannot do when the pad id and the EOS id are the same.
attention_mask = torch.ones_like(inputs)

# generating text and decoding, try sampling...
# Deterministic beam search (no sampling); `max_length` is the total length
# budget passed to `generate`.
outputs = model.generate(
    inputs,
    attention_mask=attention_mask,
    max_length=50,
    do_sample=False,
    num_beams=5,
    no_repeat_ngram_size=2,
    early_stopping=True,
)
text = tokenizer.decode(outputs[0], skip_special_tokens=True)
# printing output
print('\n\n'+text)



the first president of the US was born in the year of our Lord's birth.

CAMILLO:
I am sorry, sir, that you have not been able to see him,
but I think you must have seen him
