<a href="https://colab.research.google.com/github/andrewkwstich/Crossword-Generator/blob/main/clue_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import itertools
import numpy as np
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, AdamW, get_linear_schedule_with_warmup
import torch
torch.manual_seed(14)
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
import nltk
import time
import datetime
import random
nltk.download('punkt')
from tqdm import tqdm, trange

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
path = '/content/drive/My Drive/Colab Notebooks/Crossword-Generator/'

In [5]:
train = pd.read_csv(path+"train.csv")
dev = pd.read_csv(path+"valid.csv")

In [6]:
cache_dir = path + "tmp/"
checkpoints = path + "checkpoints/"

In [7]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>', cache_dir=cache_dir)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
config = GPT2Config.from_pretrained("gpt2", cache_dir=cache_dir)
model = GPT2LMHeadModel.from_pretrained("gpt2", cache_dir=cache_dir)

In [9]:
model.resize_token_embeddings(len(tokenizer))
device = torch.device("cuda")
model.cuda()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50259, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [10]:
train_list_analogy = []
analogy_length = 1000
for i in range(0, analogy_length, 2):
    try:
        train_list_analogy.append("Crossword clue for " + train["answer"][i] + ": " + train["clue"][i] + "Crossword clue for " + train["answer"][i+1] + ": " + train["clue"][i+1])
    except:
        pass

In [11]:
train_list_single = []
num_train_examples = 10000  # = len(train) if using all data
for i in range(analogy_length, num_train_examples, 1):
    try:
        train_list_single.append("Crossword clue for " + train["answer"][i] + ": " + train["clue"][i])
    except:
        pass

In [12]:
dev_list = []
num_dev_examples = 1000  # = len(train) if using all data
for i in range(num_dev_examples):
    try:
        dev_list.append("Crossword clue for " + dev["answer"][i] + ": " + dev["clue"][i])
    except:
        pass

In [13]:
class GPT2Dataset(Dataset):

    def __init__(self, txt_list, tokenizer, max_length=30):

        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []

        for txt in txt_list:

            encodings_dict = tokenizer('<|startoftext|>'+ txt + '<|endoftext|>', truncation=True, max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))
    
    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx] 

In [14]:
train_dataset_analogy = GPT2Dataset(train_list_analogy, tokenizer)
train_dataset_single = GPT2Dataset(train_list_single, tokenizer)
dev_dataset = GPT2Dataset(dev_list, tokenizer)

train_dataloader_analogy = DataLoader(
            train_dataset_analogy,  # The training samples.
            sampler = RandomSampler(train_dataset_analogy), # Select batches randomly
            batch_size = 2 # Trains with this batch size.
        )
train_dataloader_single = DataLoader(
            train_dataset_single,  # The training samples.
            sampler = RandomSampler(train_dataset_single), # Select batches randomly
            batch_size = 2 # Trains with this batch size.
        )
dev_dataloader = DataLoader(
            dev_dataset,  # The training samples.
            sampler = RandomSampler(dev_dataset), # Select batches randomly
            batch_size = 2 # Trains with this batch size.
        )

In [15]:
epochs = 5
learning_rate = 5e-4
warmup_steps = 1e2
epsilon = 1e-8
total_steps = len(train_dataset_analogy) + len(train_dataset_single) * (epochs-1)

# this produces sample output every 100 steps
sample_every = 1000

In [16]:
optimizer = AdamW(model.parameters(),
                  lr = learning_rate,
                  eps = epsilon
                )
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = warmup_steps, 
                                            num_training_steps = total_steps)



In [17]:
def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))

In [None]:
total_t0 = time.time()

training_stats = []

model = model.to(device)

training_stats = []

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    if epoch_i == 0:
      train_dataloader = train_dataloader_analogy
      prev_loss = 0
    else:
      train_dataloader = train_dataloader_single
      prev_loss = avg_train_loss

    total_train_loss = 0

    model.train()

    t0 = time.time()

    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        model.zero_grad()        

        outputs = model(  b_input_ids,
                          labels=b_labels, 
                          attention_mask = b_masks,
                          token_type_ids=None
                        )

        loss = outputs[0]

        batch_loss = loss.item()
        total_train_loss += batch_loss

        # # Get sample every x batches.
        # if step % sample_every == 0 and not step == 0:

        #     elapsed = format_time(time.time() - t0)
        #     print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))

        #     model.eval()

        #     sample_outputs = model.generate(
        #                             bos_token_id=random.randint(1,30000),
        #                             do_sample=True,   
        #                             top_k=50, 
        #                             max_new_tokens = 30,
        #                             top_p=0.95, 
        #                             num_return_sequences=1
        #                         )
        #     for i, sample_output in enumerate(sample_outputs):
        #           print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
            
        #     model.train()

        loss.backward()

        optimizer.step()

        scheduler.step()

    # Calculate the average loss over all of the batches.

    avg_train_loss = total_train_loss / len(train_dataloader)

    if prev_loss != 0 and prev_loss < avg_train_loss:
        torch.save({
        'epoch': epoch_i,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler': scheduler.state_dict(),
        'loss': loss
        }, checkpoints + "clue_generator_{}".format(epoch_i) + ".pt")
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
        
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in dev_dataloader:
        
        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)
        
        with torch.no_grad():        

            outputs  = model(b_input_ids, 
#                            token_type_ids=None, 
                             attention_mask = b_masks,
                            labels=b_labels)
          
            loss = outputs[0]  
            
        batch_loss = loss.item()
        total_eval_loss += batch_loss        

    avg_val_loss = total_eval_loss / len(dev_dataloader)
    
    validation_time = format_time(time.time() - t0)    

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...

  Average training loss: 6.13
  Training epoch took: 0:00:20

Running Validation...
  Validation Loss: 1.77
  Validation took: 0:00:06

Training...

  Average training loss: 1.87
  Training epoch took: 0:05:42

Running Validation...
  Validation Loss: 1.87
  Validation took: 0:00:06

Training...

  Average training loss: 1.55
  Training epoch took: 0:05:40

Running Validation...
  Validation Loss: 1.98
  Validation took: 0:00:06

Training...

  Average training loss: 1.16
  Training epoch took: 0:05:41

Running Validation...
  Validation Loss: 2.28
  Validation took: 0:00:06

Training...
