In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.2-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 5.1 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 47.6 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.10.0-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 64.3 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.0 tokenizers-0.12.1 transformers-4.22.2


In [3]:
!pip install wordfreq

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wordfreq
  Downloading wordfreq-3.0.2-py3-none-any.whl (56.8 MB)
[K     |████████████████████████████████| 56.8 MB 1.1 MB/s 
Collecting ftfy>=6.1
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 2.0 MB/s 
Installing collected packages: ftfy, wordfreq
Successfully installed ftfy-6.1.1 wordfreq-3.0.2


In [None]:
import regex as re
import string
import itertools
import numpy as np
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, AdamW, get_linear_schedule_with_warmup, GPTNeoForCausalLM
import torch
torch.manual_seed(14)
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
import nltk
from wordfreq import word_frequency
import time
import datetime
import random
nltk.download('punkt')
from tqdm import tqdm, trange

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
category_list = ["(wordplay)", "(anagram)", "(abbreviation)"]

In [None]:
def discard_low_freq(words, category_list=category_list, threshold=5e-05):
    words = str(words)
    tokenized = words.translate(str.maketrans('', '', string.punctuation)).split()
    freqs = [(word, word_frequency(word, "en")) for word in tokenized]
    for word, freq in freqs:
        if freq < threshold and "("+word+")" not in category_list:
            return np.NaN
    return True

In [None]:
def discard_low_freq_df(df):
    return df.iloc[df[["answer", "clue"]].applymap(discard_low_freq).dropna().index].reset_index()

In [None]:
train = discard_low_freq_df(pd.read_csv(path+"train.csv"))
dev = discard_low_freq_df(pd.read_csv(path+"valid.csv"))

In [None]:
path = '/content/drive/My Drive/Colab Notebooks/Crossword-Generator/'

In [None]:
cache_dir = path + "tmp/"
checkpoints = path + "checkpoints/"

In [None]:
model_str = "EleutherAI/gpt-neo-1.3B"
model_dict = {"EleutherAI/gpt-neo-1.3B":GPTNeoForCausalLM, "gpt2":GPT2LMHeadModel}
model_type = model_dict[model_str]

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained(model_str, bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>', cache_dir=cache_dir)
model = model_type.from_pretrained(model_str, cache_dir=cache_dir)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
model.resize_token_embeddings(len(tokenizer))
device = torch.device("cuda")
model.cuda()

GPTNeoForCausalLM(
  (transformer): GPTNeoModel(
    (wte): Embedding(50259, 2048)
    (wpe): Embedding(2048, 2048)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0): GPTNeoBlock(
        (ln_1): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (attn): GPTNeoAttention(
          (attention): GPTNeoSelfAttention(
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (resid_dropout): Dropout(p=0.0, inplace=False)
            (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
            (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
            (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
            (out_proj): Linear(in_features=2048, out_features=2048, bias=True)
          )
        )
        (ln_2): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTNeoMLP(
          (c_fc): Linear(in_features=2048, out_features=8192, bias=True)
          (c_proj): Linear(

In [None]:
category_string = str(tuple(category_list)).replace(", ", "|").replace("'","")
category_dict = {}
for word in category_list:
    category_dict[word] = []

In [None]:
train_list_analogy = []
analogy_length = 100000

for i in range(0, analogy_length, 2):
    try:
        i_match = re.search("\({}\)".format(category_string), train["clue"][i])
        next_match = re.search("\({}\)".format(category_string), train["clue"][i+1])
        if not i_match and not next_match:
            train_list_analogy.append("Crossword clue for " + train["answer"][i] + ": " + train["clue"][i] + ". Crossword clue for " + train["answer"][i+1] + ": " + train["clue"][i+1])
        elif i_match and i_match[0] in category_dict:
            category_dict[i_match[0]].append(i)
        elif next_match and next_match[0] in category_dict:
            category_dict[next_match[0]].append(i+1)
        else:
            train_list_analogy.append("Crossword clue for " + train["answer"][i] + ": " + train["clue"][i] + ". Crossword clue for " + train["answer"][i+1] + ": " + train["clue"][i+1])
    except:
        print(i)
        pass
for category in category_dict.values():
    for i in range(0, len(category), 2):
        try:
            train_list_analogy.append("Crossword clue for " + train["answer"][category[i]] + ": " + train["clue"][category[i]] + ". Crossword clue for " + train["answer"][category[i+1]] + ": " + train["clue"][category[i+1]])
        except:
            pass

In [None]:
len(train_list_analogy)

48107

In [None]:
train_list_single = []
num_train_examples = int(200000)  # = len(train) if using all data
for i in range(analogy_length, num_train_examples, 1):
    try:
        train_list_single.append("Crossword clue for " + train["answer"][i] + ": " + train["clue"][i])
    except:
        pass

In [None]:
dev_list = []
num_dev_examples = int(25000)  # = len(dev) if using all data
for i in range(num_dev_examples):
    try:
        dev_list.append("Crossword clue for " + dev["answer"][i] + ": " + dev["clue"][i])
    except:
        pass

In [None]:
class GPTDataset(Dataset):

    def __init__(self, txt_list, tokenizer, max_length=30):

        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []

        for txt in txt_list:

            encodings_dict = tokenizer('<|startoftext|>'+ txt + '<|endoftext|>', truncation=True, max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))
    
    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx] 

In [None]:
train_dataset_analogy = GPTDataset(train_list_analogy, tokenizer)
train_dataset_single = GPTDataset(train_list_single, tokenizer)
dev_dataset = GPTDataset(dev_list, tokenizer)

train_dataloader_analogy = DataLoader(
            train_dataset_analogy,  # The training samples.
            sampler = RandomSampler(train_dataset_analogy), # Select batches randomly
            batch_size = 2 # Trains with this batch size.
        )
train_dataloader_single = DataLoader(
            train_dataset_single,
            sampler = RandomSampler(train_dataset_single),
            batch_size = 2
        )
dev_dataloader = DataLoader(
            dev_dataset,
            sampler = RandomSampler(dev_dataset),
            batch_size = 2
        )

In [None]:
epochs = 5
learning_rate = 5e-4
warmup_steps = 1e2
epsilon = 1e-8
total_steps = len(train_dataset_analogy) + len(train_dataset_single) * (epochs-1)

# this produces sample output every 100 steps
sample_every = 1000

In [None]:
optimizer = AdamW(model.parameters(),
                  lr = learning_rate,
                  eps = epsilon
                )
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = warmup_steps, 
                                            num_training_steps = total_steps)



In [None]:
def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))

In [None]:
total_t0 = time.time()

training_stats = []

model = model.to(device)

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    if epoch_i == 0:
      train_dataloader = train_dataloader_analogy
      space = " "
      prev_loss = 0
    else:
      train_dataloader = train_dataloader_single
      space = ""
      prev_loss = avg_train_loss

    # train_dataloader = train_dataloader_single

    total_train_loss = 0

    model.train()

    t0 = time.time()

    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        model.zero_grad()        

        outputs = model(  b_input_ids,
                          labels=b_labels, 
                          attention_mask = b_masks,
                          token_type_ids=None
                        )

        loss = outputs[0]

        batch_loss = loss.item()
        total_train_loss += batch_loss

        # Get sample every x batches.
        if step % sample_every == 0 and not step == 0:

            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))

            model.eval()

            for i, sample_input in enumerate(b_input_ids):
                text = tokenizer.decode(sample_input, skip_special_tokens=True)
                try:
                    substring = str(tokenizer.encode(re.search(space + "Crossword clue for .*:", text)[0][1:])[1:])[1:-1]
                except:
                    print("No substring. Current example: ", text)
                tensor_list = [num.item() for num in sample_input]
                start_index = str(tensor_list).rfind(substring)
                end_index = start_index + len(substring)
                input_string = str(tensor_list)[1:end_index]
                input_list = [int(num) for num in input_string.strip().split(",")]
                inputs = torch.tensor(input_list).view(1,len(input_list)).to(device)
                outputs = model.generate( 
                        inputs=inputs,
                        do_sample=True,   
                        top_k=50, 
                        max_new_tokens = 30,
                        top_p=0.95, 
                        num_return_sequences=1,
                        pad_token_id=tokenizer.eos_token_id)
                print(tokenizer.decode(outputs[0], skip_special_tokens=True))
            
            model.train()

        loss.backward()

        optimizer.step()

        scheduler.step()

    # Calculate the average loss over all of the batches.

    avg_train_loss = total_train_loss / len(train_dataloader)
    torch.save({
    'epoch': epoch_i,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'scheduler': scheduler.state_dict(),
    'loss': loss
    }, checkpoints + "clue_generator_{}_{}".format(model_str.replace("/", ""), epoch_i) + ".pt")
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
        
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in dev_dataloader:
        
        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)
        
        with torch.no_grad():        

            outputs  = model(b_input_ids, 
#                            token_type_ids=None, 
                             attention_mask = b_masks,
                            labels=b_labels)
          
            loss = outputs[0]  
            
        batch_loss = loss.item()
        total_eval_loss += batch_loss        

    avg_val_loss = total_eval_loss / len(dev_dataloader)
    
    validation_time = format_time(time.time() - t0)    

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
  Batch 1,000  of  24,054. Loss: 2.892059087753296.   Elapsed: 0:02:33.
Crossword clue for step: One of 39 in an old movie. Crossword clue for agent: Made a band
Crossword clue for did: Carried through on. Crossword clue for of course: Night.
  Batch 2,000  of  24,054. Loss: 2.2793357372283936.   Elapsed: 0:05:07.
Crossword clue for net gain: Final profit. Crossword clue for de co: It makes sense before a lot on it
Crossword clue for nearly: More or less. Crossword clue for return: '___!'
  Batch 3,000  of  24,054. Loss: 2.3121321201324463.   Elapsed: 0:07:40.
Crossword clue for high: Weather map mark. Crossword clue for word: They's for "up, what!"
Crossword clue for i took: ___ a trip on a train.... Crossword clue for how: 'Yes already!'
  Batch 4,000  of  24,054. Loss: 2.3873767852783203.   Elapsed: 0:10:13.
Crossword clue for arts: Industrial ___ (school subject). Crossword clue for quite a few: Not many in (wordplay)
Crossword clue for a re: "So there you ___". Crossw