<a href="https://colab.research.google.com/github/cactode/suburban/blob/main/suburban.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Suburban Dictionary Fine Tuning Scripts
This notebook goes through the process of filtering / cleaning an Urban Dictionary definition corpus and using it to fine-tune a TensorFlow GPT-2 model from Hugging Face. It tries to take advantage of Google Colab TPUs, so make sure you have the TPU runtime enabled.

See the final result at [cactode.club/suburban](https://cactode.club/suburban)

If you're starting out with [this raw dataset from Kaggle](https://www.kaggle.com/therohk/urban-dictionary-words-dataset), you'll need to clean it up first with this script.

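Code for cleaning the original dataset: it wraps the second field of every row in double quotes and writes the result to `urbandict-word-defs-fixed.csv`. Loading the data into Arrow format happens later in this notebook.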
```python
import re
import os
import pandas as pd
from datasets import Dataset

# Splits a raw row into three groups:
#   1. the leading numeric id plus its comma,
#   2. the unquoted second field (may itself contain commas),
#   3. the tail: two integers, a \w* field and a final double-quoted field.
tokenizer = re.compile(r'(^\d+,)(.+)(,-?\d+,-?\d+,\w*,\"[^\"]+\"$)')
with open('urbandict-word-defs.csv', 'r') as inp, \
        open('urbandict-word-defs-fixed.csv', "w") as out:
    inp_iter = iter(inp)
    # copy the header through unchanged; the line already carries its own newline
    header = next(inp_iter, None)
    if header is not None:
        out.write(header)
    for inp_line in inp_iter:
        match = tokenizer.match(inp_line)
        if not match:
            # rows that do not fit the expected layout are reported and dropped
            print("No match...")
            print(inp_line)
            continue
        if '"' in match[2]:
            # wrapping a field that already contains a quote would corrupt the row
            raise ValueError("Detected quote, " + inp_line)
        # group 3 stops before the line's trailing newline, so add it back
        out.write(match.expand(r'\1"\2"\3') + '\n')
```

## Package Installation

In [None]:
# Install git-lfs: register the packagecloud apt repository, then install the package.
# NOTE(review): this pipes a remote script straight into `sudo bash` - fine on a throwaway
# Colab VM, risky anywhere else. Presumably git-lfs is needed for the push-to-hub step; confirm.
!curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
!sudo apt-get install git-lfs
# Python packages imported by the next cell.
# NOTE(review): versions are unpinned, so behaviour may drift as these packages change.
!pip install datasets transformers symspellpy

## Imports and Data Setup


In [None]:
%tensorflow_version 2.x
import tensorflow as tf
import numpy as np
import pandas as pd
import os
import datetime
import re
import pkg_resources
from unicodedata import normalize
from datasets import Dataset
from matplotlib import pyplot as plt
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel, pipeline
from symspellpy import SymSpell, Verbosity
from base64 import b64encode, b64decode
from IPython.core.display import display, HTML

# utility to print out things with pretty HTML formatting
def niceprint(*args):
    """Show the arguments, stringified and space-separated, as one HTML paragraph.

    The text is inserted into the markup as-is, so callers may pass HTML tags.
    """
    joined = ' '.join(str(arg) for arg in args)
    display(HTML(f"<p>{joined}"))

# prevents pandas from cutting off printed outputs:
# allow very wide text columns and up to 400 rows before truncating
pd.set_option("display.max_colwidth", 6000)
pd.set_option("display.max_rows", 400)

# fast spell correct: a SymSpell instance loaded with the English frequency
# dictionary that ships inside the symspellpy package
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
# NOTE(review): pkg_resources is deprecated in recent setuptools releases - confirm it is
# still available in the target runtime
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
# term_index / count_index: column positions of the word and its frequency in that file
_ = sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

# our source material is inherently racist, but it's worth trying to cut that down by filtering out
#  definitions with racist words. racist words are base64 encoded, view at own risk
badwords_b64 = b'YXBlIGFyeWFuIGJlYW5lciBiaW1ibyBib290bGlwIGNoaW5rIGNvb2xpZSBjb29uIGNyYWNrZXIgZHlrZSBmYWcgZmFnb3QgZmFnZ290IGdyaW5nbyBob25reSBpbmp1biBpc2xhbSBqYXAgamV3IGpld2lzaCBqaWdhYm9vIGt5a2UgbGVzYm8gbXVzbGltIG5pZ2xldCBuaWdnZXIgbmlnZ2EgcmV0YXJkIHJldGFyZGVkIHNwaWMgd2V0YmFjayB3aG9yZQ=='
# a second SymSpell with edit distance 0, used as a lookup table for the decoded words
bad_words = SymSpell(max_dictionary_edit_distance=0, prefix_length=7)
# the decoded payload is a space-separated word list; every word gets a count of 1
for word in b64decode(badwords_b64).decode('ascii').split(" "):
    bad_words.create_dictionary_entry(word, 1)

# mount Google Drive. the cleaned csv is read later from
# <PREFIX>/input_data/rawdatasets/urbandict-word-defs-fixed.csv
from google.colab import drive
drive.mount('/content/drive')
# base folder on the mounted drive from which the data path is built
PREFIX = os.path.join('/content','drive', 'MyDrive', 'Colab Notebooks')

# tpu magic: find the TPU, connect to it and build the distribution strategy
try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
  print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError as err:
  # raise an ordinary exception type and keep the resolver's error as the cause,
  # so the traceback shows why detection failed
  raise RuntimeError(
      'ERROR: Not connected to a TPU runtime; enable the TPU runtime '
      '(see the introduction of this notebook) and re-run this cell!'
  ) from err

tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
# used below to size batches / learning rate and to scope model creation
tpu_strategy = tf.distribute.TPUStrategy(tpu)

## Constants

In [None]:
# values used to filter out bad definitions (each one is compared with a strict ">")
MIN_UPVOTE = 10
MIN_UPDOWN_RATIO = 0.5  # lower bound on up_votes / down_votes
MIN_WORD_CHARS = 3  # lower bound on the cleaned word's length
MIN_DEFINITION_CHARS = 10  # lower bound on the cleaned definition's length

# influences maximum output length: every example is padded / truncated to this many tokens
MAX_TOKENS = 128

# this is about good for TPUs: 24 multiplied by the number of replicas in sync
BATCH_SIZE = 24 * tpu_strategy.num_replicas_in_sync

## Data processing

In [None]:
# regex hell. it's pretty far from a science, but it works lol
# " ;; " marker in the raw text; it gets replaced by a real newline
NEWLINES = re.compile(r" ;; ")
# leading enumeration of the first definition: "1." / "1)" / "a." / "a)" or "1 "
FIRST_DEFS = re.compile(r"^[1a][\.\)](?!\d)\s?|^1\s")
# enumeration markers of later definitions (" 2.", " b)", ... or newline + digit + whitespace)
MULTIPLE_DEFS = re.compile(r"\s[1-6a-g][\.\)](?!\d)\s?|\n\d\s")
# leading word-type tag ("noun:", "adj.", ...) or a leading parenthesised note
TYPES = re.compile(r"^(acronym|pronoun|p\.\s?noun|noun|adjective|verb|adv\.|adj\.|v\.|n\.|p\.)[:\s]+|^\([\w\s\.]+\)[:\s]*")
# newlines (plus spaces before them) that directly follow . , ; : or -
REMOVE_NEWLINES = re.compile(r"(?<=[.,;:\-]) *\n+")
# newlines (plus spaces before them) that directly follow a word character
REMOVE_NEWLINES_NOPERIOD = re.compile(r"(?<=\w) *\n+")
# brackets, braces, backslash, pipe, underscore, caret, asterisk, backtick, tilde, angle brackets
GARBAGE = re.compile(r"[\[\]\{\}\\\|_\^\*`~<>]")
# lowercase word of 3+ letters, preceded by whitespace or , ; : and followed by
# end of text, whitespace, ". " or one of , ; : ? !
REAL_WORD = re.compile(r"(?<=[\s,;:])([a-z]{3}[a-z]*)(?=$|\s|\.\s|[,;:\?\!])")
# run of 2+ whitespace characters; group 1 is the first one
EXCESS_SPACE = re.compile(r"(\s)\s+")
# whitespace between a lowercase word and a following . , ; :
CORRECT_PUNCTUATION = re.compile(r"([a-z]+)\s+([\.,;:])")
# final character when it is neither . , ? ! nor whitespace
ADD_PERIOD = re.compile(r"([^\.\,\?\!\s])$")

# converts everything to lowercase ascii to reduce token burden
def normalize_text(text):
    """Return ``text`` stripped, lowercased and reduced to plain ASCII.

    The text is NFKD-decomposed first, then every character that still cannot
    be encoded as ASCII is dropped.
    """
    lowered = text.strip().lower()
    decomposed = normalize('NFKD', lowered)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode()

# helper function that tries to split multiple definitions for one word into unique rows
def split_into_lists(definition):
    """Normalize a raw definition and split it into its enumerated sub-definitions.

    Returns a list of strings; a definition without enumeration markers yields
    a one-element list.
    """
    out = normalize_text(definition)
    out = NEWLINES.sub("\n", out)
    out = FIRST_DEFS.sub("", out)
    # split on the markers directly; no sentinel string is inserted, so text that
    # happens to contain "<|split|>" is left intact
    return MULTIPLE_DEFS.split(out)

# read in our csv and apply data converters defined above
csv_path = os.path.join(PREFIX, 'input_data', 'rawdatasets', 'urbandict-word-defs-fixed.csv')
# NOTE(review): nrows=1000 reads only the first 1000 rows - confirm this cap is wanted for a full run
raw_df = pd.read_csv(
    csv_path,
    usecols=['word', 'up_votes', 'down_votes', 'definition'],
    dtype={'up_votes': 'Int32', 'down_votes': 'Int32'},
    converters={'word': normalize_text, 'definition': split_into_lists},
    nrows=1000,
)

# drop incomplete rows, then explode along definition (one row per sub-definition)
df = raw_df.dropna().explode('definition', ignore_index=True)

# make arrow dataset for next operations
ds = Dataset.from_pandas(df)

In [None]:
# use our regex pile to clean up the word
def process_word(word):
    """Replace GARBAGE characters with spaces, then collapse each whitespace run to its first character."""
    without_garbage = GARBAGE.sub(" ", word)
    return EXCESS_SPACE.sub(r"\1", without_garbage)

# use our regex pile to clean up the definitions
def process_definition(definition):
    """Clean one definition string and return the cleaned text.

    Steps, in order: strip, drop a leading word-type tag, blank out garbage
    characters, fold newlines into the sentence flow, collapse whitespace,
    pull punctuation back onto the preceding word, make sure the text ends
    with punctuation, then spell-correct every word REAL_WORD matches.
    """
    def _spellcheck(match):
        # include_unknown=True asks SymSpell to hand the word back when it has
        # no suggestion, so indexing [0] is safe
        return sym_spell.lookup(
            match[0],
            Verbosity.CLOSEST,
            max_edit_distance=2,
            transfer_casing=True,
            include_unknown=True
        )[0].term

    out = definition.strip()
    out = TYPES.sub("", out) # remove annoying extra word type specifiers
    out = GARBAGE.sub(" ", out) # remove pointless fancy characters
    out = REMOVE_NEWLINES.sub(" ", out) # remove newlines and replace them with spaces
    out = REMOVE_NEWLINES_NOPERIOD.sub(". ", out) # remove newlines with no periods and add periods, too
    out = EXCESS_SPACE.sub(r"\1", out) # collapse runs of whitespace
    out = CORRECT_PUNCTUATION.sub(r"\1\2", out) # get rid of spaces before punctuation.
    # garbage removal can leave trailing whitespace (e.g. text ending in "]");
    # strip it here, otherwise ADD_PERIOD sees a space as the last character and adds nothing
    out = out.strip()
    out = ADD_PERIOD.sub(r"\1.", out) # add period to end of sentence.
    out = REAL_WORD.sub(_spellcheck, out) # aggressive spellchecking
    out = out.strip() # for the road
    return out

# cleans up words and definitions
def process_words_and_definitions(row):
    """Clean the 'word' and 'definition' entries of ``row`` in place and return the same row."""
    cleaners = (
        ('word', process_word),
        ('definition', process_definition),
    )
    for column, cleaner in cleaners:
        row[column] = cleaner(row[column])
    return row

# assembles both words and definitions into a prompt
def assemble_input_text(row):
    """Store the prompt "define <word>: <definition>" under 'input_text' and return the same row."""
    word, definition = row['word'], row['definition']
    row['input_text'] = f"define {word}: {definition}"
    return row
    
# remove things that have bad upvote records. entries with zero downvotes are kept:
# their up/down ratio is unbounded, so they must not be thrown away by the
# division-by-zero guard
ds = ds.filter(
    lambda x: x['up_votes'] > MIN_UPVOTE and (
        x['down_votes'] == 0 or
        x['up_votes'] / x['down_votes'] > MIN_UPDOWN_RATIO
    )
)

# actually map our processing function over everything
ds = ds.map(process_words_and_definitions)

# remove things that are too short or have racist language
# (generator inside any() stops at the first hit instead of looking up every word)
# NOTE(review): REAL_WORD never matches the very first word, nor a last word followed by
# the closing period, and the 'word' column is not checked at all - confirm that is acceptable
ds = ds.filter(
    lambda x: len(x['word']) > MIN_WORD_CHARS and
    len(x['definition']) > MIN_DEFINITION_CHARS and
    not any(
        bad_words.lookup(word, Verbosity.CLOSEST)
        for word in REAL_WORD.findall(x['definition'])
    )
)

# turn into a single prompt
ds = ds.map(assemble_input_text)

# eliminate useless columns, leaving only 'input_text'
ds = ds.remove_columns(['word', 'up_votes', 'down_votes', 'definition'])

## Data Tokenization and Prep

In [None]:
# prepare tokenizer: the pretrained 'gpt2' tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(
    'gpt2',
)
# pad with the EOS token; the tokenize function below relies on padded positions
# holding eos_token_id when it masks the labels
tokenizer.pad_token = tokenizer.eos_token

# define tokenizing function
def tokenize(in_ds):
    """Tokenize a batch of prompts and build next-token labels for it.

    ``in_ds['input_text']`` is tokenized with truncation and padding to
    MAX_TOKENS. Inputs are the ids without their last position, labels are the
    ids without their first position. A label is replaced by -100 wherever the
    input at the same position is EOS, so only the first EOS after the text
    survives as a target.
    """
    # initial tokenization
    token_data = tokenizer(
        in_ds['input_text'],
        max_length=MAX_TOKENS,
        truncation=True,
        padding='max_length'
    )
    eos_id = tokenizer.eos_token_id
    full_ids = token_data['input_ids']

    # labels are the ids shifted left by one, masked where the input is EOS
    shifted_labels = []
    for ids in full_ids:
        inputs, targets = ids[:-1], ids[1:]
        shifted_labels.append([
            -100 if source == eos_id else target
            for source, target in zip(inputs, targets)
        ])

    token_data['labels'] = shifted_labels
    token_data['input_ids'] = [ids[:-1] for ids in full_ids]
    token_data['attention_mask'] = [mask[:-1] for mask in token_data['attention_mask']]
    return token_data

# tokenize!
ds = ds.map(tokenize, batched=True, remove_columns=['input_text'])

# remove any training datums that don't end (no EOS left among the input ids)
ds = ds.filter(lambda x: x['input_ids'].count(tokenizer.eos_token_id))

# hand the columns out as tensorflow tensors, then make a 90/10 train/test split
ds = ds.with_format(type="tensorflow", columns=['input_ids', 'attention_mask', 'labels'])
dict_ds = ds.train_test_split(test_size=0.10, shuffle=True, seed=42)

# one batched tf.data pipeline of (features, labels) per split
tf_ds = {}
for split in ('train', 'test'):
    split_ds = dict_ds[split]
    features = {
        'input_ids': split_ds['input_ids'],
        'attention_mask': split_ds['attention_mask'],
    }
    labeled = tf.data.Dataset.from_tensor_slices((features, split_ds['labels']))
    tf_ds[split] = labeled.batch(BATCH_SIZE)

## Model Prep

In [None]:
# set epochs and learning rate scheduler
EPOCHS = 10

START_LR = 0.00001  # rate at epoch 0, where the linear ramp-up starts
MIN_LR = 0.00001  # floor that the decay phase approaches
MAX_LR = 0.00003 * tpu_strategy.num_replicas_in_sync  # peak rate, scaled by replica count
RAMPUP_EPOCHS = 3  # epochs spent ramping from START_LR to MAX_LR
SUSTAIN_EPOCHS = 0  # epochs held at MAX_LR after the ramp
EXP_DECAY = .8  # per-epoch factor applied to (MAX_LR - MIN_LR) while decaying

def lrfn(epoch):
  """Return the learning rate for ``epoch``: linear ramp-up, optional plateau, exponential decay."""
  # phase 1: linear ramp from START_LR towards MAX_LR
  if epoch < RAMPUP_EPOCHS:
    slope = (MAX_LR - START_LR)/RAMPUP_EPOCHS
    return slope * epoch + START_LR
  # phase 2: hold the peak rate
  if epoch < RAMPUP_EPOCHS + SUSTAIN_EPOCHS:
    return MAX_LR
  # phase 3: decay exponentially towards MIN_LR
  decay_steps = epoch-RAMPUP_EPOCHS-SUSTAIN_EPOCHS
  return (MAX_LR - MIN_LR) * EXP_DECAY**decay_steps + MIN_LR

# define callbacks
# applies lrfn at the start of every epoch and logs the chosen rate
lr_schedule_callback = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=True)
# stops once val_loss fails to improve by at least 0.02 for one epoch and
# restores the best weights seen
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    min_delta=0.02,
    verbose=1,
    patience=1,
    restore_best_weights=True,
)
callbacks = [lr_schedule_callback, early_stopping_callback]

# make model: loaded and compiled inside the TPU strategy scope
with tpu_strategy.scope():
    model = TFGPT2LMHeadModel.from_pretrained(
        'gpt2',
        use_cache=False,
    )
    model.compile(
        # NOTE(review): this fixed rate looks superseded by the LearningRateScheduler
        # callback passed to fit() - confirm
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
        # NOTE(review): using model.compute_loss as the Keras loss depends on the installed
        # transformers version - confirm it is still supported
        loss=model.compute_loss
    )
model.summary()

## Training

In [None]:
# train this boi up: fit on the train split and validate on the test split.
# the callbacks set the learning rate per epoch and may stop training before EPOCHS
hist = model.fit(
    tf_ds['train'],
    validation_data=tf_ds['test'],
    epochs=EPOCHS,
    callbacks=callbacks,
    verbose=1
)

In [None]:
# define a pipeline so we can try it out: text generation with the fine-tuned
# model and the tokenizer it was trained with
definition = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)

In [None]:
# see what it says for a bunch of words!
def generate_words(words, max_length=150, num_return_sequences=1):
    """Generate and display definitions for each word in ``words``.

    Every word is turned into the prompt "define <word>:" and passed to the
    text-generation pipeline. The word is shown in bold, followed by the
    generated texts separated by line breaks.

    max_length and num_return_sequences are forwarded to the pipeline; their
    defaults are the values that used to be hard-coded.
    """
    for word in words:
        # the pipeline returns a list of dicts, one per returned sequence
        results = definition(
            f"define {word}:",
            max_length=max_length,
            num_return_sequences=num_return_sequences
        )
        niceprint(f"<b>{word}</b>")
        niceprint("<br>".join(result['generated_text'] for result in results))

# sample prompts: a mix of ordinary words, names and made-up phrases
generate_words(["beans", "chetan", "koala care", "androgyn", "black fog", "karen", "wungus", "boppin with the boys"])

In [None]:
# probably push it to the hub so we can keep playing with it.
# the access token comes from the HF_TOKEN environment variable, or is prompted for
# when that is unset, so no credential is stored in the notebook
from getpass import getpass
hf_token = os.environ.get('HF_TOKEN') or getpass('Hugging Face access token: ')
model.push_to_hub("cactode/gpt2_urbandict_textgen", use_auth_token=hf_token)