In [144]:
import spacy
import en_core_web_sm
import nltk
from nltk.corpus import wordnet as wn
import numpy as np
import tensorflow as tf
import datasets
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import create_optimizer

In [82]:
nlp = en_core_web_sm.load()
# nltk.download('wordnet')

## Move Wordnet Synsets to Dictionary

In [229]:
wordnet_nouns = wn.all_eng_synsets('n')
wordnet_verbs = wn.all_eng_synsets('v')
wordnet_adjs = wn.all_eng_synsets('a')
wordnet_advs = wn.all_eng_synsets('r')

In [230]:
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, truncate=True, use_fast=False)
batch_size = 16

max_input_length = 200
max_target_length = 200

# def preprocess_function(examples):
#     model_inputs = tokenizer(
#         examples["definition"],
#         max_length=max_input_length,
#         truncation=True,
#         padding='longest'
#     )
#     labels = tokenizer(
#         examples["name"],
#         max_length=max_target_length,
#         truncation=True,
#         padding='longest'
#     )
#     model_inputs["labels"] = tf.constant(labels["input_ids"])
#     return model_inputs

def preprocess_function(examples):
    return tokenizer(examples['definition'], truncation=True)

In [231]:
def create_dataset(wordnet_data):
    dictionary = {
        'name': [],
        'definition': [],
    }

    for s in wordnet_data:
        dictionary['name'].append(s.name())
        dictionary['definition'].append(s.definition())

    dataset = datasets.Dataset.from_dict(dictionary)
    return dataset

nouns = create_dataset(wordnet_nouns) \
            .train_test_split(test_size=.2)
encoded_nouns = nouns.map(preprocess_function, batched=True) \
            # .select_columns(['input_ids', 'attention_mask', 'labels']) \
# nouns = nouns.remove_columns(['name', 'definition'])
# verbs = create_dataset(wordnet_verbs) \
#             .train_test_split(test_size=.2) \
#             .map(preprocess_function, batched=True) \
#             .select_columns(['input_ids', 'attention_mask', 'labels'])
# adjs = create_dataset(wordnet_adjs) \
#             .train_test_split(test_size=.2) \
#             .map(preprocess_function, batched=True) \
#             .select_columns(['input_ids', 'attention_mask', 'labels'])
# advs = create_dataset(wordnet_advs) \
#             .train_test_split(test_size=.2) \
#             .map(preprocess_function, batched=True) \
#             .select_columns(['input_ids', 'attention_mask', 'labels'])

Map: 100%|██████████| 65692/65692 [00:15<00:00, 4253.15 examples/s]
Map: 100%|██████████| 16423/16423 [00:03<00:00, 4241.42 examples/s]


In [233]:
data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")

In [234]:
noun_model = TFAutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=16000)

# tf_train_dataset = nouns['train'].to_tf_dataset(
#     columns=['attention_mask', 'input_ids', 'label'],
#     shuffle=True,
#     batch_size=8,
#     collate_fn=data_collator
# )

# tf_validation_dataset = nouns['train'].to_tf_dataset(
#     columns=['attention_mask', 'input_ids', 'label'],
#     shuffle=False,
#     batch_size=8,
#     collate_fn=data_collator
# )

tf_train_dataset = noun_model.prepare_tf_dataset(
    encoded_nouns['train'],
    shuffle=True,
    batch_size=16,
    tokenizer=tokenizer
)

tf_validation_dataset = noun_model.prepare_tf_dataset(
    encoded_nouns['test'],
    shuffle=False,
    batch_size=16,
    tokenizer=tokenizer
)

batch_size = 8
num_epochs = 5
batches_per_epoch = len(nouns['train']) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

noun_model.compile(optimizer=optimizer)
noun_model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3)
# noun_model.save_pretrained("models/nouns-model")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Epoch 1/3


TypeError: in user code:

    File "/Users/adamlear/Desktop/portfolio/projects/word-guesser/.env/lib/python3.10/site-packages/keras/src/engine/training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "/Users/adamlear/Desktop/portfolio/projects/word-guesser/.env/lib/python3.10/site-packages/keras/src/engine/training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/adamlear/Desktop/portfolio/projects/word-guesser/.env/lib/python3.10/site-packages/keras/src/engine/training.py", line 1373, in run_step  **
        outputs = model.train_step(data)
    File "/Users/adamlear/Desktop/portfolio/projects/word-guesser/.env/lib/python3.10/site-packages/transformers/modeling_tf_utils.py", line 1678, in train_step
        self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
    File "/Users/adamlear/Desktop/portfolio/projects/word-guesser/.env/lib/python3.10/site-packages/keras/src/optimizers/optimizer.py", line 543, in minimize
        grads_and_vars = self.compute_gradients(loss, var_list, tape)
    File "/Users/adamlear/Desktop/portfolio/projects/word-guesser/.env/lib/python3.10/site-packages/keras/src/optimizers/optimizer.py", line 276, in compute_gradients
        grads = tape.gradient(loss, var_list)

    TypeError: Argument `target` should be a list or nested structure of Tensors, Variables or CompositeTensors to be differentiated, but received None.


## Tokenize Word Definitions

## Move Wordnet Synsets to TensorFlow Data