In [91]:
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel, AdamWeightDecay, GPT2Config
from torch.utils.data import Dataset, DataLoader
import tensorflow as tf
from tensorflow.python.client import device_lib
import utils
import os
import pandas as pd
import numpy as np

In [93]:
print(device_lib.list_local_devices())
print(tf.config.list_physical_devices())

model = TFGPT2LMHeadModel.from_pretrained('gpt2', config=GPT2Config())
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 5553669894701128109
xla_global_id: -1
]
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]


All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [76]:
datapath = "../data/cleaned_data/"
if not os.path.exists(datapath):
    utils.clean_data()
all_articles_dict = utils.load_article_data(path=datapath)
all_articles = pd.concat([df for df in all_articles_dict.values()])

In [86]:
#I believe we need to use the TFGPT2LMHeadModel for this after some research
MAX_LEN = 128

class GPT2ArticleDataset(Dataset):
    def __init__(self, tokenizer, articles, headlines, ml):
        self.max_len = ml
        self.tokenizer = tokenizer
        self.eos = self.tokenizer.eos_token
        self.eos_id = self.tokenizer.eos_token_id
        self.articles = articles
        self.headlines = headlines
        self.tokenized_articles = []
        self.tokenized_headlines = []

        self.tokenizer.pad_token = self.eos
        for article, headline in zip(self.articles, self.headlines):
            tokenized_article = self.tokenizer.encode(article + self.eos, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors='tf')
            tokenized_headline = self.tokenizer.encode(headline + self.eos, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors='tf')
            self.tokenized_articles.append(tokenized_article)
            self.tokenized_headlines.append(tokenized_headline)
    
    def __len__(self):
        return len(self.tokenized_articles)
    
    def __getitem__(self, item):
        return self.tokenized_articles[item]

dataset = GPT2ArticleDataset(tokenizer, all_articles['article'][0:1000], all_articles['headline'][0:1000], MAX_LEN)

In [94]:
tf_dataset = tf.data.Dataset.from_tensor_slices((dataset.tokenized_articles, dataset.tokenized_headlines))
tf_dataset = tf_dataset.batch(4).shuffle(buffer_size=100)
optimizer = AdamWeightDecay(learning_rate=5e-5, weight_decay_rate=0.01, clipnorm=1.0)
num_epochs = 3


# Training loop
for epoch in range(num_epochs):
    total_loss = 0.0

    for idx, batch in enumerate(tf_dataset):
        input_ids = batch[0]
        labels = batch[1]

        with tf.GradientTape() as tape:
            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs[0]
            print(loss)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        total_loss += loss
        if idx % 10 == 0:
            print(f'{idx}: avg loss = {total_loss / idx}')

    average_loss = total_loss / len(tf_dataset)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {average_loss}")

tf.Tensor([nan], shape=(1,), dtype=float32)
0: avg loss = [nan]


KeyboardInterrupt: 