In [2]:
# required import statements
from transformers import AutoTokenizer, AdamWeightDecay, TFAutoModelForCausalLM
import tensorflow as tf
from tensorflow.python.client import device_lib
import utils
import os
import pandas as pd
from datasets import load_dataset
import random

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Report the TensorFlow version and the GPUs it can see, then remember
# whether at least one GPU is available for training.
print(tf.__version__)
visible_gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(visible_gpus))
print(f'GPUs: {visible_gpus}')
# Keep per-operation device placement logging switched off.
tf.debugging.set_log_device_placement(False)
GPU_ENABLED = len(visible_gpus) > 0

2.10.0
Num GPUs Available:  1
GPUs: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [4]:
# Constants shared by the cells below
DATA_PATH = "../data/cleaned_data/"  # directory the cleaned article data is loaded from
SAVE_MODEL_PATH = '../trained_models/gpt2-summarization-gpu'  # where the trained model is saved
TLDR = ' TL;DR '       # separator placed between the article text and its headline
MAX_LEN = 512          # character budget of one 'article TL;DR headline' string
NUM_ELEMENTS = 50000   # upper bound on the number of cleaned datapoints that are kept
BATCHES = 2            # batch size handed to prepare_tf_dataset for both splits

# Pretrained GPT-2 tokenizer and TensorFlow causal language model
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = TFAutoModelForCausalLM.from_pretrained('gpt2')

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [5]:
# Create the cleaned data first when its directory does not exist yet, then load it
if not os.path.exists(DATA_PATH):
    utils.clean_data()
all_articles_dict = utils.load_article_data(path=DATA_PATH)

# Leave these two entries out of the combined data (KeyError if one is absent)
for excluded_key in ('clean_Articles.csv', 'clean_CNN_Articels_clean.csv'):
    del all_articles_dict[excluded_key]

# Concatenate everything that is left into a single DataFrame
all_articles_df = pd.concat(list(all_articles_dict.values()))

In [6]:
def strip_nonalnum(word):
    """
    Strip non-alphanumeric characters from the start and end of a string.

    Characters between the first and the last alphanumeric character are kept
    untouched, whatever they are.

    Args:
        word: The string to strip. Falsy values (None, '') are returned as-is.

    Returns:
        The stripped string, or '' when the string contains no alphanumeric
        character at all.
    """
    if not word:
        return word  # nothing to strip

    # Index of the first alphanumeric character, or None when there is none.
    first = next((idx for idx, ch in enumerate(word) if ch.isalnum()), None)
    if first is None:
        # Without this guard a single-character input such as "-" would come
        # back unchanged, because both scans would finish at index 0.
        return ''

    # Number of non-alphanumeric characters at the end of the string. An
    # alphanumeric character is known to exist here, so next() always finds one.
    trailing = next(idx for idx, ch in enumerate(reversed(word)) if ch.isalnum())
    return word[first:len(word) - trailing]

def clean_datapoint(datapoint):
    """
    Build one string of the form 'article TL;DR headline' from a row of the cleaned data.

    datapoint[1] is used as the article and datapoint[0] as the headline. Both are
    stripped of leading and trailing non-alphanumeric characters; afterwards every
    occurrence of ' - The New York Times' and ' - Breitbart' is removed from the
    headline.
    """
    article = strip_nonalnum(datapoint[1])
    headline = strip_nonalnum(datapoint[0])
    for newspaper_suffix in (' - The New York Times', ' - Breitbart'):
        headline = headline.replace(newspaper_suffix, '')
    return article + ' TL;DR ' + headline

def pad_and_truncate_data(dataset, max_len=None, separator=None):
    """
    Truncate the article part of every 'article<separator>headline' string so that
    the whole string fits into max_len characters.

    The separator and the complete headline are always kept; only the article is
    shortened. When the headline alone already uses up the budget, the article is
    dropped entirely and the result is longer than max_len.

    Args:
        dataset: Iterable of strings of the form 'article<separator>headline'.
        max_len: Character budget of one resulting string. Defaults to the
            module-level MAX_LEN.
        separator: Text between article and headline. Defaults to the
            module-level TLDR.

    Returns:
        A list with the truncated strings. Entries that are not strings, or that
        do not contain the separator exactly once, are skipped.
    """
    if max_len is None:
        max_len = MAX_LEN
    if separator is None:
        separator = TLDR

    article_budget = max_len - len(separator)
    result = []
    for datapoint in dataset:
        # Malformed entries are skipped explicitly rather than through a bare
        # except, so genuine programming errors are no longer hidden.
        if not isinstance(datapoint, str):
            continue
        parts = datapoint.split(separator)
        if len(parts) != 2:
            continue
        article, headline = parts
        # Clamp at zero: a negative slice end would count from the end of the
        # article and leave it almost untruncated.
        keep = max(0, article_budget - len(headline))
        result.append(article[:keep] + separator + headline)
    return result


# Format every row whose first two columns are both strings as 'article TL;DR headline',
# keep at most NUM_ELEMENTS of the formatted rows and trim each one to the length budget
all_articles = pad_and_truncate_data(
    [
        clean_datapoint(row)
        for row in all_articles_df.values.tolist()
        if isinstance(row[0], str) and isinstance(row[1], str)
    ][:NUM_ELEMENTS]
)
print(f'Example: {all_articles[0]}')

Example: WASHINGTON  —   Congressional Republicans have a new fear when it comes to their    health care lawsuit against the Obama administration: They might win. The incoming Trump administration could choose to no longer defend the executive branch against the suit, which challenges the administration’s authority to spend billions of dollars on health insurance subsidies for   and   Americans, handing House Republicans a big victory on    issues. Bu TL;DR House Republicans Fret About Winning Their Health Care Suit


In [7]:
# Shuffle with a fixed seed, then split the data into a train and a test part
random.seed(11)
random.shuffle(all_articles)
TRAIN_SPLIT = 0.9
END_IDX = int(len(all_articles) * TRAIN_SPLIT)

# Write each part to its own text file, one datapoint per line
for split_path, split_lines in (
    ("../data/train_data.txt", all_articles[:END_IDX]),
    ("../data/test_data.txt", all_articles[END_IDX:]),
):
    with open(split_path, "w", encoding='utf-8') as txt_file:
        txt_file.writelines(line + "\n" for line in split_lines)

In [8]:
# Read the two text files back in as the "train" and "validation" splits
split_files = {
    "train": '../data/train_data.txt',
    "validation": '../data/test_data.txt',
}
datasets = load_dataset("text", data_files=split_files)

Downloading data files: 100%|██████████| 2/2 [00:00<?, ?it/s]
Extracting data files: 100%|██████████| 2/2 [00:00<00:00, 94.12it/s]
Generating train split: 44999 examples [00:00, 104860.98 examples/s]
Generating validation split: 5000 examples [00:00, 134095.86 examples/s]


In [9]:
# Sanity check: show one training example, then the number of rows in each split
print(datasets["train"][10])
for split_name in ('train', 'validation'):
    print(len(datasets[split_name]))

{'text': 'Two cultures collided in Canada when a group of recently arrived Syrian refugees were settled in a Vancouver hotel where a yearly furry convention was taking place. [The fifth annual VancouFur convention was taking place at the Executive Airport Plaza hotel when numerous attendees noticed a bus full of migrants pull up outside and start to pour through the entrance.  “They saw people in giant animal costumes. To the children they were just cartoons fr TL;DR Migrants Settled in Hotel Hosting Furry Convention'}
44999
5000


In [10]:
class TokenizerWrapper:
    """
    Wraps a tokenizer and provides a tokenize_function for batches of inputs.

    Every example is padded / truncated to a fixed number of tokens, derived from
    a character budget as max_len // chars_per_token.

    Note that the constructor sets pad_token to eos_token on the tokenizer object
    it receives, so the caller's tokenizer is modified as well.

    Args:
        tokenizer: Callable tokenizer exposing eos_token and pad_token attributes.
        max_len: Character budget of one input string.
        chars_per_token: Positive integer used to convert the character budget
            into a token budget. Defaults to 4, the value that used to be
            hard-coded.

    Raises:
        ValueError: If chars_per_token is smaller than 1.
    """
    def __init__(self, tokenizer, max_len, chars_per_token=4):
        if chars_per_token < 1:
            raise ValueError(f"chars_per_token must be >= 1, got {chars_per_token!r}")
        self.tokenizer = tokenizer
        # Reuse the end-of-sequence token for padding.
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.max_len = max_len
        self.chars_per_token = chars_per_token

    def tokenize_function(self, examples):
        """
        Tokenize examples["text"], padding and truncating every entry to
        max_len // chars_per_token tokens.

        Returns whatever the wrapped tokenizer returns.
        """
        # NOTE(review): with truncation enabled, over-long examples lose tokens on
        # the tokenizer's truncation side; if that is the right-hand side, the
        # headline after the TL;DR marker is what gets cut - confirm.
        return self.tokenizer(examples["text"],
                              padding='max_length',
                              truncation=True,
                              max_length=self.max_len // self.chars_per_token)

# Wrapper around the GPT-2 tokenizer, configured with the MAX_LEN budget
tokenizer_wrapper = TokenizerWrapper(tokenizer=tokenizer, max_len=MAX_LEN)

In [11]:
# Tokenize both splits batch-wise in 4 worker processes and drop the raw "text" column
tokenized_datasets = datasets.map(
    tokenizer_wrapper.tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
)

Map (num_proc=4):   0%|          | 0/44999 [00:00<?, ? examples/s]

Map (num_proc=4): 100%|██████████| 44999/44999 [00:08<00:00, 5554.14 examples/s] 
Map (num_proc=4): 100%|██████████| 5000/5000 [00:03<00:00, 1308.54 examples/s]


In [12]:
# Show one tokenized training example, its decoded text and the length of that text
tokenized_example = tokenized_datasets["train"][1]
decoded_example = tokenizer.decode(tokenized_example["input_ids"])
print(tokenized_example)
print(decoded_example)
print(len(decoded_example))

{'input_ids': [38328, 360, 1536, 265, 286, 262, 968, 1971, 3782, 468, 6848, 326, 4390, 3615, 10429, 5073, 2605, 3804, 510, 262, 3663, 284, 3151, 503, 284, 564, 250, 2, 12295, 6170, 447, 251, 12471, 287, 607, 9831, 4046, 11, 6011, 606, 564, 250, 22366, 447, 251, 284, 920, 501, 606, 284, 5078, 13, 685, 35, 1536, 5562, 6797, 287, 465, 3502, 5721, 357, 14986, 12476, 2599, 220, 3363, 11, 262, 9831, 447, 247, 82, 905, 25428, 373, 47305, 555, 4035, 11, 20953, 11, 10768, 220, 851, 220, 220, 475, 5073, 2605, 447, 247, 82, 5928, 8666, 318, 407, 13, 1375, 447, 247, 82, 1057, 35073, 24811, 26, 7707, 360, 1536, 265, 25, 5073, 3242, 1068, 10528, 284, 1303, 12295, 6170, 17897, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [13]:
# Add labels to tokenized data
def add_labels(examples):
    """
    Add a 'labels' entry to a tokenized batch that is a copy of its 'input_ids'.

    The batch is modified in place and returned.
    """
    label_ids = examples['input_ids'].copy()
    examples['labels'] = label_ids
    return examples

# Run add_labels over both splits, 1000 examples per batch, in 4 worker processes
lm_datasets = tokenized_datasets.map(add_labels, batched=True, batch_size=1000, num_proc=4)

Map (num_proc=4):   0%|          | 0/44999 [00:00<?, ? examples/s]

Map (num_proc=4): 100%|██████████| 44999/44999 [00:03<00:00, 11703.44 examples/s]
Map (num_proc=4): 100%|██████████| 5000/5000 [00:02<00:00, 1827.93 examples/s]


In [14]:
# Build the training and validation datasets from the labelled splits.
# Both use the same batch size; only the training data is shuffled.
train_set = model.prepare_tf_dataset(lm_datasets["train"], batch_size=BATCHES, shuffle=True)
validation_set = model.prepare_tf_dataset(lm_datasets["validation"], batch_size=BATCHES, shuffle=False)

In [15]:
# Compile and train model
def _compile_and_fit():
    """
    Compile the model with a new AdamWeightDecay optimizer and train it for
    2 epochs on train_set, validating on validation_set.

    Returns:
        The optimizer instance the model was compiled with.
    """
    adamw = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
    # No loss is passed to compile(); presumably the model then falls back to
    # its own built-in loss - confirm against the transformers documentation.
    model.compile(optimizer=adamw)
    model.fit(
        train_set,
        validation_data=validation_set,
        epochs=2,
        verbose=True)
    return adamw


# The training code is the same on both paths; only the device scope differs,
# so it lives in one helper and the hyperparameters cannot drift apart.
if GPU_ENABLED:
    with tf.device('/GPU:0'):
        optimizer = _compile_and_fit()
else:
    optimizer = _compile_and_fit()

# Save trained model
print(f"Saving model at {SAVE_MODEL_PATH}")
model.save_pretrained(SAVE_MODEL_PATH)

Epoch 1/2
Epoch 2/2
Saving model at ../trained_models/gpt2-summarization-gpu


In [16]:
# Store the trained model under a second directory, then replace the in-memory
# model with the one loaded back from that directory.
# NOTE(review): this directory is not SAVE_MODEL_PATH, which the training cell
# already saved to - confirm that both copies are intended.
RELOAD_MODEL_DIR = '../trained_models/gpt2-summarization'
model.save_pretrained(RELOAD_MODEL_DIR)
model = TFAutoModelForCausalLM.from_pretrained(RELOAD_MODEL_DIR + '/')

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at ../trained_models/gpt2-summarization/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
