In [1]:
from transformers import AutoTokenizer, AdamWeightDecay, TFAutoModelForCausalLM
import tensorflow as tf
from tensorflow.python.client import device_lib
import utils
import os
import pandas as pd
from datasets import load_dataset
import random




In [2]:
# Load pretrained tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = TFAutoModelForCausalLM.from_pretrained('gpt2')

# Constants
TLDR = ' TL;DR '
MAX_LEN = 512




All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [9]:
datapath = "../data/cleaned_data/"
if not os.path.exists(datapath):
    utils.clean_data()
all_articles_dict = utils.load_article_data(path=datapath)
del all_articles_dict['clean_Articles.csv']
del all_articles_dict['clean_CNN_Articels_clean.csv']
all_articles_df = pd.concat([df for df in all_articles_dict.values()])

In [15]:
# Format data by: article TL;DR headline
num_elements = 1000
all_articles = all_articles_df.values.tolist()
all_articles = [x[1].strip() + " TL;DR " + x[0].strip().replace(' - The New York Times', '') 
                for x in all_articles 
                if isinstance(x[0], str) and isinstance(x[1], str)][0:num_elements]

def pad_and_truncate_data(dataset):
    """
    Format data to always contain the TL;DR and the entire headline. Truncate the article such that
    the whole string becomes MAX_LEN long.
    """
    ARTICLE_LEN = MAX_LEN - len(TLDR)
    result = []
    for d in dataset:
        article, headline = d.split(' TL;DR ')
        result.append(article[0:ARTICLE_LEN - len(headline)] + TLDR + headline)
    return result

all_articles = pad_and_truncate_data(all_articles)

In [16]:
# Write data to files to be loaded into a dataset
random.seed(11)
random.shuffle(all_articles)
TRAIN_SPLIT = 0.9
END_IDX = int(len(all_articles) * TRAIN_SPLIT)
with open("../data/train_data.txt", "w", encoding='utf-8') as txt_file:
    for line in all_articles[0:END_IDX]:
        txt_file.write(line + "\n") # works with any number of elements in a line
with open("../data/test_data.txt", "w", encoding='utf-8') as txt_file:
    for line in all_articles[END_IDX:]:
        txt_file.write(line + "\n") # works with any number of elements in a line

In [17]:
datasets = load_dataset("text", data_files={"train": '../data/train_data.txt', "validation": '../data/test_data.txt'})

Downloading data files: 100%|██████████| 2/2 [00:00<?, ?it/s]
Extracting data files: 100%|██████████| 2/2 [00:00<00:00, 62.27it/s]
Generating train split: 900 examples [00:00, 41874.18 examples/s]
Generating validation split: 100 examples [00:00, 5160.51 examples/s]


In [18]:
print(datasets["train"][10])
print(len(datasets['train']))
print(len(datasets['validation']))

{'text': 'The presidency of Donald J. Trump has been noteworthy for its speed. In his first week in office, as the president’s aides won’t tire of reminding us, Mr. Trump has already put in motion plans to do much of what he promised to do while campaigning. But it’s not just the politician who is moving fast. It’s the population, too. In a matter of hours on Saturday, thousands rushed to the nation’s airports, beckoned by tweets. The f TL;DR The Alt-Majority: How Social Networks Empowered Mass Protests Against Trump'}
900
100


In [19]:
class TokenizerWrapper:
    def __init__(self, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.max_len = max_len
    
    def tokenize_function(self, examples):
        return self.tokenizer(examples["text"],
                              padding='max_length',
                              truncation=True,
                              max_length=self.max_len // 4)

tokenizer_wrapper = TokenizerWrapper(tokenizer, MAX_LEN)

In [20]:
# Tokenize data
tokenized_datasets = datasets.map(
    tokenizer_wrapper.tokenize_function, batched=True, num_proc=4, remove_columns=["text"]
)

Map (num_proc=4): 100%|██████████| 900/900 [00:06<00:00, 132.41 examples/s]
Map (num_proc=4): 100%|██████████| 100/100 [00:04<00:00, 21.31 examples/s]


In [21]:
print(tokenized_datasets["train"][1])
print(tokenizer.decode(tokenized_datasets["train"][1]["input_ids"]))
print(len(tokenizer.decode(tokenized_datasets["train"][1]["input_ids"])))

{'input_ids': [5673, 447, 247, 21358, 5984, 5883, 3955, 11, 2688, 5018, 220, 851, 220, 220, 383, 717, 11903, 286, 6669, 447, 247, 1000, 1215, 388, 320, 11, 257, 289, 6548, 1748, 319, 262, 10183, 30140, 286, 10843, 11, 389, 783, 220, 764, 317, 27316, 3443, 4721, 938, 614, 11, 290, 2319, 5085, 389, 11694, 612, 11, 749, 2636, 286, 3288, 5640, 706, 890, 290, 12309, 3160, 13, 1320, 318, 284, 910, 11, 612, 318, 2147, 8584, 546, 428, 1295, 11, 530, 286, 262, 11706, 18573, 284, 10843, 287, 262, 12030, 2688, 5018, 11, 543, 2692, 12000, 422, 8078, 2026, 812, 2084, 13, 564, 250, 1026, 447, 247, 82, 636, 220, 24811, 26, 7707, 2692, 447, 247, 82, 6912, 12, 14993, 364, 16168, 284, 564, 246, 5247, 4403, 447, 247], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [22]:
# Add labels to tokenized data
def add_labels(examples):
    examples['labels'] = examples['input_ids'].copy()
    return examples

lm_datasets = tokenized_datasets.map(
    add_labels,
    batched=True,
    batch_size=1000,
    num_proc=4,
)


Map (num_proc=4): 100%|██████████| 900/900 [00:02<00:00, 353.64 examples/s]
Map (num_proc=4): 100%|██████████| 100/100 [00:02<00:00, 43.49 examples/s]


In [23]:
# Prepare training and validation datasets
train_set = model.prepare_tf_dataset(
    lm_datasets["train"],
    shuffle=True,
    batch_size=4,
)

validation_set = model.prepare_tf_dataset(
    lm_datasets["validation"],
    shuffle=False,
    batch_size=4,
)

In [24]:
# Compile and train model
optimizer = AdamWeightDecay(lr=2e-5, weight_decay_rate=0.01)
model.compile(optimizer=optimizer)
model.fit(train_set, 
          validation_data=validation_set, 
          epochs=1,  
          verbose=True)

  super().__init__(name, **kwargs)





<keras.src.callbacks.History at 0x1d432702ad0>

In [25]:
# model.save_weights('../trained_models/gpt2-summarization')
model.save_pretrained('../trained_models/gpt2-summarization')

In [10]:
model = TFAutoModelForCausalLM.from_pretrained('../trained_models/gpt2-summarization/')

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at ../trained_models/gpt2-summarization/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [9]:

def summarize_article(article):
    tokenized = tokenizer(article, return_tensors="np")
    outputs = model.generate(**tokenized, max_length=512)
    return tokenizer.decode(outputs[0])

max_input_len = MAX_LEN - len(TLDR)
summaries = []
with open('../data/test_data.txt', encoding='utf-8') as f:
    for i in range(0, 10):
        test_sentence = f.readline().split(TLDR)[0]
        test_sentence = test_sentence[0:max_input_len] + TLDR
        summaries.append(summarize_article(test_sentence))
for summary in summaries:
    print(summary)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Not since Lincoln has there been a president as fundamentally shaped  —   in his life, convictions and outlook on the world  —   by reading and writing as Barack Obama. Last Friday, seven days before his departure from the White House, Mr. Obama sat down in the Oval Office and talked about the indispensable role that books have played during his presidency and throughout his life  —   from his peripatetic and sometimes lonely boyhood, when “thes TL;DR    Obama: A Life of Reading and Writing<|endoftext|>
Fifty years ago right about now, two unassuming young brothers were standing in front of a CBS studio audience taping the first episode of their new variety show. They were also about to unleash the modern concept of television buzz, in a storm the likes of which the medium had not seen. They were the Smothers Brothers, Tom and Dick, and the    story of their show, “The Smothers Brothers Comedy Hour,” is worth recalling 50 years on as a case study that ma TL;DR  TL;DR Smothers Brothers 