In [36]:
import os
import pandas as pd
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer,GPT2LMHeadModel,AdamW, get_linear_schedule_with_warmup
from datetime import datetime
from tqdm import tqdm

In [37]:
if torch.cuda.is_available(): 
 dev = "cuda:0" 
else: 
 dev = "cpu" 
device = torch.device(dev) 

In [38]:
TRAIN_SPLIT = 0.7
VAL_SPLIT = 0.15
TEST_SPLIT = 0.15

MAX_LENGTH = 100
BATCH_SIZE = 8
EPOCHS = 5
LR = 2e-5
WARMUP_STEPS = 200

RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7fdeb589f850>

In [39]:
BOS = '<|endoftext|>'
PAD = '<|pad|>'
SEP = '<|sep|>'

special_tokens_dict = {'bos_token': BOS, 'pad_token': PAD, 'sep_token': SEP}

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
num_add_toks = tokenizer.add_special_tokens(special_tokens_dict)

ignore_idx = tokenizer.pad_token_id


In [40]:
# Load data frame
df = pd.read_json('data/data.jsonl')

print(f"{len(df)} articles loaded.")
df.head()

# Clean data
df = df[~df["url"].str.contains("thanhnien")]
df = df[df["title"].str.split().apply(lambda x: len(x)) >= 8]

# Remove duplicates
df.drop_duplicates(subset='title', inplace=True, keep='first')

df = df.reset_index()
print(f"{len(df)} articles after filtering.")

df = df[['title', 'description']]


10000 articles loaded.
8099 articles after filtering.


In [41]:
texts = []
sep_pos = []	# Position of SEP between description and title
drop = []
for i, row in df.iterrows():
	desc_tokens = tokenizer.tokenize(row['description'])
	title_tokens = tokenizer.tokenize(row['title'])
	if len(title_tokens) > 50:
		drop.append(i)
		continue
	if len(desc_tokens) + len(title_tokens) > MAX_LENGTH-3:
		desc_len = MAX_LENGTH -3  - len(title_tokens)
		desc_tokens = desc_tokens[:desc_len]
	BOS_id, SEP_id, PAD_id = tokenizer.convert_tokens_to_ids([BOS, SEP, PAD])
	title_token_ids = tokenizer.convert_tokens_to_ids(title_tokens)
	desc_token_ids = tokenizer.convert_tokens_to_ids(desc_tokens)
    
	token_ids = [BOS_id] + desc_token_ids + [SEP_id] + title_token_ids + [BOS_id]
	token_ids.extend([PAD_id]* (MAX_LENGTH-len(token_ids)))
	texts.append(token_ids)
	sep_pos.append(len(desc_token_ids) + 1)
	

for i in drop:
	df.drop(i, inplace=True)
df = df.reset_index()

df['token_ids'] = texts
df['sep_pos'] = sep_pos

In [42]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, train_size=int(TRAIN_SPLIT*len(df)), random_state=RANDOM_SEED)
df_test, df_val = train_test_split(df_test, train_size=int(TEST_SPLIT*len(df)), random_state=RANDOM_SEED)

In [43]:
if len(df_train) % BATCH_SIZE:
	df_train = df_train[:-(len(df_train)% BATCH_SIZE)]
if len(df_val) % BATCH_SIZE:
	df_val = df_val[:-(len(df_val)% BATCH_SIZE)]
if len(df_test) % BATCH_SIZE:
	df_test = df_test[:-(len(df_test)% BATCH_SIZE)]


In [44]:
class NewsDataset(Dataset):

    def __init__(self, df):
        self.token_ids = list(df.token_ids)
        self.sep_pos = list(df.sep_pos)

    def __len__(self):
        return len(self.token_ids)

    def __getitem__(self,idx):  
        token_ids = torch.tensor(self.token_ids[idx])
        sample = {'token_ids': token_ids, 'sep_pos': self.sep_pos[idx]}
        return sample

In [45]:
train_dataset = NewsDataset(df_train)
val_dataset = NewsDataset(df_val)
test_dataset = NewsDataset(df_test)
print(len(train_dataset), len(val_dataset), len(test_dataset))

5648 1208 1208


In [46]:
from torch.utils.tensorboard import SummaryWriter


In [47]:
def train(model, train_dataset, valid_dataset, ignore_index, gradient_accumulation_steps = 32, max_grad_norm=1):
    """ Trains GPT2 model and logs necessary details.
        Args:
            args: dict that contains all the necessary information passed by user while training
            model: finetuned gpt/gpt2 model
            tokenizer: GPT/GPT2 tokenizer
            train_dataset: GPT21024Dataset object for training data
            ignore_index: token not considered in loss calculation
    """
    print("Device:", device)
    writer = SummaryWriter('./output/logs')
    train_dataloader = DataLoader(train_dataset,batch_size=BATCH_SIZE)#,num_workers=args.num_workers)
    loss_fct = CrossEntropyLoss(ignore_index=ignore_index) #ignores padding token for loss calculation
    optimizer = AdamW(model.parameters(),lr=LR)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=-1
    )

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    for epoch in range(EPOCHS):
        print(f"Epoch {epoch}")
        for step, batch in enumerate(tqdm(train_dataloader)):
            inputs, labels = batch['token_ids'], batch['token_ids']

            inputs = inputs.to(device)
            labels = labels.to(device)
            model.train()
            logits = model(inputs)[0]
            idx = batch['sep_pos']#.item() # index of separator token

            losses = []
            for i in range(len(idx)):
                shift_logits = logits[i, idx[i]:-1, :].contiguous()
                shift_labels = labels[i, idx[i]+1:].contiguous()
                loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
                losses.append(loss)

            # Combine the losses
            loss = torch.stack(losses).mean()


            loss = loss/gradient_accumulation_steps
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            tr_loss += loss.item()

            if (step + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
                writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                writer.add_scalar('loss', (tr_loss - logging_loss)/gradient_accumulation_steps, global_step)
                logging_loss = tr_loss
                
            if (step + 1) % (10*gradient_accumulation_steps) == 0:
                results = evaluate(model, valid_dataset, ignore_index, global_step)
                for key, value in results.items():
                    writer.add_scalar('eval_{}'.format(key), value, global_step)

        # torch.save(
        #     model.state_dict(),
        #     os.path.join("./output/models", f"model-{epoch}.pt"),
        # )
    return model

In [48]:
def evaluate(model, eval_dataset, ignore_index, global_step=None):
    """ Returns perplexity score on validation dataset.
        global_step: no. of times gradients have backpropagated
        ignore_index: token not considered in loss calculation
    """
    eval_dataloader = DataLoader(eval_dataset, batch_size=BATCH_SIZE)
    loss_fct = CrossEntropyLoss(ignore_index=ignore_index) #ignores padding token for loss calculation

    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in eval_dataloader:
        inputs, labels = batch['token_ids'].to(device), batch['token_ids'].to(device)
        
        with torch.no_grad():
            logits = model(inputs)[0]
            idx = batch['sep_pos']#.item() # index of separator token

            losses = []
            for i in range(len(idx)):
                shift_logits = logits[i, idx[i]:-1, :].contiguous()
                shift_labels = labels[i, idx[i]+1:].contiguous()
                loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
                losses.append(loss)

            # Combine the losses
            eval_loss += torch.stack(losses).mean()

        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    return {"perplexity": perplexity}           


In [49]:
# Load model
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50259, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50259, bias=False)
)

In [50]:
model = train(model, train_dataset, val_dataset, ignore_idx)




Device: cuda:0
Epoch 0


 30%|██▉       | 209/706 [00:26<01:03,  7.87it/s]


KeyboardInterrupt: 

In [None]:
torch.save(
    model.state_dict(),
    os.path.join("./output/models", "model.pt"),
)


In [None]:
import numpy as np

seq_lengths = []
for i, row in df.iterrows():
	tokens = tokenizer.tokenize(row['description'] + ' ' + row['title'])
	seq_lengths.append(len(tokens))

print(f'Min: {min(seq_lengths)}, Max: {max(seq_lengths)}')
print(f'Median:\t{np.median(seq_lengths)}\nMean:\t{np.mean(seq_lengths)}\nStddev:\t{np.std(seq_lengths)}')


Min: 14, Max: 260
Median:	98.0
Mean:	92.53046816943275
Stddev:	46.36685974956369


In [85]:
import torch.nn.functional as F

def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
        Args:
            logits: logits distribution shape (vocabulary size)
            top_k > 0: keep only top k tokens with highest probability (top-k filtering).
            top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
        From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
    """
    assert logits.dim() == 1  # batch size 1 for now - could be updated for more but the code would be less clear
    top_k = min(top_k, logits.size(-1))  # Safety check
    if top_k > 0:
        # Remove all tokens with a probability less than the last token of the top-k
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value

    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        # Remove tokens with cumulative probability above the threshold
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift the indices to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
    return logits


def sample_seq(model, context, length, device, temperature=1, top_k=0, top_p=0.0):
    """ Generates a sequence of tokens 
        Args:
            model: gpt/gpt2 model
            context: tokenized text using gpt/gpt2 tokenizer
            length: length of generated sequence.
            device: torch.device object.
            temperature >0: used to control the randomness of predictions by scaling the logits before applying softmax.
            top_k > 0: keep only top k tokens with highest probability (top-k filtering).
            top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
    """
    
    context = torch.tensor(context, dtype=torch.long, device=device)
    context = context.unsqueeze(0)
    generated = context
    with torch.no_grad():  
        for _ in range(length):
            inputs = {'input_ids': generated}
            outputs = model(**inputs)  # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet (cached hidden-states)
            next_token_logits = outputs[0][0, -1, :] / temperature
            filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
            next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
            generated = torch.cat((generated, next_token.unsqueeze(0)), dim=1)
    return generated


def beam_search(model, context, length, beam_size, device, temperature=1):
    """ Generate sequence using beam search https://machinelearningmastery.com/beam-search-decoder-natural-language-processing/
        Args:
            model: gpt/gpt2 model
            context: tokenized text using gpt/gpt2 tokenizer
            length: length of generated sequence.
            beam_size: >=1 and <= total_no_of_tokens
            device: torch.device object.
            temperature >0: used to control the randomness of predictions by scaling the logits before applying softmax.
    """
    context = torch.tensor(context, dtype=torch.long, device=device)
    context = context.unsqueeze(0)
    with torch.no_grad():  
        inputs = {'input_ids': context}
        outputs = model(**inputs) 
        next_token_logits = outputs[0][0, -1, :] / temperature
        next_token_probs = F.softmax(next_token_logits)
        scores, indices = torch.topk(next_token_probs, beam_size)
        indices = indices.tolist()
        sequences = [[c] for c in indices]
        for _ in tnrange(length-1):
            logits = torch.zeros(beam_size*len(next_token_logits))
            for j in range(len(sequences)):
                new_generated = torch.cat((context,torch.tensor([sequences[j]], dtype=torch.long, device=device)),dim=1)
                inputs = {'input_ids': new_generated}
                outputs = model(**inputs) 
                next_token_logits = outputs[0][0, -1, :] / temperature
                next_token_probs = F.softmax(next_token_logits)
                start, stop = j*len(next_token_logits), (j+1)*len(next_token_logits)
                logits[start:stop] = scores[j]*next_token_probs
            scores, new_logits_indices = torch.topk(logits,beam_size)
            logits = (new_logits_indices%50259).tolist()
            for j in range(len(sequences)):
                sequences[j] = sequences[j]+[logits[j]]
    return scores, sequences


def generate_beam_sample(data, tokenizer, model, num=1, length=100, beam_size=3, device=torch.device('cuda')):
    """ Generate summaries for "num" number of articles using beam search.
        Args:
            data = GPT21024Dataset object
            tokenizer = gpt/gpt2 tokenizer
            num = number of articles for which summaries has to be generated
    """
    for i in range(num):
        sample = data[i]
        idx = sample['sep_pos']
        context = sample['article'][:idx].tolist()
        summary = sample['article'][idx+1:][:100].tolist()
        scores, sequences = beam_search(model, context, length, beam_size, device)
        print('new_article', end='\n\n')
        print(tokenizer.decode(context[:-1]), end='\n\n')
        print('actual_summary', end='\n\n')
        print(tokenizer.decode(summary), end='\n\n')
        for i in range(len(sequences)):
            text = tokenizer.convert_ids_to_tokens(sequences[i],skip_special_tokens=True)
            text = tokenizer.convert_tokens_to_string(text)  
            print("generated_summary-{} and Score is {}.".format(i+1, scores[i]), end='\n\n')
            print(text, end='\n\n')


def oldgenerate_sample(data, tokenizer, model, num=1, eval_step=False, length=20, temperature=1, top_k=10, top_p=0.5, device=torch.device('cuda')):
    """ Generate summaries for "num" number of articles.
        Args:
            data = GPT21024Dataset object
            tokenizer = gpt/gpt2 tokenizer
            model = gpt/gpt2 model
            num = number of articles for which summaries has to be generated
            eval_step = can be True/False, checks generating during evaluation or not
    """
    for i in range(num):
        sample = data[i]
        idx = sample['sep_pos']
        context = sample['token_ids'][:idx].tolist()
        summary = sample['token_ids'][idx+1:][:100].tolist()
        generated_text = sample_seq(model, context, length, device, temperature, top_k, top_p)
        generated_text = generated_text[0, len(context):].tolist()
        text = tokenizer.convert_ids_to_tokens(generated_text,skip_special_tokens=True)
        text = tokenizer.convert_tokens_to_string(text)
        if eval_step==False:
            print('new_article', end='\n\n')
            print(tokenizer.decode(context), end='\n\n')
            print("generated_summary", end='\n\n')
            print(text, end='\n\n')
            print('actual_summary', end='\n\n')
            print(tokenizer.decode(summary), end='\n\n')
        else:
            print(tokenizer.decode(context), end='\n\n')
            print("generated_summary", end='\n\n')

In [105]:
def generate_sample(data, tokenizer, model, num=1, length=20, temperature=1, top_k=10, top_p=0.5, device=torch.device('cuda')):
    for i in range(num):
        print("*"*50)
        sample = data[i]
        idx = sample['sep_pos']
        description = sample['token_ids'][:idx].tolist()
        title = sample['token_ids'][idx+1:][:100].tolist()
        generated_tokens = sample_seq(model, description, length, device, temperature, top_k, top_p)
        generated_tokens = generated_tokens[0, len(description):].tolist()
        generated_title = tokenizer.convert_ids_to_tokens(generated_tokens)#,skip_special_tokens=True)
        generated_title = tokenizer.convert_tokens_to_string(generated_title)

        print('Description:\n', tokenizer.decode(description, skip_special_tokens=True))
        print("\nGENERATED title:\n", generated_title)
        print('TRUE title:\n', tokenizer.decode(title,skip_special_tokens=True),"   \n")


In [90]:
# new_model = torch.load('./output/models/model.pt')
new_model = GPT2LMHeadModel.from_pretrained('gpt2')
new_model.resize_token_embeddings(len(tokenizer))

new_model.load_state_dict(torch.load('./output/models/model.pt'))
new_model.eval()

new_model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50259, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50259, bias=False)
)

In [98]:
generate_sample(test_dataset, tokenizer, new_model, num=5, device=device)


**************************************************
Description:
 Dawson Geophysical (NASDAQ:DWSN &#8211; Get Rating) crossed above its 200-day moving average during trading on Tuesday. The stock has a 200-day moving average of $1.73 and traded as high as $1.73. Dawson Geophysical shares last traded at $1.65, with a volume of

GENERATED title:
  $1.73. Dawson Geophysical shares last traded at $1.65, with a volume
TRUE title:
 Dawson Geophysical (NASDAQ:DWSN) Shares Cross Above 200-Day Moving Average of $1.73    

**************************************************
Description:
 The Energy Department now joins the Federal Bureau of Investigation in saying the virus likely spread via a mishap at a Chinese laboratory

GENERATED title:
 The Energy Department now joins the FederalThe Energy Department now joins the Federal Bureau of Investigation in
TRUE title:
 FBI chief confirms Covid-19 originated from lab incident in Wuhan    

**************************************************
Descriptio

In [102]:
orignal_model = GPT2LMHeadModel.from_pretrained('gpt2')
orignal_model.resize_token_embeddings(len(tokenizer))
orignal_model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50259, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50259, bias=False)
)

In [106]:
generate_sample(test_dataset, tokenizer, orignal_model, num=5, device=device)


**************************************************
Description:
 Dawson Geophysical (NASDAQ:DWSN &#8211; Get Rating) crossed above its 200-day moving average during trading on Tuesday. The stock has a 200-day moving average of $1.73 and traded as high as $1.73. Dawson Geophysical shares last traded at $1.65, with a volume of

GENERATED title:
 <|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|>  <|pad|><|pad|><|pad|><|pad|><|pad|><|pad|>  <|pad|>
TRUE title:
 Dawson Geophysical (NASDAQ:DWSN) Shares Cross Above 200-Day Moving Average of $1.73    

**************************************************
Description:
 The Energy Department now joins the Federal Bureau of Investigation in saying the virus likely spread via a mishap at a Chinese laboratory

GENERATED title:
 <|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|>
TRUE title:
 FBI chief confirms Covid-19 originated from 