In [1]:
import torch
from sklearn.model_selection import train_test_split
import yaml

from util import load_model, get_tokenizer
from data_util import NewsDataset, get_data
from generate_util import sample_seq, beam_search

device = 'cpu'

In [3]:
EXPERIMENT_NAME = "med_desc_target"
CHECKPOINT = 10

with open(f"./config/{EXPERIMENT_NAME}.yaml", 'r') as file:
	config = yaml.safe_load(file)

MODEL_PATH = f'./output/{EXPERIMENT_NAME}/models/checkpoint_epoch{CHECKPOINT}.pt'

In [4]:
RANDOM_SEED = config["RANDOM_SEED"]
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed(RANDOM_SEED)
torch.random.manual_seed(RANDOM_SEED)
import random
random.seed(RANDOM_SEED)

In [5]:
tokenizer = get_tokenizer(config["GPT_SIZE"])

df = get_data(tokenizer)
df_train, df_test = train_test_split(df, train_size=int(config["TRAIN_SPLIT"]*len(df)), random_state=RANDOM_SEED)
df_test, df_val = train_test_split(df_test, train_size=int(config["TEST_SPLIT"]*len(df)), random_state=RANDOM_SEED)

test_dataset = NewsDataset(df_test, tokenizer, config["TARGET_TYPE"])
print(len(test_dataset))

10000 articles loaded.
6461 samples after cleaning
646


In [6]:
def generate_sample(data, tokenizer, model, num=1, length=20, temperature=1, top_k=0, top_p=0.5, device=torch.device('cuda')):
    # Set seed in between subsequent calls for reproducibility
    torch.manual_seed(RANDOM_SEED) 
    torch.use_deterministic_algorithms(True)

    for i in range(num):
        print("*"*50)
        sample = data[i]
        idx = sample['sep_pos']
        description = sample['token_ids'][:idx].tolist()
        title = sample['token_ids'][idx+1:][:100].tolist()

        generated_tokens = sample_seq(model, description, length, device, temperature, top_k, top_p)
        generated_tokens = generated_tokens[0, len(description):].tolist()
        generated_title = tokenizer.decode(generated_tokens, skip_special_tokens=True)
  
        print('Description:\n', tokenizer.decode(description, skip_special_tokens=True))
        print("\nGENERATED title:\n", generated_title)
        print('TRUE title:\n', tokenizer.decode(title,skip_special_tokens=True),"   \n")


In [8]:
model = load_model(MODEL_PATH, tokenizer, config["GPT_SIZE"])

model.to(device)
model.eval()

In [9]:
# generate_beam_sample(test_dataset, tokenizer, model, num=1, device=device)

generate_sample(test_dataset, tokenizer, model, num=1, device=device, length=100)

**************************************************
Description:
 Healthcare of Ontario Pension Plan Trust Fund Decreases Stake in Target Co. (NYSE:TGT)

GENERATED title:
  Healthcare of Ontario Pension Plan Trust Fund decreased its stake in Target Co. (NYSE:TGT - Get Rating) by 11.3% during the third quarter, HoldingsChannel.com reports. The institutional investor owned 0,078,840 shares of the industrial products company's stock after selling 1,093,581 shares during the quarter. Healthcare of Ontario Pension Plan Trust Fund's holdings in Target [...]Welcome to the Wiki for the Foothills Christian School. Please read the rules before
TRUE title:
 Healthcare of Ontario Pension Plan Trust Fund cut its holdings in Target Co. (NYSE:TGT - Get Rating) by 88.9% during the third quarter, according to the company in its most recent Form 13F filing with the Securities and Exchange Commission (SEC). The institutional investor owned 3,529 shares of the retailer's stock after selling 28,196 shares [

In [None]:
def generate_beam_sample(data, tokenizer, model, num=1, length=20, beam_size=3, device=torch.device('cuda')):
    for i in range(num):
        sample = data[i]
        idx = sample['sep_pos']
        context = sample['token_ids'][:idx].tolist()
        summary = sample['token_ids'][idx+1:][:100].tolist()
        scores, sequences = beam_search(model, context, length, beam_size, device)
        print('description', end='\n\n')
        print(tokenizer.decode(context[:-1]), end='\n\n')
        # print('actual_summary', end='\n\n')
        # print(tokenizer.decode(summary), end='\n\n')
        for i in range(len(sequences)):
            text = tokenizer.convert_ids_to_tokens(sequences[i],skip_special_tokens=True)
            text = tokenizer.convert_tokens_to_string(text)  
            print("generated_summary-{} and Score is {}.".format(i+1, scores[i]), end='\n\n')
            print(text, end='\n\n')

In [None]:
# sample = test_dataset[0]
# idx = sample['sep_pos']
# description = sample['token_ids'][:idx].unsqueeze(0)
# title = sample['token_ids'][idx+1:][:100].tolist()

# sample_output = model.generate(
#     description, 
#     do_sample=True, 
#     max_length=95, 
#     top_k=0
# )
# tokens = sample_output[0, len(description[0]):]

# print("Output:\n" + 100 * '-')
# print(tokenizer.decode(tokens, skip_special_tokens=True))