In [None]:
from transformers import pipeline

In [None]:
from datasets import load_dataset
# Load the 'khalidalt/HuffPost' dataset through the `datasets` library.
dataset = load_dataset('khalidalt/HuffPost')
# Bare expression: shows the loaded object as the cell output.
dataset

In [None]:
# Categories kept from HuffPost, and the label names they are mapped to.
selected_categories = ['ENTERTAINMENT','SPORTS','BUSINESS','TECH','POLITICS']
categories_translation = {
    'ENTERTAINMENT':'entertainment',
    'SPORTS':'sport',
    'BUSINESS':'business',
    'TECH':'tech',
    'POLITICS':'politics',
} # to be consistent with BBC news dataset


def _has_description(example):
    """Return True when the example carries a usable `short_description`.

    Empty strings, None and float NaN are all treated as missing. NaN needs an
    explicit test: `nan in [..., float('nan')]` is always False because NaN
    never compares equal to another NaN.
    """
    description = example['short_description']
    if description is None or description == '':
        return False
    return not (isinstance(description, float) and description != description)


# Single pass over the 'test' split: keep the selected categories that also
# have a non-missing short description.
dataset = dataset['test'].filter(
    lambda x: x['category'] in selected_categories and _has_description(x)
)
print(dataset) # 54427

In [None]:
import os
import random

import pandas as pd

SPLIT_SEED = 42   # fixed seed so the split is identical after a kernel restart
N_TRAIN = 50000   # number of shuffled examples that go to the training split

# Shuffle all row indices once, then carve out three DISJOINT splits.
# (Previously dev and test were the very same slice, so "test" results were
# measured on the data used for model selection.)
n_total = len(dataset)  # 54427 when this notebook was written
assert n_total > N_TRAIN, f"need more than {N_TRAIN} rows to hold out dev/test, got {n_total}"
alist = list(range(n_total))
random.Random(SPLIT_SEED).shuffle(alist)  # private RNG: leaves the global random state alone
alist_train = alist[:N_TRAIN]
alist_heldout = alist[N_TRAIN:]
n_dev = len(alist_heldout) // 2
alist_dev = alist_heldout[:n_dev]
alist_test = alist_heldout[n_dev:]

train_dataset = dataset.select(alist_train)
dev_dataset = dataset.select(alist_dev)
test_dataset = dataset.select(alist_test)
print(train_dataset)
print(dev_dataset)
print(test_dataset)


def _contents_and_labels(split):
    """Return (headlines, descriptions, contents, labels) for one split.

    `contents` is the headline and short description joined by a single space;
    `labels` are the categories mapped through `categories_translation`.
    """
    questions = split['headline']
    answers = split['short_description']
    contents = [' '.join([q, a]) for q, a in zip(questions, answers)]
    translated = [categories_translation[l] for l in split['category']]
    return questions, answers, contents, translated


# concat headline and short_description as the content
train_questions, train_answers, train_contents, train_labels = _contents_and_labels(train_dataset)
dev_questions, dev_answers, dev_contents, dev_labels = _contents_and_labels(dev_dataset)
test_questions, test_answers, test_contents, test_labels = _contents_and_labels(test_dataset)

print(len(train_contents),len(train_labels),len(set(train_labels)))
print(len(dev_contents),len(dev_labels),len(set(dev_labels)))
print(len(test_contents),len(test_labels),len(set(test_labels)))

# Write one CSV per split; makedirs also creates the parent 'data_clf' folder
# and does not fail when the directory already exists.
data_path = 'data_clf/5huffpost_scorer'
os.makedirs(data_path, exist_ok=True)
pd.DataFrame({'content':train_contents, 'label':train_labels}).to_csv(f"{data_path}/train.csv")
pd.DataFrame({'content':dev_contents, 'label':dev_labels}).to_csv(f"{data_path}/dev.csv")
pd.DataFrame({'content':test_contents, 'label':test_labels}).to_csv(f"{data_path}/test.csv")


In [None]:
# Reuse the sketches stored in the 5huffpost_1000 augmentation file.
# Rows [0, 1000) supply `orig_contents`; rows [1000, 2000) supply the SEGA
# contents together with their sketches and labels.
import pandas as pd
data = pd.read_csv('data_clf/5huffpost_1000/sega_promptTrue_asonly_False_sega-old_aug4.csv')
content_column = list(data['content'])
orig_contents = content_column[:1000]
sega_contents = content_column[1000:2000]
sketches, labels = (
    list(data[column])[1000:2000] for column in ('sketch', 'label')
)

In [None]:
# Drop the "<label>: " prompt prefix from every sketch.
ss = [sketch.replace('%s: ' % label, '') for sketch, label in zip(sketches, labels)]

In [None]:
# Same layout as the SEGA file: take rows [1000, 2000) of each column.
bart_data = pd.read_csv('data_clf/5huffpost_1000/sega_promptTrue_asonly_False_poor-bart_aug4.csv')
bart_contents, bart_sketches, bart_labels = (
    list(bart_data[column])[1000:2000] for column in ('content', 'sketch', 'label')
)

In [None]:
# Inspect one example side by side: both sketches and both generated contents at index i.
i = 1
sketches[i], bart_sketches[i], sega_contents[i], bart_contents[i]

In [None]:
# Save the SEGA and BART outputs for NLG evaluation. Both files share the same
# original texts, prefix-stripped sketches (`ss`) and labels; only the
# generated text differs.
for out_file, generated in (
    ('nlg_eval/sega_huff.csv', sega_contents),
    ('nlg_eval/bart_huff.csv', bart_contents),
):
    frame = pd.DataFrame({'orig_text':orig_contents,'gen_text':generated,'sketch':ss,'label':labels})
    frame.to_csv(out_file)



In [None]:
# Build the BART-large pipeline before calling it: with the definition commented
# out, `bart` is undefined on a fresh kernel and the call raises NameError.
# NOTE(review): device=7 is the author's hard-coded device index — adjust it to
# a device that exists on the machine running this notebook.
bart = pipeline('text2text-generation',model='facebook/bart-large',device=7)
# Smoke test: fill in a label-prefixed masked sketch.
bart('sport: <mask> offseason is smoking <mask> NFL players <mask>', num_beams=3, do_sample=True,max_length=200)

In [None]:
# scorer: checkpoint path shown for reference only — the string is displayed as
# the cell output and is not assigned to any variable.
# NOTE(review): the classifier-scoring cell loads a bert-base-uncased checkpoint
# instead of this distilbert one — confirm which scorer is intended.
'saved_models/5huffpost_scorer_distilbert-base-cased_train.pkl'

# Generating with the other models, using `pipeline`
- t5-commongen
- bart-k2t

In [None]:
# Keyword-only sketches: remove every '<mask>' token together with one adjacent
# space. The three patterns are applied in this order: between words, leading,
# trailing.
k2t_ss = [
    sketch.replace(' <mask> ',' ').replace('<mask> ','').replace(' <mask>','')
    for sketch in ss
]


In [None]:
# While we are at it, also write the input file for CBART: three lines per
# sketch (running index, tab-separated keywords, placeholder ground truth).
with open('nlg_eval/nkeywords.txt','w') as f:
    for idx, keywords in enumerate(k2t_ss):
        f.write(f'{idx}' + '\n')
        f.write(f'Keywords:	{keywords}' + '\n')
        f.write(f'Ground truth:	 just for test' + '\n')


In [None]:
# Collect the CBART generations: keep only lines carrying the
# 'Generated sentence:' marker, with the marker and the newline removed.
marker = 'Generated sentence:	'
with open('other_gen/CBART-master/outputs/cbart-large_one-billion-words_w1.0_max_insert_label1_insert_mode0_encoder_loss_type0_sample_top_k_5_decoder_chain5_nkeywords.txt','r') as f:
    cbart_gen_contents = [
        line.replace(marker,'').replace('\n','')
        for line in f
        if marker in line
    ]
print(len(cbart_gen_contents))

# Store the generations next to the original texts, keyword sketches and labels.
cbart_frame = pd.DataFrame({'orig_text':orig_contents,'gen_text':cbart_gen_contents,'sketch':k2t_ss,'label':labels})
cbart_frame.to_csv('nlg_eval/cbart_huff.csv')

In [None]:
# Text2text-generation pipeline on 'facebook/bart-base'; `model` is what the
# demo and batch-generation cells below call.
# NOTE(review): device=7 is hard-coded — presumably a GPU index on the author's
# machine; adjust before running elsewhere.
model = pipeline('text2text-generation',model='facebook/bart-base',device=7)

In [None]:
# Quick manual check of `model` on a hand-written masked sketch
# (the commented line is the same sketch without mask tokens).
s = '<mask> machine learning <mask> my research interest <mask> data science <mask>'
# s = 'machine learning my research interest data science'
model(s,max_length=100, do_sample=True, num_beams=3)

In [None]:
from sega_utils import List2Dataset

# Wrap the keyword-only sketches in the project's List2Dataset helper; the
# result is what gets passed to `model` in the generation cell.
# NOTE(review): presumably a thin Dataset wrapper over a list — confirm in sega_utils.
ss_dataset = List2Dataset(k2t_ss)
# Peek at the first element.
ss_dataset[0]

In [None]:
from tqdm import tqdm

# Run the pipeline over every sketch in batches of 50 and keep the single
# returned sequence per input. Sampling is enabled, so outputs vary run to run.
generation_kwargs = dict(
    num_beams=3, do_sample=True,
    num_return_sequences=1, max_length=200,
    batch_size=50, truncation=True,
)
gen_contents = [
    out[0]['generated_text']
    for out in tqdm(model(ss_dataset, **generation_kwargs))
]

In [None]:
# Save the freshly generated texts next to the original texts, sketches and labels.
# NOTE(review): generation consumed `ss_dataset` (built from `k2t_ss`), but the
# 'sketch' column stores `sketches` — confirm this is intended.
pd.DataFrame({'orig_text':orig_contents,'gen_text':gen_contents,'sketch':sketches,'label':labels})\
.to_csv('nlg_eval/sega-t4-l_huff.csv')

# ROUGE scores

In [None]:
from datasets import load_metric
# Metric objects used by the evaluation cell: ROUGE, BERTScore and perplexity.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package — confirm the installed version.
rouge_score = load_metric("rouge")
bert_score = load_metric("bertscore")
ppl_score = load_metric("perplexity", module_type="metric")


In [None]:
import pandas as pd
# Choose exactly ONE result file to evaluate; the alternatives are kept
# commented out for quick switching.
# eval_data = pd.read_csv('nlg_eval/sega_huff.csv')
# eval_data = pd.read_csv('nlg_eval/sega-t1_huff.csv')
eval_data = pd.read_csv('nlg_eval/sega-t4_huff.csv')
# eval_data = pd.read_csv('nlg_eval/sega-t4-l_huff.csv')

# eval_data = pd.read_csv('nlg_eval/bart_huff.csv')
# eval_data = pd.read_csv('nlg_eval/t5cg_huff.csv')
# eval_data = pd.read_csv('nlg_eval/cbart_huff.csv')
# eval_data = pd.read_csv('nlg_eval/bart-k2t_huff.csv')
# eval_data = pd.read_csv('nlg_eval/ilm-sent_huff.csv')
# eval_data = pd.read_csv('nlg_eval/ilm-ngram_huff.csv')

orig_contents = list(eval_data['orig_text'])
gen_contents = list(eval_data['gen_text'])
# The fragment-lost metric and the inspection cell read `sketches_mask`; with
# this line commented out they fail with NameError on a fresh kernel.
sketches_mask = list(eval_data['sketch'])
# sketches = list(eval_data['sketch'])

# NOTE(review): this override discards the `gen_text` column loaded just above
# and scores the sketches themselves as the "generated" text. It also relies on
# `sketches` left over from an earlier cell. Remove it to evaluate the model
# outputs.
gen_contents = sketches
# for i in [5,57,70,89,98]:
#     print(i)
#     print(orig_contents[i])
#     print(sketches[i])
#     print(gen_contents[i])

In [None]:
import math

from nltk.tokenize import word_tokenize


def _sketch_fragments(sketch):
    """Split a sketch on '<mask>' and return its non-empty, stripped fragments."""
    return [frag.strip() for frag in sketch.split('<mask>') if frag.strip()]


#### ROUGE
scores = rouge_score.compute(
    predictions=gen_contents, references=orig_contents)
for k in scores:
    print(f'{k}. F1: {scores[k].mid.fmeasure * 100}, Recall:{scores[k].mid.recall * 100}')

# Bert-score between the sketch and the part of the generation that is NOT
# copied from the sketch. The sketch must be removed fragment by fragment:
# iterating over the string itself (`for f in s`) walks single characters and
# strips almost every letter out of the generated text.
other_contents = []
for s,c in zip(sketches, gen_contents):
    for f in _sketch_fragments(s):
        c = c.replace(f,'')
    other_contents.append(c)
results = bert_score.compute(predictions=other_contents, references=sketches, lang="en",model_type='bert-base-uncased')['f1']
print('bert-score', sum(results)/len(results))

##### ppl
scores = ppl_score.compute(input_texts=gen_contents, model_id='gpt2')
ppl = scores['mean_perplexity']
# NOTE(review): the second printed value is 2 ** ln(ppl); confirm this is the
# intended transformation of the mean perplexity.
print('ppl',ppl, math.pow(2,math.log(ppl)))

##### length ratio (character length of generated text over original text)
lr = [len(gen_c)/len(orig_c) for gen_c,orig_c in zip(gen_contents,orig_contents)]
print('length ratio',sum(lr)/len(lr))

##### sketch-lost: share of space-separated sketch items that do not occur
##### (as substrings) in the generated text, averaged over examples
scores = []
for s,c in zip(sketches, gen_contents):
    items = s.split(' ')
    N = len(items)
    n = 0
    for item in items:
        if item not in c:
            n += 1
    scores.append(n/N)
slost = sum(scores)/len(scores)
print('sketch-lost:',slost)

##### sketch-fragment lost: share of '<mask>'-delimited fragments missing from
##### the generated text. Empty fragments (produced by a leading or trailing
##### '<mask>') are skipped: '' is a substring of every text, so counting them
##### made the rate look better than it is.
n = 0
N = 0
for s,c in zip(sketches_mask, gen_contents):
    for f in _sketch_fragments(s):
        N += 1
        if f not in c:
            n += 1
flost = n/N if N else 0.0
print('f-lost:',flost)
print('avg-lost:',(slost+flost)/2)


##### Novel Mentions: distinct generated tokens that never occur in the originals
orig_vocab = set()
for c in orig_contents:
    orig_vocab.update(word_tokenize(c))
orig_words = list(orig_vocab)

gen_vocab = set()
for c in gen_contents:
    gen_vocab.update(word_tokenize(c))
gen_words = list(gen_vocab)

# Set difference instead of `w not in <list>`: same count without the
# quadratic scan.
n = len(gen_vocab - orig_vocab)
print('NM:',n, "NM rate:",n/len(orig_words))

In [None]:
# Spot-check example 10: its masked sketch next to the text being evaluated.
sketches_mask[10], gen_contents[10]

In [None]:
# classifier score: build the label <-> index lookup tables from the labels of
# the file under evaluation (indices follow the sorted label order).

labels = list(eval_data['label'])
unique_labels = sorted(set(labels))
label2idx = {label: idx for idx, label in enumerate(unique_labels)}
idx2label = {idx: label for label, idx in label2idx.items()}

label2idx

In [None]:
import torch  # needed for torch.tensor below; `from torch.utils.data import ...` does not bind `torch`
from torch.utils.data import DataLoader, Dataset

class MyDataset(Dataset):
    """Tokenized text-classification dataset intended for dynamic padding.

    Args:
        tokenizer: callable invoked as `tokenizer(texts, truncation=True,
            max_length=maxlen)`; must return a mapping from field name to a
            per-example list.
        texts: input texts; None and non-string NaN values are replaced by ''.
        labels: label names, one per text.
        label2idx: mapping from label name to integer class index.
        maxlen: maximum sequence length passed to the tokenizer.
    """
    def __init__(self, tokenizer, texts, labels, label2idx, maxlen):
        self.tokenizer = tokenizer
        # No padding here; padding is applied later, per batch, by the data
        # collator (dynamic padding).
        texts = [self._clean_text(t) for t in texts]
        self.encodings = tokenizer(texts, truncation=True, max_length=maxlen)
        self.labels = labels
        self.label2idx = label2idx

    @staticmethod
    def _clean_text(text):
        """Map missing values (None / non-string NaN) to ''; keep real strings as-is."""
        if text is None:
            return ''
        # Only non-string values are checked for NaN, so a genuine text that
        # happens to read 'nan' is no longer thrown away.
        if not isinstance(text, str) and str(text) == 'nan':
            return ''
        return text

    def __getitem__(self, idx):
        """Return the encoding fields of example `idx` as tensors, plus its label index."""
        item = {k:torch.tensor(v[idx]) for k,v in self.encodings.items()}
        # 'labels' must hold the label's index, not the label name itself.
        item['labels'] = torch.tensor(self.label2idx[self.labels[idx]])
        return item

    def __len__(self):
        """Number of examples, taken from the number of labels."""
        return len(self.labels)

def get_dataloader_from_list(texts, labels, tokenizer, label2idx, maxlen, bsz, collate_fn, shuffle=True):
    """Wrap raw texts and labels in a `MyDataset` and return a DataLoader over it.

    `bsz` is the batch size, `collate_fn` assembles each batch, and `shuffle`
    (default True) is forwarded unchanged to the DataLoader.
    """
    return DataLoader(
        MyDataset(tokenizer, texts, labels, label2idx, maxlen),
        batch_size=bsz,
        collate_fn=collate_fn,
        shuffle=shuffle,
    )

In [None]:

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer,DataCollatorWithPadding
from tqdm import tqdm

clf_checkpoint = 'saved_models/5huffpost_scorer_bert-base-uncased_train.pkl'

# Device selection. GPU 1 is still preferred when it exists; the old
# `torch.device(<type>, 1)` asked for index 1 unconditionally, which is not a
# valid CPU device and does not exist on single-GPU machines.
if torch.cuda.is_available():
    device = torch.device('cuda', 1 if torch.cuda.device_count() > 1 else 0)
else:
    device = torch.device('cpu')
print('>>> ',device)

clf_model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(unique_labels))
# map_location lets a checkpoint saved on another device load here as well.
# NOTE: torch.load unpickles the file — only load checkpoints you trust.
clf_model.load_state_dict(torch.load(clf_checkpoint, map_location=device)) # the non-aug model
clf_model.to(device)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

bz = 32
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
train_dataloader = get_dataloader_from_list(
    gen_contents, labels, tokenizer, label2idx, 100, bz,
    data_collator, shuffle=False) # must set `shuffle=False` to keep the original order


# Collect, for every example, the probability the classifier assigns to its
# true label.
clf_model.eval()
all_true_label_probs = []
with torch.no_grad():  # inference only: no autograd graph is needed
    for batch in tqdm(train_dataloader):
        batch = {k:v.to(device) for k,v in batch.items()}
        logits = clf_model(**batch).logits
        probs = torch.softmax(logits, dim=1)
        # The batch already carries each example's label index under 'labels',
        # so no manual slicing of `labels` by batch counter is required.
        true_label_probs = probs.gather(1, batch['labels'].unsqueeze(1))
        all_true_label_probs += true_label_probs.view(-1).tolist()

In [None]:
# One minus the mean probability the classifier assigned to the true label.
1-sum(all_true_label_probs)/len(all_true_label_probs)

In [None]:
# Show the evaluated text of example 10.
gen_contents[10]