In [3]:
import json
import math
import os
import re
import subprocess

import pandas as pd
import torch
import transformers
from sacremoses import MosesDetokenizer
from tqdm import tqdm
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    T5Tokenizer
)

# Root of the project checkout used to build paths in this notebook. Set the
# CONTROL_DATA2TEXT_ROOT environment variable to point the notebook at another
# location; the default is the original shared-filesystem path.
ROOT_DIR = os.environ.get(
    "CONTROL_DATA2TEXT_ROOT",
    "/projects/ogma2/users/andrewsi/control-data2text",
)
# CUDA device index; the perplexity helpers format it into "cuda:{gpu}".
gpu = "0"

In [2]:
# Paths under the totto directory: a scratch/output folder and the predictions
# file that is handed to the evaluation script.
totto_dir = ROOT_DIR + "/google-language/language/totto"
temp_dir = totto_dir + "/temp"
parent_preds_file = temp_dir + "/t5_small_64158.txt"

In [3]:
# Run the evaluation script and capture its stdout (as bytes) for parsing.
# check=True makes a failing script raise CalledProcessError right here instead
# of surfacing later as a confusing "metric not found" error during parsing.
results = subprocess.run(
    [
        "bash", totto_dir + "/totto_parent_eval.sh",
        "--prediction_path", parent_preds_file,
        "--target_path", totto_dir + "/totto_data/totto_dev_data.jsonl",
        "--output_dir", temp_dir,
    ],
    stdout=subprocess.PIPE,
    check=True,
)

In [14]:
def get_parent_metric(results, metric):
    """Extract one numeric metric from the evaluation script's text output.

    Looks for ``"<metric> = <number>"`` in ``results`` and returns the number.

    Args:
        results: Output text to search. ``bytes`` are decoded as UTF-8 first.
        metric: Metric label to look up, matched literally (e.g. ``"F-score"``).

    Returns:
        The value following ``"<metric> = "`` as a float. Integer values
        (no decimal part) are accepted too.

    Raises:
        ValueError: If ``"<metric> = <number>"`` does not occur in ``results``.
    """
    if isinstance(results, (bytes, bytearray)):
        results = results.decode("utf-8", errors="replace")
    # re.escape keeps characters such as "(" or "." in the label literal, and
    # the decimal point is escaped so it no longer matches any character.
    pattern = r"{} = ([0-9]+(?:\.[0-9]+)?)".format(re.escape(metric))
    match = re.search(pattern, results)
    if match is None:
        raise ValueError(
            "Metric {!r} not found in evaluation output".format(metric)
        )
    return float(match.group(1))

In [15]:
# Inspect the raw stdout bytes captured from the evaluation script.
results.stdout



In [16]:
# Parse each reported metric out of the script output. The captured stdout is
# bytes, so decode it instead of calling str() on it (which would produce the
# "b'...'" repr with escaped newlines and parse that instead of the real text).
parent_metrics = ["Precision", "Recall", "F-score"]
eval_output = results.stdout.decode("utf-8", errors="replace")
metric_dict = {
    metric: get_parent_metric(eval_output, metric) for metric in parent_metrics
}

In [17]:
# Show the parsed metric values.
metric_dict

{'Precision': 65.53, 'Recall': 44.58, 'F-score': 47.29}

In [5]:
# Checkpoint directories loaded with GPT2LMHeadModel.from_pretrained by
# compute_e2e_ppl and compute_totto_ppl respectively.
e2e_model_path = ROOT_DIR + "/transformers/examples/language-modeling/exp/e2e_targets/gpt2-02/checkpoint-9464"
totto_model_path = ROOT_DIR + "/transformers/examples/language-modeling/exp/totto_targets/gpt2/checkpoint-20264"

def compute_perplexity(preds, tokenizer, language_model, device):
    """Average the per-prediction perplexity of ``preds`` under a language model.

    Each prediction is tokenized and scored on its own (labels are the input
    ids themselves), ``exp(loss)`` is taken per prediction, and the mean of
    those values is returned. This is a mean of sentence-level perplexities,
    not a corpus-level perplexity.

    Args:
        preds: Non-empty, indexable sequence of strings to score.
        tokenizer: Callable as ``tokenizer(text, return_tensors='pt')``; the
            result must support ``.to(device)``, ``**`` unpacking and
            ``['input_ids']``.
        language_model: Model called as ``language_model(**inputs, labels=...)``
            whose output exposes a scalar ``.loss``.
        device: Device passed to ``.to(...)`` for both the model and the inputs.

    Returns:
        The mean perplexity, rounded to 4 decimal places.

    Raises:
        ValueError: If ``preds`` is empty.
    """
    if len(preds) == 0:
        raise ValueError("compute_perplexity needs at least one prediction")

    language_model.to(device)
    # Scoring only: make sure dropout is off regardless of how the model was built.
    language_model.eval()
    print(f"First pred: {preds[0]}")

    ppls = []
    # No gradients are needed for evaluation; disabling autograd avoids building
    # a graph for every forward pass and keeps memory use flat.
    with torch.no_grad():
        for pred in tqdm(preds):
            inputs = tokenizer(pred, return_tensors='pt').to(device)
            outputs = language_model(**inputs, labels=inputs['input_ids'])
            ppls.append(math.exp(outputs.loss.item()))
    return round(sum(ppls) / len(ppls), 4)

def compute_e2e_ppl(preds):
    """Score ``preds`` with the language model stored at ``e2e_model_path``.

    Loads the stock "gpt2" tokenizer and the checkpoint at ``e2e_model_path``,
    then delegates to ``compute_perplexity`` on the CUDA device selected by the
    module-level ``gpu`` setting and returns its result.
    """
    target_device = f"cuda:{gpu}"
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained(e2e_model_path)
    ppl = compute_perplexity(preds, tokenizer, model, target_device)
    return ppl

def compute_totto_ppl(preds):
    """Score ``preds`` with the language model stored at ``totto_model_path``.

    Loads the stock "gpt2" tokenizer and the checkpoint at ``totto_model_path``,
    then delegates to ``compute_perplexity`` on the CUDA device selected by the
    module-level ``gpu`` setting and returns its result.
    """
    target_device = f"cuda:{gpu}"
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained(totto_model_path)
    ppl = compute_perplexity(preds, tokenizer, model, target_device)
    return ppl

def process_and_get_e2e_ppl(inpath):
    """Post-process the predictions file at ``inpath`` and return its E2E perplexity.

    Feeds the output of ``postprocess_e2e_preds(inpath)`` (called without an
    output path) into ``compute_e2e_ppl``.
    """
    cleaned_preds = postprocess_e2e_preds(inpath)
    return compute_e2e_ppl(cleaned_preds)

In [3]:
# Shared English detokenizer; postprocess_e2e_preds uses it to re-join tokens.
md = MosesDetokenizer(lang='en')

def postprocess_e2e_preds(inpath, outpath=None):
    """Detokenize every line of a predictions file.

    Each line is stripped, underscores are turned into spaces, and the
    whitespace-separated tokens are re-joined with the module-level
    detokenizer ``md``.

    Args:
        inpath: Text file to read, one prediction per line.
        outpath: Optional destination; when truthy, the processed lines are
            written there, one per line.

    Returns:
        The list of processed lines, in input order.
    """
    with open(inpath, "r") as src:
        raw_lines = src.readlines()

    cleaned = [
        md.detokenize(raw.strip().replace("_", " ").split())
        for raw in tqdm(raw_lines)
    ]

    if outpath:
        with open(outpath, "w+") as dst:
            for text in cleaned:
                dst.write(text + "\n")
    return cleaned

In [4]:
# Detokenize the validation predictions of the e2e_k3 checkpoint and save the
# cleaned copy next to the original file. Paths are built from ROOT_DIR rather
# than repeating the absolute project root.
preds = postprocess_e2e_preds(
    f"{ROOT_DIR}/transformers/examples/seq2seq/exp/e2e/e2e_k3_t5_small_01/checkpoint-8295/validation_preds.txt",
    f"{ROOT_DIR}/transformers/examples/seq2seq/exp/e2e/e2e_k3_t5_small_01/checkpoint-8295/validation_preds_postprocessed.txt",
)

100%|██████████| 6300/6300 [00:02<00:00, 2247.01it/s]


In [4]:
# Input text files for the train/val/test splits and the output directory for
# their post-processed copies, all built from ROOT_DIR instead of repeating the
# absolute project root.
train_file = f"{ROOT_DIR}/DTG-SI/e2e_data/train/y_aux.train.txt"
val_file = f"{ROOT_DIR}/DTG-SI/e2e_data/val/y_aux.valid.txt"
test_file = f"{ROOT_DIR}/DTG-SI/e2e_data/test/y_aux.test.txt"
outdir = f"{ROOT_DIR}/transformers/examples/language-modeling/test_data/e2e_targets"

In [14]:
# Write the post-processed test split to disk. The trailing ";" keeps the
# notebook from echoing the full returned list of lines as cell output.
postprocess_e2e_preds(test_file, f"{outdir}/test.txt");

100%|██████████| 6274/6274 [00:04<00:00, 1565.00it/s]


In [10]:
# Perplexity of the e2e_k5 checkpoint's validation predictions; the path is
# built from ROOT_DIR rather than repeating the absolute project root.
process_and_get_e2e_ppl(
    f"{ROOT_DIR}/transformers/examples/seq2seq/exp/e2e/e2e_k5_t5_small_01/checkpoint-18432/validation_preds.txt"
)

100%|██████████| 6300/6300 [00:03<00:00, 1916.88it/s]
  0%|          | 5/6300 [00:00<02:13, 47.23it/s]First pred: Loch Fyne near The Rice Boat has a high customer rating. It serves French food in riverside.
100%|██████████| 6300/6300 [01:52<00:00, 56.10it/s]


3.9865

In [3]:
def get_prop_longer(col, thresh):
    """Return the fraction of entries in ``col`` strictly greater than ``thresh``.

    ``col`` must support elementwise comparison against ``thresh`` (for example
    a pandas Series); an empty ``col`` raises ``ZeroDivisionError``.
    """
    exceeds = col > thresh
    return int(exceeds.sum()) / len(col)

def get_len_df(data_file, special_tokens=("[SEP]",), max_length=4096):
    """Tokenize every example of a JSON-lines file and tabulate token counts.

    Args:
        data_file: Path to a file with one JSON object per line; each object
            must have "source" and "target" entries. Blank lines are skipped.
        special_tokens: Tokens registered with the tokenizer as additional
            special tokens before counting. Defaults to ``("[SEP]",)``.
        max_length: Used both as the tokenizer's ``model_max_length`` and as
            the truncation limit, so counts are capped at this value.
            Defaults to 4096.

    Returns:
        A DataFrame with one row per example and the columns "src_len" and
        "tgt_len" holding the number of input ids for source and target.
    """
    with open(data_file, "r") as f:
        # A blank line (e.g. a trailing newline) is not an example and would
        # make json.loads fail, so drop it up front.
        data_lines = [line for line in f if line.strip()]

    tokenizer = T5Tokenizer.from_pretrained("t5-small", model_max_length=max_length)
    special_tokens = list(special_tokens)
    if special_tokens:
        tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})
    print("\nTokenizer length: {}".format(len(tokenizer)))

    # Guard the preview so an empty file yields an empty frame, not IndexError.
    first_line = data_lines[0] if data_lines else ""
    print(f"Num lines: {len(data_lines)}\nFirst line: {first_line}")

    def _token_count(text):
        return len(tokenizer(text, max_length=max_length, truncation=True)['input_ids'])

    src_lens = []
    tgt_lens = []
    for line in tqdm(data_lines):
        json_example = json.loads(line)
        src_lens.append(_token_count(json_example["source"]))
        tgt_lens.append(_token_count(json_example["target"]))

    return pd.DataFrame({"src_len": src_lens, "tgt_len": tgt_lens})


In [6]:
# Prediction/target files to score. Paths inside this project are built from
# ROOT_DIR; totto_baseline lives under a different checkout
# ("controllable-data-to-text"), so it keeps its absolute path.
totto_gtt = f"{ROOT_DIR}/transformers/examples/language-modeling/test_data/totto_targets/validation.txt"
totto_baseline = "/projects/ogma2/users/andrewsi/controllable-data-to-text/examples/seq2seq/results/totto/baseline/t5-small/checkpoint-67932/test_generations.txt"
totto_val_clean_source_embed = f"{ROOT_DIR}/transformers/examples/seq2seq/exp/totto/totto_k5_t5_small_new_parent/eval_results/val_clean_source_embed/preds.txt"
preds_file = f"{ROOT_DIR}/transformers/examples/seq2seq/exp/totto/totto_k5_t5_masked_target_embed_new_parent/eval_results/val_clean_source_embed/preds.txt"

In [8]:
# Load one whitespace-trimmed prediction per line from preds_file and score
# them with compute_totto_ppl; the cell's value is the returned perplexity.
with open(preds_file, "r") as f:
    preds = [raw_line.strip() for raw_line in f.readlines()]
compute_totto_ppl(preds)

  0%|          | 6/7700 [00:00<02:16, 56.16it/s]First pred: Daniel Henry Chamberlain was the 76th Governor of South Carolina on December 1, 1874.
100%|██████████| 7700/7700 [02:23<00:00, 53.71it/s]


64.8

In [None]:
# compute_perplexity expects a list of strings plus a tokenizer, model and
# device, so calling it with just a file path raises TypeError. Read the
# baseline generations from disk and score them through compute_totto_ppl,
# which supplies the tokenizer, model and device.
with open(totto_baseline, "r") as f:
    baseline_preds = [pred.strip() for pred in f]
compute_totto_ppl(baseline_preds)