In [5]:
import pandas as pd
import numpy as np
import pickle as pkl
import os

from src.utils.dataset import get_question_column_name
from src.rescoring.rescore_summaries import get_latent_column_name
from src.rescoring.rescore_summaries import process_llama2_preds


# Root of the RSASumm checkout. The default is the original cluster location;
# set the RSASUMM_ROOT environment variable to run this notebook from another
# machine or checkout without editing every path below.
project_root = os.environ.get("RSASUMM_ROOT", "/home/mila/c/cesare.spinoso/RSASumm")
data_dir = os.path.join(project_root, "data")

# Directory holding one pickled generation cache per summarizer (`<name>.pkl`).
cache_dir = os.path.join(data_dir, "generate_summaries", "cache")
# Pickled rescoring cache that the checks below look reconstruction scores up in.
rescoring_cache_path = os.path.join(data_dir, "rescoring", "cache", "flan-t5-large.pkl")

# Summarizers whose caches are checked; names containing "generic" and "e2e"
# are handled by separate cells further down.
summarizer_names = [
    "bart_generic",
    "led_generic",
    "llama2_generic",
    # "peg_generic",
    # "t5_generic",
    "bart_e2e",
    "led_e2e",
    "llama2_e2e",
]

# Dataset name -> CSV file that is read when checking that dataset.
datasets = {
    "covidet": os.path.join(data_dir, "covidet", "test.csv"),
    "debatepedia": os.path.join(data_dir, "debatepedia", "test.csv"),
    "duc_single": os.path.join(data_dir, "duc", "duc_single", "preprocessed.csv"),
    "multioped": os.path.join(data_dir, "multioped", "test.csv"),
    "qmsum": os.path.join(data_dir, "qmsum", "test.csv"),
}

In [6]:
# Load each summarizer's pickled generation cache (`<cache_dir>/<name>.pkl`)
# into a single dict keyed by summarizer name.
# NOTE: unpickling can execute arbitrary code - only load cache files you trust.
cached_summaries = {}
for summarizer_name in summarizer_names:
    cache_file_path = os.path.join(cache_dir, f"{summarizer_name}.pkl")
    with open(cache_file_path, "rb") as cache_file:
        cached_summaries[summarizer_name] = pkl.load(cache_file)

In [7]:
# Start from an empty mapping, then replace it with the unpickled rescoring
# cache stored at `rescoring_cache_path`.
# NOTE: unpickling can execute arbitrary code - only load cache files you trust.
cache_rescorings = dict()
with open(rescoring_cache_path, mode="rb") as rescoring_cache_file:
    cache_rescorings = pkl.load(rescoring_cache_file)

In [15]:
# Coverage check for the "generic" summarizers: for every dataset, look up the
# cached predictions of each document (cache key: `(document,)`), explode them to
# one row per prediction, and report how many predictions have no source /
# latent reconstruction score in `cache_rescorings`.
expected_preds_per_doc = 5  # only used for the "expected exploded rows" report

for summarizer_name in summarizer_names:
    if "generic" not in summarizer_name:
        continue
    summary_cache = cached_summaries[summarizer_name]
    for dataset_name, dataset_path in datasets.items():
        print("=" * 80)
        print(f"Summarizer: {summarizer_name} - Dataset: {dataset_name}")
        dataset = pd.read_csv(dataset_path)
        latent_column_name = get_latent_column_name(dataset_name)

        # One cache lookup per document; a document absent from the cache is
        # represented by the placeholder list `[None]`.
        cached_pred_dicts = [summary_cache.get((doc,), [None]) for doc in dataset["document"]]
        dataset["pred"] = [
            [None if pred_dict is None else pred_dict["pred"] for pred_dict in pred_dicts]
            for pred_dicts in cached_pred_dicts
        ]
        dataset["pred_score"] = [
            [None if pred_dict is None else pred_dict["pred_score"] for pred_dict in pred_dicts]
            for pred_dicts in cached_pred_dicts
        ]

        # Each cell of "pred" is a list, and a list is never NA, so
        # `dataset["pred"].isna()` would always report 0. Inspect the list
        # contents instead (an empty list also counts as missing).
        docs_missing_pred = sum(
            len(preds) == 0 or any(pred is None for pred in preds)
            for preds in dataset["pred"]
        )
        print(f"Documents missing at least one pred: [{docs_missing_pred}]")

        dataset_exploded = dataset.explode(["pred", "pred_score"])
        if "llama" in summarizer_name:
            dataset_exploded["pred"] = process_llama2_preds(dataset_exploded["pred"].values.tolist())
        print(f"Number of initial rows : {len(dataset)}, number of expected exploded rows: {len(dataset) * expected_preds_per_doc}, number of exploded rows: {len(dataset_exploded)}")

        # Rescoring cache keys are `(pred, target)` tuples; a missing entry
        # yields a score of None.
        dataset_exploded["source_rec_score"] = [
            cache_rescorings.get((pred, doc), {"reconstruction_score": None})["reconstruction_score"]
            for doc, pred in zip(dataset_exploded["document"], dataset_exploded["pred"])
        ]
        print(f"Number of missing source rescoring: {dataset_exploded['source_rec_score'].isnull().sum()}")

        dataset_exploded["latent_rec_score"] = [
            cache_rescorings.get((pred, latent), {"reconstruction_score": None})["reconstruction_score"]
            for pred, latent in zip(dataset_exploded["pred"], dataset_exploded[latent_column_name])
        ]
        print(f"Number of missing latent rescoring: {dataset_exploded['latent_rec_score'].isnull().sum()}")

Summarizer: bart_generic - Dataset: covidet
Documents missing at least one pred: [0]
Number of initial rows : 1524, number of expected exploded rows: 7620, number of exploded rows: 7620
Number of missing source rescoring: 0
Number of missing latent rescoring: 0
Summarizer: bart_generic - Dataset: debatepedia
Documents missing at least one pred: [0]
Number of initial rows : 893, number of expected exploded rows: 4465, number of exploded rows: 4465
Number of missing source rescoring: 0
Number of missing latent rescoring: 0
Summarizer: bart_generic - Dataset: duc_single
Documents missing at least one pred: [0]
Number of initial rows : 4500, number of expected exploded rows: 22500, number of exploded rows: 22500
Number of missing source rescoring: 0
Number of missing latent rescoring: 0
Summarizer: bart_generic - Dataset: multioped
Documents missing at least one pred: [0]
Number of initial rows : 531, number of expected exploded rows: 2655, number of exploded rows: 2660
Number of missing s

In [14]:
# Coverage check for the "e2e" summarizers: same report as for the generic
# summarizers, except that the generation cache is keyed by
# `(document, question)` instead of `(document,)`.
expected_preds_per_doc = 5  # only used for the "expected exploded rows" report

for summarizer_name in summarizer_names:
    if "e2e" not in summarizer_name:
        continue
    summary_cache = cached_summaries[summarizer_name]
    for dataset_name, dataset_path in datasets.items():
        print("=" * 80)
        print(f"Summarizer: {summarizer_name} - Dataset: {dataset_name}")
        dataset = pd.read_csv(dataset_path)
        latent_column_name = get_latent_column_name(dataset_name)
        question_column_name = get_question_column_name(dataset_name)
        if dataset_name == "covidet":
            # For covidet the question is built from the "emotion" column
            # rather than read from `question_column_name`.
            question_column = dataset["emotion"].apply(lambda emotion: f"Describe the {emotion} of this post.")
        else:
            question_column = dataset[question_column_name]

        # One cache lookup per (document, question) pair; a pair absent from the
        # cache is represented by the placeholder list `[None]`.
        cached_pred_dicts = [
            summary_cache.get((doc, ques), [None])
            for doc, ques in zip(dataset["document"], question_column)
        ]
        dataset["pred"] = [
            [None if pred_dict is None else pred_dict["pred"] for pred_dict in pred_dicts]
            for pred_dicts in cached_pred_dicts
        ]
        dataset["pred_score"] = [
            [None if pred_dict is None else pred_dict["pred_score"] for pred_dict in pred_dicts]
            for pred_dicts in cached_pred_dicts
        ]

        # Each cell of "pred" is a list, and a list is never NA, so
        # `dataset["pred"].isna()` would always report 0. Inspect the list
        # contents instead (an empty list also counts as missing).
        docs_missing_pred = sum(
            len(preds) == 0 or any(pred is None for pred in preds)
            for preds in dataset["pred"]
        )
        print(f"Documents missing at least one pred: [{docs_missing_pred}]")

        dataset_exploded = dataset.explode(["pred", "pred_score"])
        if "llama" in summarizer_name:
            dataset_exploded["pred"] = process_llama2_preds(dataset_exploded["pred"].values.tolist())
        print(f"Number of initial rows : {len(dataset)}, number of expected exploded rows: {len(dataset) * expected_preds_per_doc}, number of exploded rows: {len(dataset_exploded)}")

        # Rescoring cache keys are `(pred, target)` tuples; a missing entry
        # yields a score of None.
        dataset_exploded["source_rec_score"] = [
            cache_rescorings.get((pred, doc), {"reconstruction_score": None})["reconstruction_score"]
            for doc, pred in zip(dataset_exploded["document"], dataset_exploded["pred"])
        ]
        print(f"Number of missing source rescoring: {dataset_exploded['source_rec_score'].isnull().sum()}")

        dataset_exploded["latent_rec_score"] = [
            cache_rescorings.get((pred, latent), {"reconstruction_score": None})["reconstruction_score"]
            for pred, latent in zip(dataset_exploded["pred"], dataset_exploded[latent_column_name])
        ]
        print(f"Number of missing latent rescoring: {dataset_exploded['latent_rec_score'].isnull().sum()}")

Summarizer: bart_e2e - Dataset: covidet
Documents missing at least one pred: [0]
Number of initial rows : 1524, number of expected exploded rows: 7620, number of exploded rows: 7620
Number of missing source rescoring: 0
Number of missing latent rescoring: 0
Summarizer: bart_e2e - Dataset: debatepedia
Documents missing at least one pred: [0]
Number of initial rows : 893, number of expected exploded rows: 4465, number of exploded rows: 4465
Number of missing source rescoring: 0
Number of missing latent rescoring: 0
Summarizer: bart_e2e - Dataset: duc_single
Documents missing at least one pred: [0]
Number of initial rows : 4500, number of expected exploded rows: 22500, number of exploded rows: 22500
Number of missing source rescoring: 0
Number of missing latent rescoring: 0
Summarizer: bart_e2e - Dataset: multioped
Documents missing at least one pred: [0]
Number of initial rows : 531, number of expected exploded rows: 2655, number of exploded rows: 2655
Number of missing source rescoring:

In [None]:
# Recompute the latent reconstruction-score column on whatever
# `dataset_exploded` / `latent_column_name` a previously executed loop cell left
# behind (this cell defines neither itself). Pairs missing from
# `cache_rescorings` get a score of None.
missing_rescoring = {"reconstruction_score": None}
pred_latent_pairs = zip(dataset_exploded["pred"], dataset_exploded[latent_column_name])
dataset_exploded["latent_rec_score"] = [
    cache_rescorings.get(pred_latent_pair, missing_rescoring)["reconstruction_score"]
    for pred_latent_pair in pred_latent_pairs
]


In [2]:
# Print the article below on a single line, with every real newline replaced by
# the literal two-character sequence backslash + "n".
# NOTE(review): presumably used to compare this text against a document as it is
# stored in a dataset or cache - confirm. The text is data: keep it verbatim.
print("""
In the coming days, President Trump Donald John TrumpFive takeaways from the Democratic National Convention What we'll remember from the 2020 Biden convention Chris Wallace labels Biden's acceptance speech 'enormously effective' MORE plans to announce his final decision on whether the United States will withdraw from the 2015 Iran nuclear accord. President Trump, who has described the agreement as “one of the worst deals” he has ever witnessed, is expected to leave the pact.

Ultimately, this is the right decision for the United States and for global security at large. From the outset, the accord, which provided sanctions relief for Iran in exchange for restrictions on their nuclear program, has a number of fatal flaws.

ADVERTISEMENT

Foremost, while the plan limits Iran’s access to uranium, this restriction only lasts until 2025 to 2030. After that, the Iranians are free to revitalize their nuclear program on a potentially even larger scale. Notably, however, this “sunset” clause, which allow parts of the deal to expire, are the least of the deal’s shortcomings.

One of the primary failures of the deal is that the agreement fails to address Iran’s ballistic missile program. As such, the country has continued to unrestrictedly build and test ballistic missiles. Moreover, President Trump and others have rightly objected to the terms under which regulatory inspectors are permitted to visit nuclear sites.

The terms of the deal give Iran 14 days to object to a request for inspection, followed by a period of seven days for an arbitration committee to rule about the inspection, and another three days for Tehran to set up an inspection. Thus, this provides Iran with up to 24 days to conceal, destroy, or relocate contraband materials.

Even more problematically, Iran has stated that it will prohibit inspections of military sites, thus further complicating the issue of compliance verification. These flaws have become so glaringly problematic that even those who once championed the deal have begun to question it.

“Everyone recognizes that the deal is not ideal. I think President Obama would say the deal is not ideal,” said Bob Einhorn, who was the State Department’s special adviser for nonproliferation and arms control during the Obama administration. While these flaws are not necessarily brand new, there have been several recent alarming developments that have sparked concern among elected officials.

Last week, standing in front a screen which blatantly displayed the text “Iran lied” in all caps, Israeli Prime Minister Benjamin Netanyahu declared that Israeli intelligence services had obtained proof that Iran had been deceptive about its nuclear program.

Netanyahu claims to have 55,000 pages and 183 CDs full of evidence that Iran had sought “to design, produce and test five warheads with 10 kiloton of TNT yield for integration on missiles.” He also indicated that the documents had been stolen from a warehouse in Tehran by a team working for Mossad, the Israeli intelligence service, and that they definitively proved that Iran has been lying about its nuclear program.

While some intelligence experts doubt that the evidence is quite as reliable as Netanyahu claims, the prime minister’s presentation did underscore an important point: Under the terms of the current Iran deal, it is next to impossible for us to truly know the full details of the Iranian nuclear program and whether Tehran is abiding by the guidelines established by the accord.

Indeed, it has become increasingly clear that the Iran agreement was a bad move for the United States and its allies. Instead of forcing Iran to stop developing nuclear weapons, the deal has merely compelled Tehran to become more covert about the project. At the same time, relief from sanctions has provided tens of billions of dollars for Iran, much of which will certainly be funneled to the Syrian regime of Bashar Assad and terrorist groups like Hezbollah and Hamas.

Ultimately, President Obama’s heralded deal is really a win for the rouge Iranian regime and its allies and a major setback for the West. At this time, President Trump would be right to withdraw from the deal.
""".replace("\n", "\\n"))

\nIn the coming days, President Trump Donald John TrumpFive takeaways from the Democratic National Convention What we'll remember from the 2020 Biden convention Chris Wallace labels Biden's acceptance speech 'enormously effective' MORE plans to announce his final decision on whether the United States will withdraw from the 2015 Iran nuclear accord. President Trump, who has described the agreement as “one of the worst deals” he has ever witnessed, is expected to leave the pact.\n\nUltimately, this is the right decision for the United States and for global security at large. From the outset, the accord, which provided sanctions relief for Iran in exchange for restrictions on their nuclear program, has a number of fatal flaws.\n\nADVERTISEMENT\n\nForemost, while the plan limits Iran’s access to uranium, this restriction only lasts until 2025 to 2030. After that, the Iranians are free to revitalize their nuclear program on a potentially even larger scale. Notably, however, this “sunset” cl