### Generating Summaries from VECTORIZED-RSA

This notebook takes a set of `.pkl` files in which the per-model likelihoods have been aggregated, and uses those values to compute a single RSA score per candidate. These scores are then used to build the GLIMPSE-SPEAKER and GLIMPSE-UNIQUE summaries.

Please note that this notebook saves all results in a new folder (as the other scripts do), so that they can easily be evaluated and compared against other techniques later.

In [152]:
from functools import cache
from typing import List
import numpy as np
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModel

class VectorRSAReranking:
    """Rerank candidate summaries with an RSA recursion run once per model.

    The likelihood tensor is indexed as ``[source_text, candidate, model]``:
    rows become the index (``source_texts``) and columns the header
    (``candidates``) of every returned DataFrame, and the last axis holds one
    layer per language model.  The recursion is carried out independently for
    each of the first ``num_models`` layers and the layers are only merged
    (mean or max) when the result tables are built.

    NOTE(review): the values are used directly as the level-0 speaker and fed
    to ``log_softmax``, so they are presumably log-likelihoods -- confirm
    against the code that produces the pickles.
    """

    def __init__(
            self,
            lm_probas: np.ndarray,
            num_models: int,
            candidates: List[str],
            source_texts: List[str],
            batch_size: int = 32,
            rationality: float = 1,
            device="cuda",
    ):
        """Store the likelihood tensor and the labels of its first two axes.

        Args:
            lm_probas: array of shape ``[len(source_texts), len(candidates),
                n_models]``; converted with ``torch.Tensor`` (float32).
            num_models: number of layers on the last axis to use.
            candidates: candidate summaries (column labels).
            source_texts: source documents (row labels).
            batch_size: stored for API compatibility; not used by this class.
            rationality: multiplier applied to the listener scores before the
                speaker normalisation.
            device: stored only; the tensor is NOT moved to this device, so
                all computation happens wherever ``torch.Tensor`` placed it.

        Raises:
            ValueError: if the tensor has fewer than three dimensions or
                fewer than ``num_models`` layers on its last axis.
        """
        self.num_models = num_models
        self.device = device
        self.likelihood_matrix = torch.Tensor(lm_probas)

        if (
            self.likelihood_matrix.dim() < 3
            or self.likelihood_matrix.shape[-1] < num_models
        ):
            raise ValueError(
                "lm_probas must have shape [source_texts, candidates, models] "
                f"with at least {num_models} models, got shape "
                f"{tuple(self.likelihood_matrix.shape)}"
            )

        self.candidates = candidates
        self.source_texts = source_texts

        self.batch_size = batch_size
        self.rationality = rationality

        # Level-0 speaker.  Set here (not lazily in mk_listener_dataframe) so
        # that S() and L() can be called directly on a fresh instance.
        self.initial_speaker_probas = self.likelihood_matrix

        # Per-instance memoisation.  A functools.cache on the methods would
        # key on `self` in a process-wide cache and keep every instance (and
        # its tensors) alive for the lifetime of the program.
        self._speaker_cache = {}
        self._listener_cache = {}

    def S(self, t):
        """Return the level-``t`` speaker scores (log space).

        ``S(0)`` is the raw likelihood tensor.  For ``t > 0`` the listener of
        level ``t - 1`` is scaled by ``rationality`` and normalised over the
        candidate axis, separately for every model layer.
        """
        if t not in self._speaker_cache:
            if t == 0:
                speaker = self.initial_speaker_probas
            else:
                listener = self.L(t - 1)[..., : self.num_models]
                # Candidates sit on the second-to-last axis of the full
                # tensor, i.e. the last axis of each per-model slice.
                speaker = torch.log_softmax(listener * self.rationality, dim=-2)
            self._speaker_cache[t] = speaker
        return self._speaker_cache[t]

    def L(self, t):
        """Return the level-``t`` listener scores (log space).

        The level-``t`` speaker is normalised over the source-text axis,
        separately for every model layer.
        """
        if t not in self._listener_cache:
            speaker = self.S(t)[..., : self.num_models]
            # Source texts sit on the third-to-last axis of the full tensor,
            # i.e. the second-to-last axis of each per-model slice.
            self._listener_cache[t] = torch.log_softmax(speaker, dim=-3)
        return self._listener_cache[t]

    @staticmethod
    def _aggregate(scores, agg_method):
        """Collapse the model axis (last) of ``scores`` into a numpy array."""
        if agg_method == "mean":
            reduced = scores.mean(dim=-1)
        elif agg_method == "max":
            reduced = scores.max(dim=-1).values
        else:
            raise ValueError(f"agg_method must be either 'mean' or 'max', got: {agg_method}")
        return reduced.cpu().numpy()

    def _labelled_frame(self, values):
        """Wrap a ``[source_text, candidate]`` array in a labelled DataFrame."""
        return pd.DataFrame(values, index=self.source_texts, columns=self.candidates)

    def mk_listener_dataframe(self, t, agg_method):
        """Run the recursion up to level ``t`` and tabulate the results.

        Args:
            t: recursion depth.
            agg_method: ``"mean"`` or ``"max"``; how the model layers are
                merged into a single score.

        Returns:
            A 6-tuple ``(listener_df, speaker_df, initial_listener_probas,
            initial_speaker_probas, None, consensuality_scores)``.  The four
            DataFrames are indexed by source text with one column per
            candidate; ``consensuality_scores`` is a Series indexed by
            candidate.  The fifth slot is always ``None``.

        Raises:
            ValueError: if ``agg_method`` is neither ``"mean"`` nor ``"max"``.
        """
        # Fail fast, before any tensor work is done.
        if agg_method not in ("mean", "max"):
            raise ValueError(f"agg_method must be either 'mean' or 'max', got: {agg_method}")

        listener_t = self.L(t)
        speaker_t = self.S(t)

        # KL divergence between each candidate's listener distribution over
        # the source texts and the uniform distribution, per model layer.
        log_uniform = torch.log(torch.ones_like(listener_t) / len(self.source_texts))
        divergence = (torch.exp(listener_t) * (listener_t - log_uniform)).sum(0)

        consensuality_scores = pd.Series(
            self._aggregate(divergence, agg_method), index=self.candidates
        )

        initial_listener_probas = self._labelled_frame(
            self._aggregate(self.L(0), agg_method)
        )
        initial_speaker_probas = self._labelled_frame(
            self._aggregate(self.S(0), agg_method)
        )

        # The level-t tables have always been float64 (they used to go
        # through .tolist()); the cast keeps that dtype.
        listener_df = self._labelled_frame(
            self._aggregate(listener_t, agg_method).astype(np.float64)
        )
        speaker_df = self._labelled_frame(
            self._aggregate(speaker_t, agg_method).astype(np.float64)
        )

        return (
            listener_df,
            speaker_df,
            initial_listener_probas,
            initial_speaker_probas,
            None,
            consensuality_scores,
        )

    def rerank(self, t=1, agg_method="mean"):
        """Pick the best candidate for every source text.

        Args:
            t: recursion depth passed to :meth:`mk_listener_dataframe`.
            agg_method: ``"mean"`` or ``"max"`` aggregation over models.

        Returns:
            An 8-tuple ``(best_rsa, best_base, speaker_df, listener_df,
            initial_listener_proba, initial_speaker_proba,
            initial_consensuality_score, consensuality_scores)`` where
            ``best_rsa`` is the arg-max candidate of the level-``t`` speaker
            for each source text, ``best_base`` the arg-max candidate of the
            level-0 listener, and ``initial_consensuality_score`` is always
            ``None``.
        """
        (
            listener_df,
            speaker_df,
            initial_listener_proba,
            initial_speaker_proba,
            initial_consensuality_score,
            consensuality_scores,
        ) = self.mk_listener_dataframe(t=t, agg_method=agg_method)

        best_rsa = speaker_df.idxmax(axis=1).values
        best_base = initial_listener_proba.idxmax(axis=1).values

        return (
            best_rsa,
            best_base,
            speaker_df,
            listener_df,
            initial_listener_proba,
            initial_speaker_proba,
            initial_consensuality_score,
            consensuality_scores,
        )

In [None]:
import pickle as pk
import numpy as np
from pathlib import Path

# Experiment settings.
PROBAS_DIR = Path("data/vector_probas")    # input: one pickle per dataset
RESULTS_DIR = Path("data/vector_results")  # output: one CSV per dataset/aggregation/variant
EXPECTED_NUM_SAMPLES = 226                 # sanity check on every pickle
NUM_MODELS = 5                             # model layers stored in each likelihood cell
RSA_ITERATIONS = 3                         # recursion depth passed to rerank()
TOP_K = 3                                  # sentences taken for each half of a summary
RESULT_COLUMNS = ['id', 'summary', 'text', 'gold']

for pkl_path in PROBAS_DIR.glob('*.pkl'):
    # The dataset prefix is everything in the file name before "all"; the
    # extractive file has no prefix and is labelled explicitly.  Using
    # Path.name (instead of splitting on "\\") works on every OS.
    dataset_name = pkl_path.name.split('all')[0] or 'extr_'

    # NOTE: unpickling can execute arbitrary code -- only load files that
    # were produced by this project.
    with open(pkl_path, 'rb') as handle:
        agg_lm_probas = pk.load(handle)

    samples = agg_lm_probas['results']
    if len(samples) != EXPECTED_NUM_SAMPLES:
        raise ValueError(
            f"{pkl_path}: expected {EXPECTED_NUM_SAMPLES} samples, got {len(samples)}"
        )

    for aggregation_method in ["mean", "max"]:
        resultsGS_file = RESULTS_DIR / f"{dataset_name}vector_{aggregation_method}_GS.csv"
        resultsGU_file = RESULTS_DIR / f"{dataset_name}vector_{aggregation_method}_GU.csv"

        # Collect the per-sample frames and concatenate once at the end
        # instead of growing a DataFrame inside the loop.
        gs_frames = []
        gu_frames = []

        for sample in samples:
            lm_probas_concat = sample['lm_probas_concat']
            candidates = lm_probas_concat.columns.tolist()
            source_texts = lm_probas_concat.index.tolist()
            # Every cell holds one score per model; stack the cells into a
            # dense [source_text, candidate, model] array.
            lm_probas = np.array(
                [[np.array(cell) for cell in row] for row in lm_probas_concat.values]
            )

            (
                best_rsa,
                best_base,
                speaker_df,
                listener_df,
                initial_listener_proba,
                initial_speaker_proba,
                _,
                consensuality_score,
            ) = VectorRSAReranking(
                num_models=NUM_MODELS,
                lm_probas=lm_probas,
                device='cpu',
                candidates=candidates,
                source_texts=source_texts,
            ).rerank(t=RSA_ITERATIONS, agg_method=aggregation_method)

            sample_id = sample['id']
            gold = sample['gold']

            # Lowest divergence from uniform = most consensual candidates;
            # highest = most divisive ones.
            consensus_samples = consensuality_score.sort_values(ascending=True).head(TOP_K).index.tolist()
            disensus_samples = consensuality_score.sort_values(ascending=False).head(TOP_K).index.tolist()

            consensus = ".".join(consensus_samples)
            disensus = ".".join(disensus_samples)
            rsa = ".".join(best_rsa.tolist()[:TOP_K])

            summaryGU = consensus + "\n\n" + disensus
            summaryGS = consensus + "\n\n" + rsa

            text = " ".join(source_texts)

            gs_frames.append(
                pd.DataFrame({'id': sample_id, 'summary': summaryGS, 'text': text, 'gold': gold})
            )
            gu_frames.append(
                pd.DataFrame({'id': sample_id, 'summary': summaryGU, 'text': text, 'gold': gold})
            )

        resultsGS = (
            pd.concat(gs_frames, ignore_index=True)
            if gs_frames else pd.DataFrame(columns=RESULT_COLUMNS)
        )
        resultsGU = (
            pd.concat(gu_frames, ignore_index=True)
            if gu_frames else pd.DataFrame(columns=RESULT_COLUMNS)
        )

        # Make sure the output folder exists before writing.
        RESULTS_DIR.mkdir(parents=True, exist_ok=True)
        resultsGS.to_csv(resultsGS_file, index=False)
        resultsGU.to_csv(resultsGU_file, index=False)

data\vector_probas\all_merged_226_extr_LED_Large_PEGASUS_Large_PEGASUS_BigBird_Arxiv.pkl
data\vector_probas\BART_all_merged_226_abstr_LED_Large_PEGASUS_Large_PEGASUS_BigBird.pkl
data\vector_probas\PEGASUS_Arxiv_all_merged_226_abstr_LED_Large_PEGASUS_Large_PEGASUS.pkl
data\vector_probas\PEGASUS_Large_all_merged_226_abstr_LED_Large_PEGASUS_Large_PEGASUS.pkl
