dataset = "dialogsum" | "samsum"

dataset_type = "train" | "test" | "validation"

In [None]:
# Mount Google Drive at /content/drive (Colab-only); the shell copy cells in
# this notebook read from and write to paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the merged COMET file(s) from Drive into the working directory.
# NOTE(review): the glob is hard-coded to "train"; other `dataset_type` values
# configured later in the notebook would need their files copied separately.
!cp /content/drive/MyDrive/NLP_project/comet_train_merged_*.json .

In [None]:
%%bash
# Pinned to the versions recorded in this cell's last successful run so that
# re-running the notebook installs the same libraries.
pip install sentence-transformers==2.3.1
pip install datasets==2.17.1

Collecting sentence-transformers
  Downloading sentence_transformers-2.3.1-py3-none-any.whl (132 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 132.8/132.8 kB 1.8 MB/s eta 0:00:00
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.3.1
Collecting datasets
  Downloading datasets-2.17.1-py3-none-any.whl (536 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 536.7/536.7 kB 3.5 MB/s eta 0:00:00
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.3/116.3 kB 15.3 MB/s eta 0:00:00
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 134.8/134.8 kB 16.1 MB/s eta 0:00:00
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.17.1 dill-0.3.8 multiprocess-0.70.16


In [None]:
from datasets import load_dataset
import json, pandas as pd, numpy as np
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
import matplotlib.pyplot as plt

In [None]:
# --- Run configuration -------------------------------------------------------
# Which corpus the summaries come from; the loading cell accepts "samsum" or "dialogsum".
dataset = "samsum"
# Split name; used in the file names below and to index the loaded samsum dataset.
dataset_type = "train"
# When True, the final cell writes the selected commonsense entries and the pickle to disk.
save_result_comets = True
# Extra tag that is part of both the input COMET file name and the output file names.
extra_file_name = "pt2"

# Merged COMET generations to score (read as JSON).
FILTERED_COMET_FILE_PATH = f"./comet_{dataset_type}_merged_{extra_file_name}.json"
# DialogSum JSON-lines file; only read when dataset == "dialogsum".
DIALOGSUM_FILE_PATH =  f"./dialogsum.{dataset_type}.jsonl"
# Folder that receives the generated .json / .pkl outputs.
FILE_OUTPUT_FOLDER = "."

In [None]:
# Load the merged COMET generations. `comet` is reset first so a failed load
# cannot leave a value from an earlier run in the kernel.
comet = None
# JSON is read as UTF-8 explicitly rather than with the platform default encoding.
with open(FILTERED_COMET_FILE_PATH, encoding="utf-8") as f:
    comet = json.load(f)

In [None]:
# Build `summary_dict`: sample id -> reference summary for the selected dataset.
summary_dict = None
dilaogsum = None  # (sic) spelling kept in case other cells refer to this name
if dataset == "samsum":
    samsum = load_dataset('samsum')
    summary_dict = {obj["id"]: obj["summary"] for obj in samsum[dataset_type]}
elif dataset == "dialogsum":
    dilaogsum = pd.read_json(path_or_buf=DIALOGSUM_FILE_PATH, lines=True)
    # Pair the two columns directly instead of iterating the frame row by row.
    summary_dict = dict(zip(dilaogsum["fname"], dilaogsum["summary"]))
else:
    raise NotImplementedError(
        f"Unsupported dataset {dataset!r}; expected 'samsum' or 'dialogsum'"
    )

In [None]:
# Flatten the nested COMET structure into one record per commonsense string.
# `comet` maps a sample id to a list of per-sentence dicts; inside each dict,
# only list-valued entries are expanded (the "sentence" entry is read separately).
cs_unraveled = []
for sample_id, sample_cs in comet.items():
    for sentence_num, sentence_obj in enumerate(sample_cs):
        for cs_type, cs_values in sentence_obj.items():
            if not isinstance(cs_values, list):
                continue
            for cs in cs_values:
                cs_unraveled.append({
                    "sample_id": sample_id,
                    "sentence_id": sentence_num,
                    "sentence": sentence_obj["sentence"],
                    "cs_type": cs_type,
                    "cs": cs,
                    "summary": summary_dict[sample_id],
                })

In [None]:
# Sanity check: the flattened list should hold exactly 25 commonsense records
# for every sentence in `comet` (displays True/False).
len(cs_unraveled) == 25 * sum(len(sentences) for sentences in comet.values())

### Model details:
https://github.com/UKPLab/sentence-transformers

In [None]:
# No explicit device: sentence-transformers then uses a GPU when one is
# available and falls back to CPU otherwise, instead of failing on runtimes
# without CUDA.
sbert = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Show the first flattened record to eyeball its fields.
cs_unraveled[0]

In [None]:
# Every sentence and summary is shared by many commonsense rows (and commonsense
# strings may repeat), so each distinct text is encoded once and reused instead
# of calling the model three times per row.
embedding_cache = {}


def _embed(text):
    """Return the SBERT embedding of `text`, encoding each distinct string only once."""
    if text not in embedding_cache:
        embedding_cache[text] = sbert.encode(text)
    return embedding_cache[text]


cs_final = []
for cs_obj in tqdm(cs_unraveled):
    cs_emb = _embed(cs_obj["cs"])
    summary_emb = _embed(cs_obj["summary"])
    sentence_emb = _embed(cs_obj["sentence"])
    # The misspelled keys are kept verbatim: later cells select columns by
    # these exact names. Rows with the same text share one array object, so
    # copy an embedding before mutating it in place.
    cs_final.append({
        **cs_obj,
        "sentence_encoded": sentence_emb,
        "cs_encoded": cs_emb,
        "sumary_encoded": summary_emb,
        "cos_similary_cs_summmary": float(util.cos_sim(cs_emb, summary_emb)),
        "cos_similary_cs_sentence": float(util.cos_sim(cs_emb, sentence_emb)),
    })

In [None]:
# One row per commonsense string, with its embeddings and both cosine similarities.
cs_dp = pd.DataFrame(cs_final)

In [None]:
# Preview the first two rows (all columns).
cs_dp.head(2)

In [None]:
# "Ours": for each (sample, sentence) pair keep the row whose commonsense
# string has the highest cosine similarity to the summary.
best_cs_per_sentence_ours = cs_dp.loc[
    cs_dp.groupby(["sample_id", "sentence_id"])["cos_similary_cs_summmary"].idxmax()
]

In [None]:
# "Theirs": for each (sample, sentence) pair keep the row whose commonsense
# string has the highest cosine similarity to the sentence itself.
best_cs_per_sentence_theirs = cs_dp.loc[
    cs_dp.groupby(["sample_id", "sentence_id"])["cos_similary_cs_sentence"].idxmax()
]

In [None]:
# Winning (maximum) similarity per sentence under each selection criterion.
peaks_ours = best_cs_per_sentence_ours["cos_similary_cs_summmary"]
peaks_theirs = best_cs_per_sentence_theirs["cos_similary_cs_sentence"]

In [None]:
# Mean and standard deviation of the per-sentence peak similarity to the summary.
np.average(peaks_ours), np.std(peaks_ours)

In [None]:
# Mean and standard deviation of the per-sentence peak similarity to the sentence.
np.average(peaks_theirs), np.std(peaks_theirs)

In [None]:
# How often each commonsense relation type wins under the summary-similarity
# criterion; titled so it can be told apart from the sentence-similarity chart.
best_cs_per_sentence_ours['cs_type'].value_counts().plot.bar(
    title="Winning relation type per sentence (ours: closest to summary)",
    ylabel="number of sentences",
)

In [None]:
# How often each commonsense relation type wins under the sentence-similarity
# criterion; titled so it can be told apart from the summary-similarity chart.
best_cs_per_sentence_theirs['cs_type'].value_counts().plot.bar(
    title="Winning relation type per sentence (theirs: closest to sentence)",
    ylabel="number of sentences",
)

In [None]:

# Compare the two peak-similarity series over the first `max_drow` sentences.
max_drow = 90  # number of leading entries to draw
ours_head = peaks_ours[:max_drow]
theirs_head = peaks_theirs[:max_drow]

fig, ax = plt.subplots()
ax.plot(range(len(ours_head)), ours_head, label="ours")
ax.plot(range(len(theirs_head)), theirs_head, label="theirs", linestyle="--")
ax.set_title("peak similarity per sentence compared")
ax.set_xlabel(f"sentence (first {max_drow})")
ax.set_ylabel("peak cosine similarity")
ax.legend()
plt.show()

In [None]:
def convert_back_to_dict(df):
    """Turn a frame of selected commonsense rows into a nested dict.

    The result maps str(sample_id) -> str(sentence_id) ->
    {"sentence", "relation", "out"}, filled from the "sentence", "cs_type"
    and "cs" columns. When two rows share the same sample and sentence ids,
    the later row replaces the earlier one.
    """
    converted_df = {}
    for _, row in df.iterrows():
        sample_key = str(row["sample_id"])
        sentence_key = str(row["sentence_id"])
        entry = {"sentence": row["sentence"], "relation": row["cs_type"], "out": row["cs"]}
        # Create the per-sample dict on first sight, then add or overwrite the sentence slot.
        converted_df.setdefault(sample_key, {})[sentence_key] = entry
    return converted_df

In [None]:
def save_files(converted_dfs_and_paths):
    """Write each (object, path) pair to `path` as JSON indented by two spaces.

    Existing files at the given paths are overwritten.
    """
    for payload, destination in converted_dfs_and_paths:
        with open(destination, "w") as handle:
            serialized = json.dumps(payload, indent=2)
            handle.write(serialized)

In [None]:
# Persist the results only when enabled in the configuration cell.
if save_result_comets:
    # Best commonsense entry per sentence under each criterion, as nested JSON.
    converted_dfs_and_paths = [
        (
            convert_back_to_dict(best_cs_per_sentence_ours),
            f"{FILE_OUTPUT_FOLDER}/comet_{dataset}_{dataset_type}_{extra_file_name}_z_ours.json",
        ),
        (
            convert_back_to_dict(best_cs_per_sentence_theirs),
            f"{FILE_OUTPUT_FOLDER}/comet_{dataset}_{dataset_type}_{extra_file_name}_z_theirs.json",
        ),
    ]
    save_files(converted_dfs_and_paths)

    # Full table (without the sentence/summary embeddings) for later analysis.
    pickled_columns = [
        "sample_id", "sentence_id", "sentence", "cs_type", "cs", "cs_encoded",
        "summary", "cos_similary_cs_summmary", "cos_similary_cs_sentence",
    ]
    cs_dp[pickled_columns].to_pickle(
        f"{FILE_OUTPUT_FOLDER}/comet_{dataset}_{dataset_type}_{extra_file_name}_z_entire.pkl"
    )




In [None]:
# Copy every .pkl and .json file in the working directory to the Drive project
# folder (the glob also matches any .json inputs present in this directory).
!cp ./*.pkl /content/drive/MyDrive/NLP_project/
!cp ./*.json /content/drive/MyDrive/NLP_project/