## Data exploration

In [1]:
import pandas as pd
import os
import re
import nltk

In [2]:
data_glimpse = "../../data/processed/"


dataset = pd.DataFrame()

for year in range (2017, 2022):
    sub_dataset = pd.read_csv(f"{data_glimpse}all_reviews_{year}.csv")
    dataset = pd.concat([dataset, sub_dataset])

dataset.tail()

Unnamed: 0,id,text,gold
11451,https://openreview.net/forum?id=QjINdYOfq0b,Summary: The paper presents a technique called...,The paper proposes to integrate multiple bit c...
11452,https://openreview.net/forum?id=POWv6hDd9XH,I couldn't follow the method described in the ...,This paper proposes a new method for post-trai...
11453,https://openreview.net/forum?id=POWv6hDd9XH,Post-training quantization is an important pro...,This paper proposes a new method for post-trai...
11454,https://openreview.net/forum?id=POWv6hDd9XH,This paper proposes BRECQ which is a new Post ...,This paper proposes a new method for post-trai...
11455,https://openreview.net/forum?id=POWv6hDd9XH,This paper explores the post-training inferenc...,This paper proposes a new method for post-trai...


In [3]:
dataset.dropna(subset=['gold'], inplace=True)

In [4]:
def preprocess_text(text: str) -> list:
    # Replace any set of successive dashes (e.g., --, ----, -----) with a newline
    text = re.sub(r'-{2,}', '\n', text)

    # Remove patterns like ".2-" or isolated numerics with hyphens
    text = re.sub(r'\.\d+-', '', text)

    # Replace multiple newlines or spaces with a single newline or space
    # Replace multiple newlines with one
    text = re.sub(r'\n+', '\n', text)
    # Replace multiple spaces with one
    text = re.sub(r'\s+', ' ', text)

    # Remove any remaining unwanted characters (e.g., control characters)
    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # To be discussed
    text = text.replace("\n", " ")

    sentences = nltk.sent_tokenize(text)

    # remove empty sentences
    sentences = [sentence for sentence in sentences if sentence != ""]
    return sentences

In [None]:
#nltk.download('punkt')
#nltk.download('punkt_tab')

dataset['text_processed'] = dataset['text'].apply(preprocess_text)
dataset['gold_processed'] = dataset['gold'].apply(preprocess_text)

dataset['len_text'] = dataset['text'].apply(lambda x: len(x))
dataset['len_gold'] = dataset['gold'].apply(lambda x: len(x))

dataset['len_text_sent'] = dataset['text_processed'].apply(lambda x: len(x))
dataset['len_gold_sent'] = dataset['gold_processed'].apply(lambda x: len(x))


[nltk_data] Downloading package punkt to C:\Users\Natalia
[nltk_data]     Lebedeva\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Natalia
[nltk_data]     Lebedeva\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [10]:
dataset[dataset['len_gold'] == 30]

Unnamed: 0,id,text,gold,text_processed,gold_processed,len_text,len_gold,len_text_sent,len_gold_sent
5908,https://openreview.net/forum?id=SyeKGgStDB,This paper proposes an RL agent for generating...,Paper is withdrawn by authors.,[This paper proposes an RL agent for generatin...,[Paper is withdrawn by authors.],1961,30,12,1
5909,https://openreview.net/forum?id=SyeKGgStDB,This paper presents a reinforcement learning a...,Paper is withdrawn by authors.,[This paper presents a reinforcement learning ...,[Paper is withdrawn by authors.],3665,30,32,1
5910,https://openreview.net/forum?id=SyeKGgStDB,The authors present the results of training a ...,Paper is withdrawn by authors.,[The authors present the results of training a...,[Paper is withdrawn by authors.],2972,30,18,1


In [22]:
dataset.describe()

Unnamed: 0,len_text,len_gold,len_text_sent,len_gold_sent
count,26276.0,26276.0,26276.0,26276.0
mean,2712.092366,836.400594,23.283719,6.423999
std,1700.358477,663.47774,14.786403,4.949147
min,22.0,30.0,1.0,1.0
25%,1586.0,411.0,14.0,3.0
50%,2323.0,666.0,20.0,5.0
75%,3389.0,1045.0,29.0,8.0
max,29777.0,7509.0,308.0,67.0


## Data filtering through dot products

In [53]:
from typing import Tuple

import numpy as np
import pandas as pd
import argparse
from pathlib import Path

import torch
from sentence_transformers import SentenceTransformer


def embed_text_and_summaries(df : pd.DataFrame, model : SentenceTransformer) -> Tuple[torch.Tensor, torch.Tensor]:

    text_embeddings = model.encode(df.text.tolist(), convert_to_tensor=True)
    # summary_embeddings = model.encode(df.summary.tolist(), convert_to_tensor=True)
    summary_embeddings = model.encode(df.gold.tolist(), convert_to_tensor=True)

    return text_embeddings, summary_embeddings


def compute_dot_products(df : pd.DataFrame, text_embeddings : torch.Tensor, summary_embeddings : torch.Tensor):

    df = df.reset_index()
    df['index'] = df.index

    # group by id
    grouped = df.groupby('id')

    # for each id gather the id of the text and the summary
    ids_per_sample = grouped.index.apply(list).tolist()

    # compute the dot product between the text and the summary

    metrics = {'proba_of_success' : []}
    for text_ids in ids_per_sample:
        # shape (num_text, embedding_dim)
        text_embedding = text_embeddings[text_ids]
        summary_embedding = summary_embeddings[text_ids]

        # shape (num_text, num_text=num_summary)
        dot_product = torch.matmul(text_embedding, summary_embedding.T)

        # apply log softmax
        log_softmax = torch.nn.functional.log_softmax(dot_product, dim=0)

        # num_text
        log_proba_of_success = torch.diag(log_softmax).squeeze()

        metrics['proba_of_success'].extend(log_proba_of_success.tolist())

    df['proba_of_success'] = metrics['proba_of_success']

    return df

def calculate_cossim(summaries_file, result_file, device='cuda'):
    # load the model
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2', device=device)
    # load the summaries
    df = pd.read_csv(summaries_file)
    # embedd the text and the summary
    text_embeddings, summary_embeddings = embed_text_and_summaries(df, model)
    df = compute_dot_products(df, text_embeddings, summary_embeddings)
    df.to_csv(result_file, index=False)

In [None]:
sum_file = "../../data/merged17-21_filtered_4_no_unique.csv"
res_file = "../../data/cossimresults/all_merged17-21cossim.csv"
calculate_cossim(sum_file, res_file, device='cpu')

In [89]:
cossim_res = pd.read_csv("../../data/cossimresults/all_merged17-21cossim.csv")
cossim_res.drop(columns=['index', 'gold_cleaned', 'gold_length', 'gold_word_count', 'gold_sentences', 'text_cleaned', 'text_length', 'text_word_count', 'text_sentences'], inplace=True)

In [90]:
cossim_res['proba_of_success'] = cossim_res.groupby('id')['proba_of_success'].transform('mean')
cossim_res.sort_values(by='proba_of_success', ascending=False, inplace=True)
checkdf = cossim_res[:200]
checkdf = checkdf[checkdf.groupby('id')['id'].transform('count') == 2]
checkdf.describe()

cossim_res.to_csv("../../data/cossimresults/all_merged17-21_cossim-sorted.csv")

In [None]:
cossim_res.head(30)

Unnamed: 0,id,text,gold,proba_of_success
5934,https://openreview.net/forum?id=Hkexw1BtDr,The paper introduces auto-deferring policies (...,This paper proposes a new way to formulate the...,-0.021927
5933,https://openreview.net/forum?id=Hkexw1BtDr,The paper proposes a Deep RL approach called A...,This paper proposes a new way to formulate the...,-0.021927
1383,https://openreview.net/forum?id=H1OQukZ0-,Summary of the paper--------------------------...,This paper presents an update to the method of...,-0.051171
1382,https://openreview.net/forum?id=H1OQukZ0-,# Summary of paper--------The paper proposes a...,This paper presents an update to the method of...,-0.051171
433,https://openreview.net/forum?id=Byiy-Pqlx,The Neural Turing Machine and related “externa...,The paper presents a Lie-(group) access neural...,-0.058358
434,https://openreview.net/forum?id=Byiy-Pqlx,*** Paper Summary ***----------------This pape...,The paper presents a Lie-(group) access neural...,-0.058358
5303,https://openreview.net/forum?id=rygixkHKDH,[Summary]-----This paper studies the problem o...,This paper investigates the use non-convex opt...,-0.072678
5302,https://openreview.net/forum?id=rygixkHKDH,This paper studies the dictionary learning pro...,This paper investigates the use non-convex opt...,-0.072678
14107,https://openreview.net/forum?id=lU5Rs_wCweN,This work aims at accelerating pre-training by...,The authors propose an approach for pre-traini...,-0.087223
14108,https://openreview.net/forum?id=lU5Rs_wCweN,Summary: This paper proposes a method for impr...,The authors propose an approach for pre-traini...,-0.087223
