In [1]:
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

In [2]:
# Read the pickled records into `data`.
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
dialogues_path = "papers_segmented_data/davinci_dialogues_full_postproc.pkl"
with open(dialogues_path, mode="rb") as pkl_handle:
    data = pickle.load(pkl_handle)

In [3]:
# Peek at the keys of the first record to see which fields each entry carries.
data[0].keys()

dict_keys(['text', 'dialogue', 'meta_segments', 'meta_paper', 'parsed_dialogue'])

In [4]:
# Number of records loaded from the pickle.
len(data)

3588

In [5]:
# For every record, flatten newlines in the source passage and pick up the
# summary stored under row["parsed_dialogue"]["summary"].
cleaned_pairs = [
    (record["text"].replace("\n", " "), record["parsed_dialogue"]["summary"])
    for record in tqdm(data)
]
text = [passage for passage, _ in cleaned_pairs]
summary = [reference for _, reference in cleaned_pairs]

  0%|          | 0/3588 [00:00<?, ?it/s]

In [6]:
import pandas as pd

In [7]:
# Pair each passage with its summary, then keep only rows whose summary is
# not the empty string.
paired = pd.DataFrame({"text": text, "summary": summary})
df = paired[paired["summary"] != ""]

In [11]:
# Hold out 10% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
train, validation = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
)
(train.shape, validation.shape)

((3204, 2), (356, 2))

In [12]:
# Write both splits to CSV without the pandas index column.
for split_frame, csv_path in (
    (train, "datasets/sum_train.csv"),
    (validation, "datasets/sum_val.csv"),
):
    split_frame.to_csv(csv_path, index=False)

### Token count estimation

In [13]:
import torch

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [14]:
# Load the training CSV through the `datasets` CSV loader under a "train" split.
csv_files = {"train": "datasets/sum_train.csv"}
train_hf = load_dataset("csv", data_files=csv_files)

Downloading and preparing dataset csv/default to /home/jovyan/.cache/huggingface/datasets/csv/default-80e54394b1bd255d/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /home/jovyan/.cache/huggingface/datasets/csv/default-80e54394b1bd255d/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [16]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Hub checkpoint loaded in the next cell for the token-count estimate.
model_name_or_path = "sshleifer/distilbart-cnn-12-6"

In [17]:
# Load the tokenizer and the seq2seq model for `model_name_or_path`, then move
# the model onto the selected device.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
model = model.to(device)

In [19]:
# Count how many tokens the tokenizer produces for every training passage and
# for every reference summary.
#
# The loop variables are deliberately NOT called `text`/`title`: the
# preprocessing cell keeps the full list of passages in a notebook-level
# variable named `text`, and looping with `for text, ...` would overwrite that
# list with a single string, breaking any later re-run of the DataFrame cell.
train_split = train_hf["train"]

num_tokens_text = []
num_tokens_summ = []

for passage, reference in tqdm(
    zip(train_split["text"], train_split["summary"]),
    total=len(train_split),
):
    num_tokens_text.append(len(tokenizer.encode(passage)))
    num_tokens_summ.append(len(tokenizer.encode(reference)))

  0%|          | 0/3204 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1042 > 1024). Running this sequence through the model will result in indexing errors


In [22]:
# Mean, median and 95th percentile of the per-passage token counts.
(
    np.mean(num_tokens_text),
    np.median(num_tokens_text),
    np.quantile(num_tokens_text, 0.95),
)

(626.3027465667915, 588.0, 1242.2499999999995)

In [23]:
# Mean, median and 95th percentile of the per-summary token counts.
(
    np.mean(num_tokens_summ),
    np.median(num_tokens_summ),
    np.quantile(num_tokens_summ, 0.95),
)

(60.16729088639201, 58.0, 98.84999999999991)

### Training cycles

In [None]:
# Fine-tuning run for facebook/bart-large-cnn on GPU 0, using the CSV splits
# datasets/sum_train.csv and datasets/sum_val.csv.
# Settings: 3 epochs, learning rate 2e-5, weight decay 0.01, batch size 4 per
# device for both train and eval, reporting to wandb as
# "bart-large-cnn_2e-5_final". Checkpoints go to bart_summarization/bart-large-cnn
# and --overwrite_output_dir replaces whatever is already there.
# NOTE(review): the flags are interpreted by custom_bart_scripts/run_summarization.py,
# which is not part of this notebook — confirm their exact semantics there.
!CUDA_VISIBLE_DEVICES=0 python custom_bart_scripts/run_summarization.py \
    --model_name_or_path="facebook/bart-large-cnn" \
    --do_train \
    --do_eval \
    --report_to="wandb" \
    --evaluation_strategy="steps" \
    --weight_decay=0.01 \
    --logging_steps=500 \
    --run_name="bart-large-cnn_2e-5_final" \
    --train_file="datasets/sum_train.csv" \
    --validation_file="datasets/sum_val.csv" \
    --output_dir="bart_summarization/bart-large-cnn" \
    --per_device_train_batch_size=4 \
    --per_device_eval_batch_size=4 \
    --max_target_length=228 \
    --learning_rate=2e-5 \
    --num_train_epochs=3 \
    --overwrite_output_dir \
    --predict_with_generate

In [None]:
# Fine-tuning run for sshleifer/distilbart-cnn-12-6 on GPU 0, using the same
# CSV splits as the bart-large-cnn run. The flags are identical except for the
# learning rate (1e-5 here), the wandb run name and the output directory
# (bart_summarization/distilbart-cnn-12-6_1e-5).
# NOTE(review): the flags are interpreted by custom_bart_scripts/run_summarization.py,
# which is not part of this notebook — confirm their exact semantics there.
!CUDA_VISIBLE_DEVICES=0 python custom_bart_scripts/run_summarization.py \
    --model_name_or_path="sshleifer/distilbart-cnn-12-6" \
    --do_train \
    --do_eval \
    --report_to="wandb" \
    --evaluation_strategy="steps" \
    --weight_decay=0.01 \
    --logging_steps=500 \
    --run_name="distilbart-cnn-12-6_1e-5" \
    --train_file="datasets/sum_train.csv" \
    --validation_file="datasets/sum_val.csv" \
    --output_dir="bart_summarization/distilbart-cnn-12-6_1e-5" \
    --per_device_train_batch_size=4 \
    --per_device_eval_batch_size=4 \
    --max_target_length=228 \
    --learning_rate=1e-5 \
    --num_train_epochs=3 \
    --overwrite_output_dir \
    --predict_with_generate

### Inference

In [40]:
from transformers import BartForConditionalGeneration, BartTokenizer

In [89]:
def _summarize(passage, tok, net, max_input_length=256):
    """Return the summary that `net` generates for `passage`.

    The passage is tokenized with `tok`, explicitly truncated to
    `max_input_length` tokens, moved to the device the model lives on, and
    decoded back to text without special tokens.
    """
    encoded = tok(
        [passage],
        max_length=max_input_length,
        truncation=True,  # without this, max_length only truncates implicitly (with a warning)
        return_tensors="pt",
    ).to(net.device)  # follow the model's device instead of a notebook-level global
    summary_ids = net.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
    )
    return tok.batch_decode(
        summary_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]


def generate_example(random_state=None, max_input_length=256):
    """Print one validation passage with both models' summaries and the reference.

    Draws a single row from the notebook-level `validation` frame, summarizes
    its text with (`tokenizer`, `model`) and with (`tokenizer1`, `model1`), and
    prints the row index, the passage, both predictions and the ground-truth
    summary. All five of those names must be defined before the call.

    Args:
        random_state: Optional seed forwarded to `DataFrame.sample`, so a
            particular example can be reproduced. `None` keeps the original
            unseeded behaviour.
        max_input_length: Maximum number of input tokens fed to each model;
            longer passages are truncated.
            NOTE(review): 256 is shorter than most passages in this dataset —
            confirm it matches the source length used during fine-tuning.

    Returns:
        None. The comparison is printed to stdout.
    """
    sample = validation.sample(n=1, random_state=random_state)
    # Select by column name so the function does not depend on column order.
    text = sample["text"].iloc[0]
    title = sample["summary"].iloc[0]

    pred = _summarize(text, tokenizer, model, max_input_length)
    pred1 = _summarize(text, tokenizer1, model1, max_input_length)

    print("INDEX:", sample.index[0])
    print(text)
    print("----\n")
    print("BART-LARGE-CNN:", pred)
    print("--" * 10)
    print("DISTILBART-CNN-12-6:", pred1)
    print("--" * 10)
    print("GROUND TRUTH:", title)

In [33]:
# Reload the validation split from the CSV written earlier. This rebinds
# `validation`, so the inference section reads from the file on disk rather
# than from the in-memory split.
validation = pd.read_csv("datasets/sum_val.csv")

In [34]:
# Display the reloaded validation frame (pandas abbreviates long frames).
validation

Unnamed: 0,text,summary
0,There has been some work on feature optimizati...,Previous work in feature optimization for depe...
1,"To generate scale-sensitive features, we need ...",We need to find filters that are active to the...
2,We now describe the fitting process in our sys...,Our system fits a rendering network to point c...
3,We presented global contrast based saliency co...,We have presented global contrast based salien...
4,This paper addresses the challenging black-box...,This paper proposes a simple baseline approach...
...,...,...
351,The following definitions and notations are us...,This paper presents a minimization model with ...
352,Our capturing method consists of sequentially ...,Our paper presents a method for capturing imag...
353,Baselines. We benchmark our talking-head model...,This paper evaluates the face redirection capa...
354,To evaluate the generalization performance of ...,This paper evaluates the generalization perfor...


In [63]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Local fine-tuned checkpoint; this rebinds `model_name_or_path`, which
# previously held a hub model id.
model_name_or_path = "bart_summarization/bart-large-cnn/checkpoint-1500"

In [64]:
# Load the tokenizer and model from `model_name_or_path` and move the model to
# the selected device. This rebinds the `tokenizer` and `model` names.
tokenizer = BartTokenizer.from_pretrained(model_name_or_path)
model = BartForConditionalGeneration.from_pretrained(model_name_or_path)
model = model.to(device)

In [72]:
# Second checkpoint for the side-by-side comparison, bound to the
# `tokenizer1` / `model1` names.
model_name_or_path1 = "bart_summarization/distilbart-cnn-12-6_1e-5/checkpoint-1500"

tokenizer1 = BartTokenizer.from_pretrained(model_name_or_path1)
model1 = BartForConditionalGeneration.from_pretrained(model_name_or_path1)
model1 = model1.to(device)

In [84]:
# Draw one random validation example and print it.
generate_example()

CIFAR. The two CIFAR datasets (Krizhevsky 2009) consist of colored natural images with a size of 32×32. CIFAR-10 is drawn from 10 and CIFAR-100 is drawn from 100 classes. In each dataset, the train and test sets contain 50,000 and 10,000 images, respectively. A standard data augmentation scheme 3 (Lee et al. 2015;Romero et al. 2015;Larsson, Maire, and Shakhnarovich 2016;Huang et al. 2017a;Liu et al. 2017) (Netzer et al. 2011) consists of 32×32 colored digit images, with one class for each digit. The train and test sets contain 604,388 and 26,032 images, respectively. Following previous works (Goodfellow et al. 2013;Huang et al. 2016;2017a;Liu et al. 2017), we split a subset of 6,000 images for validation, and train on the remaining images without data augmentation. ImageNet. The ILSVRC 2012 classification dataset (Deng et al. 2009) consists of 1000 classes, with a number of 1.2 million training images and 50,000 validation images. We adopt the the data augmentation scheme following (Kr

In [85]:
# Draw another random validation example.
generate_example()

We present and develop novel moments-based permutation tests where the permutation distributions are accurately approximated through Pearson distributions for considerably reduced computation cost. Comparing with regular random permutation, the proposed method considerably reduces computation cost without loss of accuracy. General and analytical formulations for the moments of permutation distribution are derived for weighted v-test statistics. The proposed strategy takes advantage of nonparametric permutation tests and parametric Pearson distribution approximation to achieve both accuracy/flexibility and efficiency. [(1,1) , (1,2), (1,2), (1,3), (2,3), (1,4)] , , , #( ) ( , ) ( , ) ( , ) ( , ) ( , ) ( , ) i j k l w w i i w i j w i j w i k w j k w i l . The permutation equivalent index subset is represented by an undirected graph. Every node denotes an index number. We connect two different nodes if these two corresponding index numbers are in the same index element, i.e., in the same 

In [90]:
# Draw a further random example; each call samples independently.
generate_example()



INDEX: 334
Datasets. We use 4 datasets arising from different domains: (a) Incart-ECG (Goldberger et al., 2000): Dataset of ECG timeseries from PhysioNet bank, annotated with heartbeat arrhythmias. We use one ECG lead. The task is to classify atrial (positive) vs. ventricular premature contractions (negative). Both are common arrhythmias that co-occur in   Results. Table 1 presents our main experimental results. We make the following observations: (1) Short-circuiting matters: The comparison of TLP to SWLP directly evaluates the effect of summarizing the stream by the star-mesh transform, as they are otherwise identical. As noticed in Table 1, it yields a substantial improvement in the accuracy on the temporallyordered datasets Incart-ECG, Daphnet-Gait, and CamVid-Car, with almost no effect on the running time. This corroborates the presumption that TLP is well suited for streams that adhere to a temporal vicinity structure as per Section 5.1. However, when there is no natural temporal