# load summarization model

## load model

In [None]:
!pip install transformers
!pip install torch transformers

from transformers import BartTokenizer, BartForConditionalGeneration

# Hugging Face checkpoint id shared by the tokenizer and the model below.
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Request per-step scores by default; the generate() calls in later cells
# also pass output_scores=True explicitly.
model.config.output_scores = True
import numpy as np
import torch
from torch.nn.functional import softmax
import random

In [2]:
# Greedy search: use a single beam.
model.config.num_beams = 1

In [3]:
# Set the generation length penalty to 1.0.
model.config.length_penalty = 1.0

In [4]:
# Turn the early_stopping generation option off.
model.config.early_stopping = False

In [None]:
# Article used as the summarization input throughout the notebook.
# NOTE(review): the string contains a stray backtick after "finishing her";
# it looks like a typo but is left untouched because it is model input.
text = "LET’S GET THIS out of the way: Dua Lipa is finishing her` third album. It’s due for release in 2024 and, despite the trend of musicians announcing and delaying records for years, Lipa will almost certainly meet her deadline. It’s funny to think of a pop star — or any successful young artist — as just another striving professional. But at 27, Lipa has already become the kind of multihyphenate entrepreneur who not only finishes her assignments on time but discusses strategy and efficiency with the clarity of a company founder delivering a TED Talk. “If I wasn’t as organized as I am, I would be a mess right now,” she says when we meet one drizzly May afternoon in London. The singer had asked one of her favorite restaurants, Sushi on Jones, hidden on the second floor of a King’s Cross concert venue, to open before dinner so we could have the place to ourselves, then arrived 10 minutes early to make sure everything was as planned."
# Tokenize to PyTorch tensors; add_special_tokens=False is passed, so the
# tokenizer is asked not to add its special tokens around the text.
inputs = tokenizer(text, return_tensors="pt",add_special_tokens=False)
# Generate with max_new_tokens=20 / min_length=20 and keep the per-step scores
# (return_dict_in_generate=True, output_scores=True) for the cells below.
generation_output = model.generate(**inputs, max_new_tokens=20, min_length=20, return_dict_in_generate=True,output_scores=True)

In [6]:
# Decode the first generated sequence back to text, dropping special tokens.
partial_summary = tokenizer.decode(generation_output.sequences[0] ,skip_special_tokens=True)
# Show the Python type of the decoded value, then the text itself.
print(type(partial_summary))
print(partial_summary)

<class 'str'>
Dua Lipa is finishing her third album. It’s due for release in


In [12]:
# Seed step for the sequential loops: generate a short continuation whose ids
# the next cell reuses as the starting prefix.
original_text = text
# Model input = article, the tokenizer's EOS token, then the summary so far.
input_text = f"{original_text} {tokenizer.eos_token} {partial_summary}"
input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True)
# max_new_tokens=3 / min_length=3; scores are kept alongside the sequences.
generated_ids = model.generate(input_ids, max_new_tokens=3, min_length=3, return_dict_in_generate=True,output_scores=True, num_return_sequences=1)
generated_summary = tokenizer.decode(generated_ids.sequences[0], skip_special_tokens=True)
print("Generated Summary:", generated_summary)

Generated Summary: D


# Sequential sampling

## greedy sequential sampling

In [13]:
# Generating text sequentially (by choosing the max-probability token)
original_text = text
partial_summary = generated_summary
# Everything except the last id of the previous generation is the accepted prefix.
partial_summary_ids = generated_ids.sequences[0][:-1]

for i in range(7):
  # Ask for one more new token on every round.
  gen_num = 4+i
  # Model input = article, the tokenizer's EOS token, then the summary so far.
  input_text = f"{original_text} {tokenizer.eos_token} {partial_summary}"
  input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True)

  generated_ids = model.generate(input_ids, max_new_tokens=gen_num, min_length=gen_num, return_dict_in_generate=True,output_scores=True, num_return_sequences=1)
  # Scores of the second-to-last generation step, turned into probabilities.
  probs = softmax(generated_ids.scores[-2][0],dim=-1)
  # Named next_token_id rather than `next` so the builtin next() is not
  # shadowed for the rest of the notebook.
  next_token_id = torch.argmax(probs).item()
  # Overwrite the generated ids with the accepted prefix plus the chosen token.
  generated_ids.sequences[0][:-2] = partial_summary_ids
  generated_ids.sequences[0][-2] = next_token_id
  partial_summary_ids = generated_ids.sequences[0][:-1]

  generated_summary = tokenizer.decode(generated_ids.sequences[0], skip_special_tokens=True)
  partial_summary = generated_summary
  print("Generated Summary:", generated_summary)

Generated Summary: Dua
Generated Summary: Dua Lip
Generated Summary: Dua Lipa
Generated Summary: Dua Lipa is
Generated Summary: Dua Lipa is finishing
Generated Summary: Dua Lipa is finishing her
Generated Summary: Dua Lipa is finishing her third


## greedy sequential sampling with hard watermark

In [14]:
# Get the model's vocabulary
# get_vocab() is called once so `keys` and `values` come from the same
# mapping and are guaranteed to be aligned position by position.
token_to_id = tokenizer.get_vocab()
keys = list(token_to_id.keys())
values = list(token_to_id.values())
# Reverse mapping: token id -> token string.
vocab = dict(zip(values, keys))

In [15]:
def give_watermarked(token_id, gama=0.5):
  """Split the vocabulary ids into a (green, red) pair seeded by ``token_id``.

  The ids come from the notebook-level ``values`` list. They are shuffled
  with a generator seeded by ``int(token_id)``, so the same token always
  yields the same split.

  Args:
    token_id: value convertible with ``int()``; used as the shuffle seed.
    gama: fraction of the shuffled ids that goes into the green list.

  Returns:
    ``(green, red)``: two lists; ``green`` holds the first
    ``int(len(values) * gama)`` shuffled ids and ``red`` holds the rest.
  """
  # NOTE(review): every id is shifted down by one, so the result contains -1
  # and never contains the largest id. Presumably this keeps the indices
  # inside the model's logits vector -- confirm before changing it.
  v = [i-1 for i in values]

  # A private generator gives exactly the same shuffle as
  # random.seed(seed); random.shuffle(v) but leaves the global RNG state alone.
  random.Random(int(token_id)).shuffle(v)

  green_portion = int(len(v) * gama)

  green = v[:green_portion]
  red = v[green_portion:]

  return (green, red)

In [18]:
# Seed step for the hard-watermark loop: same recipe as the earlier seed cell.
# NOTE(review): `partial_summary` is whatever an earlier cell assigned last
# (presumably the final summary of the previous loop) -- confirm this hidden
# state is intended.
original_text = text
# Model input = article, the tokenizer's EOS token, then the summary so far.
input_text = f"{original_text} {tokenizer.eos_token} {partial_summary}"
input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True)
# max_new_tokens=3 / min_length=3; scores are kept alongside the sequences.
generated_ids = model.generate(input_ids, max_new_tokens=3, min_length=3, return_dict_in_generate=True,output_scores=True, num_return_sequences=1)
generated_summary = tokenizer.decode(generated_ids.sequences[0], skip_special_tokens=True)
print("Generated Summary:", generated_summary)

Generated Summary: D


In [19]:
# Generating text sequentially with a hard watermark (by choosing the max-probability token)
original_text = text
partial_summary = generated_summary
# Everything except the last id of the previous generation is the accepted prefix.
partial_summary_ids = generated_ids.sequences[0][:-1]

for i in range(7):
  # The last accepted token seeds the green/red split for the next token.
  watermark_token = partial_summary_ids[-1]

  # Ask for one more new token on every round.
  gen_num = 4+i
  input_text = f"{original_text} {tokenizer.eos_token} {partial_summary}"
  input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True)

  generated_ids = model.generate(input_ids, max_new_tokens=gen_num, min_length=gen_num, return_dict_in_generate=True,output_scores=True, num_return_sequences=1)
  # Scores of the second-to-last generation step.
  logit = generated_ids.scores[-2][0]
  indices_to_mask = give_watermarked(watermark_token, gama=0.5)[1]
  # Hard watermark: forbid every red-list token. A single indexed assignment
  # replaces the per-element Python loop over the whole red list.
  logit[indices_to_mask] = float('-inf')
  probs = softmax(logit,dim=-1)
  # Named next_token_id rather than `next` so the builtin next() is not
  # shadowed for the rest of the notebook.
  next_token_id = torch.argmax(probs).item()
  # Overwrite the generated ids with the accepted prefix plus the chosen token.
  generated_ids.sequences[0][:-2] = partial_summary_ids
  generated_ids.sequences[0][-2] = next_token_id
  partial_summary_ids = generated_ids.sequences[0][:-1]

  generated_summary = tokenizer.decode(generated_ids.sequences[0], skip_special_tokens=True)
  partial_summary = generated_summary
  print("Generated Summary:", generated_summary)

Generated Summary: DUA
Generated Summary: DUA lip
Generated Summary: DUA lipo
Generated Summary: DUA lipo is
Generated Summary: DUA lipo is releasing
Generated Summary: DUA lipo is releasing the
Generated Summary: DUA lipo is releasing the`


## greedy sequential sampling with soft watermark

In [23]:
# Seed step for the soft-watermark loop: same recipe as the earlier seed cells.
# NOTE(review): `partial_summary` is whatever an earlier cell assigned last
# (presumably the final summary of the previous loop) -- confirm this hidden
# state is intended.
original_text = text
# Model input = article, the tokenizer's EOS token, then the summary so far.
input_text = f"{original_text} {tokenizer.eos_token} {partial_summary}"
input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True)
# max_new_tokens=3 / min_length=3; scores are kept alongside the sequences.
generated_ids = model.generate(input_ids, max_new_tokens=3, min_length=3, return_dict_in_generate=True,output_scores=True, num_return_sequences=1)
generated_summary = tokenizer.decode(generated_ids.sequences[0], skip_special_tokens=True)
print("Generated Summary:", generated_summary)

Generated Summary: D


In [24]:
# Generating text sequentially with a soft watermark (by choosing the max-probability token)
original_text = text
partial_summary = generated_summary
# Everything except the last id of the previous generation is the accepted prefix.
partial_summary_ids = generated_ids.sequences[0][:-1]

# Bias added to the score of every green-list token.
landa = 10
for i in range(7):
  # The last accepted token seeds the green/red split for the next token.
  watermark_token = partial_summary_ids[-1]

  # Ask for one more new token on every round.
  gen_num = 4+i
  input_text = f"{original_text} {tokenizer.eos_token} {partial_summary}"
  input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True)

  generated_ids = model.generate(input_ids, max_new_tokens=gen_num, min_length=gen_num, return_dict_in_generate=True,output_scores=True, num_return_sequences=1)
  # give_watermarked already returns a fresh list, so no extra copy is needed.
  green_indices = give_watermarked(watermark_token, gama=0.5)[0]
  # 1 at green positions, 0 elsewhere.
  mask = torch.zeros_like(generated_ids.scores[-2][0])
  mask[green_indices] = 1
  # Soft watermark: boost green tokens by `landa` instead of banning red ones.
  generated_ids.scores[-2][0] = generated_ids.scores[-2][0] + (mask * landa)
  probs = softmax(generated_ids.scores[-2][0],dim=-1)
  # Named next_token_id rather than `next` so the builtin next() is not
  # shadowed for the rest of the notebook.
  next_token_id = torch.argmax(probs).item()
  # Overwrite the generated ids with the accepted prefix plus the chosen token.
  generated_ids.sequences[0][:-2] = partial_summary_ids
  generated_ids.sequences[0][-2] = next_token_id
  partial_summary_ids = generated_ids.sequences[0][:-1]

  generated_summary = tokenizer.decode(generated_ids.sequences[0], skip_special_tokens=True)
  partial_summary = generated_summary
  print("Generated Summary:", generated_summary)

Generated Summary: DUA
Generated Summary: DUA lip
Generated Summary: DUA lipo
Generated Summary: DUA lipo is
Generated Summary: DUA lipo is releasing
Generated Summary: DUA lipo is releasing the
Generated Summary: DUA lipo is releasing the`


# Entropy calculation

In [25]:
import torch
# Entropy comparison only makes sense between values computed with a similar z.
# Larger z -> smaller spike; however, probabilities larger than 1/z are affected more,
# so a larger z is better for comparison.

def calculate_spike(P, z):
  """Return sum(P / (1 + z * P)) over all elements of ``P`` as a Python number.

  Args:
    P: torch tensor of values (the notebook passes probabilities).
    z: scalar modulus; each term is damped by ``1 + z * P``.

  Returns:
    The summed value extracted with ``.item()``.
  """
  damped = P / (z * P + 1)
  return damped.sum().item()

tensor([0.6667, 1.0000, 1.2000, 1.3333])
4.200000286102295


# Calculate z-score(& p-value)

In [31]:
import numpy as np
from scipy import stats

def calculate_z_score(green_tokens, text_length, gama=0.5):
  """Return the z-score and one-tailed p-value for a green-token count.

  The z-score is
  ``(green_tokens - gama*text_length) / sqrt(text_length*gama*(1-gama))``.

  Args:
    green_tokens: observed count compared against ``gama*text_length``.
    text_length: total count used for the expected value and the spread.
    gama: expected fraction of green tokens.

  Returns:
    ``(z_score, p_value)`` where ``p_value`` is the upper-tail probability
    of a standard normal at ``z_score``.
  """
  # z-score
  expected_green = gama*text_length
  std_dev = np.sqrt(text_length*gama*(1-gama))
  z_score = (green_tokens - expected_green) / std_dev

  # One-tailed (upper) test. The sign of z is kept: taking abs() here would
  # report a tiny p-value for text with far FEWER green tokens than chance.
  # sf() is 1 - cdf() computed without cancellation for large z.
  p_value = stats.norm.sf(z_score)

  return (z_score, p_value)

In [34]:
calculate_z_score(128, 200, gama=0.5)[0]

3.959797974644666

# Divide (& conquer)

In [None]:
# Picking the next
from torch.nn.functional import softmax
# NOTE(review): `f` is assigned here but not used anywhere in this cell.
f = ''
# Walk over the first 20 generation steps of `generation_output`.
for i in range(20):
  # Probabilities for step i of the first sequence in the batch.
  a = softmax(generation_output.scores[i][0],dim=-1)
  # NOTE(review): `next` shadows the builtin next() at notebook scope.
  next = torch.argmax(a).item()
  # Print the most probable token id wrapped in a 1-element tensor.
  print(torch.tensor([next]))
  # The decoded token text is stored but not printed; it is overwritten on
  # the following iteration.
  next = tokenizer.decode(next)

In [None]:
# For soft
# Demo of the soft-watermark arithmetic on the scores of generation step 6.
t = generation_output.scores[6][0]
print(t)
# Example indices that receive the boost.
l = [0,1,2]
# Size of the boost.
a = 10
# 1 at the selected indices, 0 elsewhere.
mask = torch.zeros_like(t)
mask[l] = 1
# Builds a new tensor; `t` itself is not modified by this expression.
result = t + (mask*a)
print(result)

tensor([-21.7627, -12.7303,     -inf,  ..., -12.8755, -13.3142, -12.9490])
tensor([-11.7627,  -2.7303,     -inf,  ..., -12.8755, -13.3142, -12.9490])


In [None]:
# For hard
# Demo of the hard-watermark masking on the scores of generation step 5.
# Work on a copy: indexing returns a view, so masking it in place would
# silently alter generation_output.scores for every other cell.
t = generation_output.scores[5][0].clone()
print(t)
# Example indices to forbid.
indices_to_mask = [1,3]
# One indexed assignment sets all selected scores to -inf.
t[indices_to_mask] = float('-inf')
s = softmax(t,dim=-1)
print(s)
# The masked positions end up with probability 0.
print(s[1])
print(s[3])

tensor([-19.6683, -13.2819,     -inf,  ..., -13.3650, -13.4223, -13.4817])
tensor([2.8720e-09, 0.0000e+00, 0.0000e+00,  ..., 1.5691e-06, 1.4818e-06,
        1.3963e-06])
tensor(0.)
tensor(0.)


In [None]:
# handling special tokens
# Strings of the tokenizer's special tokens (a dict_values view).
special_tokens = tokenizer.special_tokens_map.values()
# NOTE(review): this initial value is overwritten below before it is read.
token_to_check = "[SEP]"
# Most probable token at generation step 13, decoded to text.
a = softmax(generation_output.scores[13][0],dim=-1)
# NOTE(review): `next` shadows the builtin next() at notebook scope.
next = torch.argmax(a).item()
next = tokenizer.decode(next)
token_to_check = next
# Membership test compares the decoded text against the special-token strings.
is_special_token = token_to_check in special_tokens
print("Special Tokens:", special_tokens)
print(f"'{token_to_check}' is a special token: {is_special_token}")

Special Tokens: dict_values(['<s>', '</s>', '<unk>', '</s>', '<pad>', '<s>', '<mask>'])
' 2024' is a special token: False


# Dataset

In [None]:
! pip install transformers datasets
from transformers import BartTokenizer, BartForConditionalGeneration
from datasets import load_dataset
# Load the "xsum" dataset; the next cell reads its "test" split.
dataset = load_dataset("xsum")

In [None]:
# Summarize one example from the test split; the loop exits after the first
# iteration because of the `break` at the end of the body.
for example in dataset["test"]:
    input_text = example["document"]
    # Truncate the tokenized document to at most 1024 tokens.
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=1024, truncation=True)

    # Generate a summary
    # These keyword arguments are passed per call (num_beams=4, length_penalty=2.0,
    # early_stopping=True) and differ from the model.config values set earlier.
    summary_ids = model.generate(input_ids, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)

    # Decode and print the generated summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    print("Original Text:", input_text)
    print("Generated Summary:", summary)
    break

# Entropy of a sentence

# Watermark

## create hash & divide vocab

In [None]:
# Stand-alone demo of the green/red vocabulary split.
input_text = "Your input text goes here."

input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=1024, truncation=True)

# Copy of the vocabulary ids built in the earlier `values` cell.
v = values.copy()

# Seed a random number generator with the hash
# The "hash" here is simply the token id at position 3 of the encoded input.
seed = int(input_ids[0, 3])
# NOTE(review): this reseeds the global `random` module state.
random.seed(seed)

# Shuffle the vocabulary to randomize it
random.shuffle(v)

# Calculate the size of each set
total_tokens = len(v)
half_tokens = total_tokens // 2

# Divide the vocabulary into two sets with equal sizes
# With an odd total, the second set gets the extra element.
green = v[:half_tokens]
red = v[half_tokens:]

# Print some statistics
print("Total tokens in vocabulary:", total_tokens)
print("Tokens in Set 1:", len(green))
print("Tokens in Set 2:", len(red))

Total tokens in vocabulary: 50265
Tokens in Set 1: 25132
Tokens in Set 2: 25133


In [None]:
# Input text
input_text = "This is an example input."

# Tokenize the input text
input_ids = tokenizer.encode(input_text, return_tensors="pt")

# Generate output
output_ids = model.generate(input_ids)

# Decode the generated output
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Get the output scores
# NOTE(review): this is a separate forward pass on `input_ids`; the logits
# are not taken from the generate() call above -- confirm that is intended.
output_scores = model(input_ids).logits

print("Generated Output:", output_text)
print("Output Scores:", output_scores)


Generated Output: This is an example input. Use this input to help people understand how to use this code. Use the code to help others understand how this code can be used to help them with their own code. The code can also be used as an example to help other people understand their code.
Output Scores: tensor([[[11.1690,  0.5774,  2.6892,  ...,  0.2707,  0.1449,  0.0438],
         [11.1690,  0.5774,  2.6892,  ...,  0.2707,  0.1449,  0.0438],
         [-5.0483, -0.0825,  1.7771,  ..., -0.0807, -0.0811, -0.0668],
         ...,
         [-5.4172,  0.2053,  4.9584,  ...,  0.3211, -0.1325,  0.3382],
         [-7.1791, -0.1360,  3.4101,  ...,  0.2763, -0.2612,  0.3939],
         [-1.9115, -0.3598, 11.5676,  ...,  0.2852, -0.5289, -0.1428]]],
       grad_fn=<AddBackward0>)
