In [1]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import T5ForConditionalGeneration, T5Tokenizer
from sentence_transformers import SentenceTransformer
import pandas as pd

import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

import numpy as np
import random
random.seed(0)

In [2]:
import os
os.chdir('/workspace/ligo_general/prompt')

In [3]:
class PromptDataset(Dataset):
    def __init__(self, data, tokenizer, max_source_length, max_target_length):
        self.original_text = data['original_text']
        self.rewritten_text = data['rewritten_text']
        self.prompt_text = data['rewrite_prompt']
        self.tokenizer = tokenizer
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length
        self.input_ids = []
        self.attention_mask = []
        self.labels = []
        for i in range(len(self.original_text)):
            original_text = self.original_text[i]
            rewritten_text = self.rewritten_text[i]
            prompt_text = str(self.prompt_text[i])
            task_prefix = f"Recover instruction for the text transformation: original text: {original_text}, transformed text: {rewritten_text}"
            encoding =  self.tokenizer(
                task_prefix,
                padding="max_length",
                max_length=self.max_source_length,
                truncation=True,
                return_tensors="pt",
            )
            input_ids, attention_mask = encoding.input_ids, encoding.attention_mask
            target_encoding = self.tokenizer(
                prompt_text,
                padding="max_length",
                max_length=self.max_target_length,
                truncation=True,
                return_tensors="pt",
            )
            labels = target_encoding.input_ids
            labels[labels == self.tokenizer.pad_token_id] = -100
            self.input_ids.append(input_ids.squeeze())
            self.attention_mask.append(attention_mask.squeeze())
            self.labels.append(labels.squeeze())
    
    def __len__(self):
        return len(self.original_text)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx],
        }

class PromptDecoderLightning(pl.LightningModule):
    def __init__(self, model, tokenizer, max_source_length, max_target_length):
        super().__init__()
        self.model = model
        self.tokenizer = tokenizer
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length

    def forward(self, input_ids, attention_mask, labels):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return outputs.loss

    def training_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        
        loss = self(input_ids, attention_mask, labels)
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)

        return loss

    def validation_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        
        loss = self(input_ids, attention_mask, labels)
        self.log("val_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
    
    def predict_step(self, batch, batch_idx, dataloader_idx=None):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']

        outputs = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=self.max_target_length)
        predicted_text = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
        return predicted_text
    
    def generate(self, original_text, rewritten_text):
        task_prefix = f"Recover instruction for text transformation: original text: {original_text}, transformed text: {rewritten_text}"
        encoding = self.tokenizer(
            task_prefix,
            padding="max_length",
            max_length=self.max_source_length,
            truncation=True,
            return_tensors="pt",
        )
        input_ids, attention_mask = encoding.input_ids, encoding.attention_mask

        outputs = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=self.max_target_length)
        predicted_text = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
        return predicted_text


    def configure_optimizers(self):
        return optim.AdamW(self.parameters(), lr=1e-4)

In [4]:
torch.cuda.empty_cache()
model = T5ForConditionalGeneration.from_pretrained('google/flan-t5-large')
tokenizer = T5Tokenizer.from_pretrained('google/flan-t5-large')

# Define the maximum source and target lengths
max_source_length = 512
max_target_length = 128

#model = PromptDecoderLightning(model, tokenizer, max_source_length, max_target_length)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
device = 'cuda'
#checkpoint = 'llm-prompt-recovery/ckpt/flant5-te-epoch=09-val_loss=0.44.ckpt'
checkpoint = 'ckpt/flant5-encoder-epoch=09-val_loss=0.44.ckpt'
model = PromptDecoderLightning.load_from_checkpoint(checkpoint, model=model, tokenizer=tokenizer, max_source_length=512, max_target_length = 128, map_location=torch.device(device))

In [6]:
df = pd.read_csv('data/mistral_df.csv')
# select the random 50 samples from row = 150 - 1500
df = df.iloc[:2000].sample(100, random_state=0)
# trim the dataset so each column has a maximum of 256 characters
df['original_text'] = df['original_text'].str.slice(0, 512)
df['rewritten_text'] = df['rewritten_text'].str.slice(0, 512)

df = df.reset_index()
dataset = PromptDataset(df, tokenizer, max_source_length, max_target_length)
val_loader = DataLoader(dataset, batch_size=16)

In [7]:
predictions = []
ground_truth = []

model.eval()
for i, batch in enumerate(val_loader):
    outputs = model.model.generate(batch['input_ids'].to(device), max_new_tokens=64, num_beams=5, no_repeat_ngram_size=2, early_stopping=True)
    predicted_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    mask = batch['labels'] != -100

    # Apply the mask to the labels
    labels = [label[mask[i]] for i, label in enumerate(batch['labels'])]

    # Decode each sequence separately
    labels = [tokenizer.decode(label, skip_special_tokens=True) for label in labels]

    predictions.extend(predicted_text)
    ground_truth.extend(labels)

OutOfMemoryError: CUDA out of memory. Tried to allocate 160.00 MiB (GPU 0; 39.39 GiB total capacity; 13.58 GiB already allocated; 158.44 MiB free; 13.86 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
def sharpened_cosine_similarity(s, k, q=1e-6, p=3):
    """
    Calculates the sharpened cosine similarity between two vectors.

    Parameters:
    s (numpy.ndarray): The first vector.
    k (numpy.ndarray): The second vector.
    q (float, optional): A small constant to avoid division by zero. Default is 1e-6.
    p (int, optional): The power to which the sharpening term is raised. Default is 2.

    Returns:
    float: The average sharpened cosine similarity between the two vectors.
    """
    assert s.shape == k.shape
    cosine_sims = []
    for s_vec, k_vec in zip(s, k):
        cosine_sim = np.dot(s_vec, k_vec) / (np.linalg.norm(s_vec) * np.linalg.norm(k_vec))
        cosine_sims.append(np.sign(cosine_sim) * np.power(np.abs(cosine_sim) / (np.linalg.norm(s_vec) + q), p))
    return np.mean(np.array(cosine_sims))

In [None]:
# get 5 random int from 0 to 100
ids = random.sample(range(100), 5)
for i in ids:
    print("--------")
    print(f"Original text: {df['original_text'][i]}")
    print("--------")
    print(f"Rewritten text: {df['rewritten_text'][i]}")
    print('--------')
    print(f"Predicted: {predictions[i]}")
    print('--------')
    print(f"Ground truth: {ground_truth[i]}")
    print()

--------
Original text:  we study a stochastic game where one player tries to find a strategy such that the state process reaches a target of controlled - loss - type , no matter which action is chosen by the other player .   we provide , in a general setup , a relaxed geometric 
--------
Rewritten text:    **Relaxed Geometric Dynamic Programming Principle:**  In a stochastic game, find a strategy such that the state process reaches a target state, regardless of the actions chosen by the other player. This can be achieved by iteratively refining a sequence of relaxed targets, starting from the initial state and moving towards the target state, until the desired target state is reached.  **Main Ide
--------
Predicted: emphasizes the key concepts and ideas, while omitting unnecessary details and jargon. focus on the main idea of the text, which is to find a strategy to iteratively improve the game process.
--------
Ground truth: emphasizes the main idea of the text more clearly. for exam

In [None]:
def load_model():
    return SentenceTransformer('sentence-transformers/sentence-t5-base', device='cpu')

model = load_model()

In [None]:
predictions_embeddings = model.encode(predictions)
ground_truth_embeddings = model.encode(ground_truth)

scs = sharpened_cosine_similarity(predictions_embeddings, ground_truth_embeddings)
print(scs)

0.6594531449426563
