In [1]:
# Install bitsandbytes for loading the LLM model faster
!pip install --no-index --find-links=/kaggle/input/bitsandbytes -r /kaggle/input/bitsandbytes/requirements.txt
!pip install --no-index --find-links=/kaggle/input/accelerate -r /kaggle/input/accelerate/requirements.txt
!pip install --no-index --find-links=/kaggle/input/transformers -r /kaggle/input/transformers/requirements.txt
# Install datasets
!pip install --no-index --find-links=/kaggle/input/datasets-installation -r /kaggle/input/datasets-installation/requirements.txt
# Install TRL for using Supervised Fine-tuning Trainer
!pip install --no-index --find-links=/kaggle/input/transformer-reinforcement-learning -r /kaggle/input/transformer-reinforcement-learning/requirements.txt
# Install PEFT
!pip install --no-index --find-links=/kaggle/input/peft-installation -r /kaggle/input/peft-installation/requirements.txt
# Install optimum
!pip install --no-index --find-links=/kaggle/input/optimum-installation -r /kaggle/input/optimum-installation/requirements.txt

Looking in links: /kaggle/input/bitsandbytes
Processing /kaggle/input/bitsandbytes/bitsandbytes-0.43.0-py3-none-manylinux_2_24_x86_64.whl (from -r /kaggle/input/bitsandbytes/requirements.txt (line 1))
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.0
Looking in links: /kaggle/input/accelerate
Looking in links: /kaggle/input/transformers
Looking in links: /kaggle/input/datasets-installation
Processing /kaggle/input/datasets-installation/datasets-2.16.0-py3-none-any.whl (from -r /kaggle/input/datasets-installation/requirements.txt (line 1))
Processing /kaggle/input/datasets-installation/dill-0.3.7-py3-none-any.whl (from datasets==2.16.0->-r /kaggle/input/datasets-installation/requirements.txt (line 1))
Processing /kaggle/input/datasets-installation/fsspec-2023.10.0-py3-none-any.whl (from fsspec[http]<=2023.10.0,>=2023.1.0->datasets==2.16.0->-r /kaggle/input/datasets-installation/requirements.txt (line 1))
INFO: pip is looking at multiple versions of m

# Import libraries <a class="anchor"  id="libraries"></a>

In [2]:
import os, random
import pandas as pd
import numpy as np
# from string import Template
from pathlib import Path

from torch import nn
# Transformer
from accelerate import Accelerator
import transformers
from transformers import (pipeline, AutoTokenizer, AutoModelForCausalLM, 
                          BitsAndBytesConfig, AutoConfig, TrainingArguments)
# Supervised fine-tuning trainer
from datasets import Dataset
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from peft import LoraConfig, get_peft_model, TaskType, PeftConfig, PeftModel
# Split data into training and test (valid) dataset
from sklearn.model_selection import train_test_split

# For quantization
import bitsandbytes, accelerate
from tqdm.notebook import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import optimum

2024-06-14 08:11:19.830902: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-14 08:11:19.831013: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-14 08:11:19.964478: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
import ctypes, gc
import torch

libc = ctypes.CDLL("libc.so.6")
# Seed all random number generators with the same seed
def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed shared by every random number generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes the convolution algorithm and may select a
    # different one from run to run, which defeats the deterministic setting.
    torch.backends.cudnn.benchmark = False
    
def clear_memory():
    """Release unused Python objects, cached CUDA blocks and free C-heap pages."""
    # Collect first: tensors that are only kept alive by reference cycles must
    # be freed before their CUDA blocks can be handed back by empty_cache().
    gc.collect()
    torch.cuda.empty_cache()
    # Finally ask glibc to return the freed heap memory to the operating system.
    libc.malloc_trim(0)

SEED = 42
seed_everything(SEED)
# Set the GPUs
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fine-tuning Gemma-2b model using Keras library <a class="anchor"  id="gemma-2b"></a>

Ref: @JUAN MERINO [Fine Tuning with Gemma 2b](https://www.kaggle.com/code/juanmerinobermejo/fine-tuning-with-gemma-2b)

In [4]:
import os
import warnings

# The backend and the XLA memory fraction are read when keras / jax are first
# imported, so they must be set BEFORE the imports below. (They have no effect
# if keras was already imported earlier in the same kernel session.)
os.environ["KERAS_BACKEND"] = "jax"
os.environ["XLA_PYTHON_CLIENT_MEM_FRACTION"] = "1.00"

# Import keras and Keras-NLP for training
import keras
import keras_nlp

warnings.simplefilter("ignore")

In [5]:
class CFG:
    """Settings for the Keras-NLP Gemma-2b LoRA fine-tuning run."""

    # Model preset name and input/output locations
    model_name = "gemma_2b_en"
    model_path = "/kaggle/input/gemma/keras/gemma_2b_en/2"
    data_path = "/kaggle/input/rewritten-texts-with-gemma-2b/rewritten_texts_csv.csv"
    output_path = "outputs"
    model_save_path = model_name + "_adapter"

    # Model training arguments
    epochs = 20
    batch_size = 1
    max_length = 512
    lr = 1e-3
    
print(CFG.model_save_path)

gemma_2b_en_adapter


### Load and Train the model

Quantization reduces memory and computational costs by representing weights and activations with lower-precision data types such as 8-bit integers (int8).

In [6]:
## Load the data and build the formatted training prompts
def load_data():
    """Read the rewrite dataset and build one training prompt per row.

    Every prompt joins the original text, the rewrite prompt and the rewritten
    text. Prompts whose character length is not below ``CFG.max_length`` are
    discarded.

    Returns:
        list[str]: The formatted prompts that passed the length filter.
    """
    frame = pd.read_csv(CFG.data_path, encoding='latin-1')
    kept_prompts = []
    for _, record in frame.iterrows():
        original_text = record['original_text']
        prompt = record['prompt']
        rewritten_text = record['rewritten_text']
        # The indentation inside the literal is part of the emitted text.
        candidate = f"""Original Text:\n{original_text}\n\n
                               Prompt:\n{prompt}\n\n
                               Rewritten text:\n{rewritten_text}"""
        if len(candidate) >= CFG.max_length:
            continue
        kept_prompts.append(candidate)
    return kept_prompts

In [7]:
def train_model():
    """Fine-tune Gemma-2b with LoRA via Keras-NLP and save the result.

    Builds the prompts with ``load_data()``, enables rank-4 LoRA on the
    backbone, trains for ``CFG.epochs`` epochs with ``CFG.batch_size`` and
    writes the weights plus the tokenizer assets into the directory
    ``CFG.model_save_path``.
    """
    # Load the training data
    training_data = load_data()
    # Load the Gemma and add lora layer
    # ref: https://ai.google.dev/gemma/docs/lora_tuning
    gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset(CFG.model_name)
    gemma_lm.summary()
    # This will freeze all weights on the backbone,
    # while enabling Lora on the query & value layers of the attention layers.
    gemma_lm.backbone.enable_lora(rank=4)
    gemma_lm.preprocessor.sequence_length = CFG.max_length
    # Create the optimizer (AdamW)
    optimizer = keras.optimizers.AdamW(learning_rate=CFG.lr,
                                       weight_decay=0.001,
                                       beta_1=0.9,
                                       beta_2=0.999)
    optimizer.exclude_from_weight_decay(var_names=["bias", "scale"])
    # Add optimizer, loss function and evaluation metrics
    gemma_lm.compile(loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                     optimizer=optimizer,
                     weighted_metrics=[keras.metrics.SparseCategoricalAccuracy()])
    # Train the model with the batch size configured in CFG (was hard-coded to 1)
    gemma_lm.fit(training_data, epochs=CFG.epochs, batch_size=CFG.batch_size, verbose=1)
    # Save the model. The weights file and the tokenizer assets cannot share one
    # path: the assets need a directory, and Keras 3 only accepts weight files
    # whose name ends in ".weights.h5".
    os.makedirs(CFG.model_save_path, exist_ok=True)
    gemma_lm.save_weights(os.path.join(CFG.model_save_path, "model.weights.h5"))
    gemma_lm.preprocessor.tokenizer.save_assets(CFG.model_save_path)

In [8]:
TRAINING = False # True: Enable training, False: Infer only
if TRAINING:
    train_model()
    os._exit(0)

# Fine-tuning Gemma-7b using pytorch library<a class="anchor" id="pretrained"></a>
Fine-tune a pretrained LLM (Gemma/Mistral/Phi) to infer the rewrite prompts of the test data.

- @ZHANSAYA YUSSUPOVA [Gemma 7B with LoRa | Prompt Recovery](https://www.kaggle.com/code/yujansaya/gemma-7b-with-lora-prompt-recovery)


In [9]:
class CFG:
    """Settings for fine-tuning Gemma-7b with the PyTorch / TRL stack."""

    # Base model selection
    model_name = "gemma_7b"
    model_paths = {"gemma_7b": "/kaggle/input/gemma/transformers/7b-it/2"}
    model_path = model_paths[model_name]

    # Model training arguments
    data_path = "/kaggle/input/gemma-rewrite-nbroad/nbroad-v2.csv"
    model_save_path = model_name + "_adapter"
    max_length = 150  # intended text truncation length, to avoid OOM issues
    NROWS = 10  # number of rows read from the dataset
    batch_size = 1
    lr = 2e-4

## Load the model

In [10]:
def load_model():
    """Load the 4-bit quantized model stored at ``CFG.model_path`` and its tokenizer.

    Returns:
        tuple: ``(model, tokenizer)``; the model has been passed through
        ``Accelerator.prepare``.
    """
    accelerator = Accelerator()
    # Quantization settings that reduce memory usage: NF4 4-bit weights with
    # double quantization, computing in bfloat16.
    bnb_settings = dict(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)
    quantized_model = AutoModelForCausalLM.from_pretrained(
        CFG.model_path,
        device_map="auto",
        trust_remote_code=True,
        quantization_config=BitsAndBytesConfig(**bnb_settings),
    )
    return accelerator.prepare(quantized_model), tokenizer

## Model training with prompts generated by Gemma LLM

In [11]:
# Format the row (example) data with an instruction
def formatting_func(example):
    """Build one instruction prompt per training row.

    The trainer may call this with a whole batch, where every field is a list.
    All rows of the batch are formatted; formatting only element ``[0]`` would
    silently drop the rest of the batch from training.

    Args:
        example: Mapping with ``original_text``, ``rewritten_text`` and
            ``rewrite_prompt`` entries. Each entry is either a list of strings
            (a batch) or a single string (one row).

    Returns:
        list[str]: The formatted prompts, one per row.
    """
    originals = example['original_text']
    rewrites = example['rewritten_text']
    targets = example['rewrite_prompt']
    # A non-batched call passes plain strings; wrap them so both cases share one path.
    if isinstance(originals, str):
        originals, rewrites, targets = [originals], [rewrites], [targets]

    prompts = []
    for original_text, rewritten_text, rewrite_prompt in zip(originals, rewrites, targets):
        prompt = f"""Original Essay:\n{original_text}\n\n
               Rewritten Essay:\n{rewritten_text}\n\n
               Instruction:\n Given are 2 essays, the Rewritten essay was created from the Original essay using the google Gemma model.
               You are trying to understand how the original essay was transformed into a new version. 
               Analyzing the changes in style, theme, etc., please come up with a prompt that must have been used to guide the transformation from the original to the rewritten essay.
               Only give me the PROMPT. Start directly with the prompt, that's all I need.
               Output should be only line ONLY.\n\n
               Response: \n{rewrite_prompt}"""
        prompts.append(prompt)
    return prompts

def train_model(model, tokenizer):
    """Fine-tune ``model`` with LoRA on the rewrite dataset and save the adapter.

    Args:
        model: Causal language model returned by ``load_model()``.
        tokenizer: Tokenizer matching ``model``; saved next to the adapter.
    """
    # Load the training data
    df = pd.read_csv(CFG.data_path, nrows=CFG.NROWS)
    # Create the dataset. It is handed to the trainer as raw text: SFTTrainer
    # tokenizes the output of ``formatting_func`` itself. Tokenizing the three
    # columns here only overwrote ``input_ids`` three times, and a dataset that
    # already has ``input_ids`` can be treated as pre-processed, which skips
    # ``formatting_func``.
    training_ds = Dataset.from_pandas(df)
    # Add PEFT (lora) layer
    lora_config = LoraConfig(r=32, # Rank
                             lora_alpha=32,
                             target_modules=["q_proj", "o_proj", "k_proj",
                                             "v_proj", "gate_proj", "up_proj", "down_proj"],
                             lora_dropout=0.05,
                             bias="none",
                             task_type=TaskType.CAUSAL_LM)
    # Training arguments
    args = TrainingArguments(
            per_device_train_batch_size=CFG.batch_size,
            gradient_accumulation_steps=4,
            warmup_steps=2,
            max_steps=10,
            learning_rate=CFG.lr,
            fp16=True,
            logging_steps=1,
            output_dir="outputs",
            optim="paged_adamw_8bit",
            report_to="none"
        )
    # Create a trainer (supervised fine-tuned trainer)
    trainer = SFTTrainer(model=model,
                         train_dataset=training_ds,
                         args=args,
                         peft_config=lora_config,
                         formatting_func=formatting_func)
    trainer.train()
    # Save the model. CFG defines ``model_save_path``; ``CFG.save_path`` does not
    # exist and raised AttributeError after training had finished.
    trainer.save_model(CFG.model_save_path)
    tokenizer.save_pretrained(CFG.model_save_path)
    print(f"Save the model to {CFG.model_save_path}")
    

In [12]:
TRAINING = False # True: Enable training, False: Infer only
if TRAINING:
    model, tokenizer = load_model()
    train_model(model, tokenizer)
    os._exit(0)

# Model Inference <a class='anchor' id='infer'></a>
- [Load testing data](#load_data)
- [Generate prompts using fine-tuned Phi LLM](#phi)
- [Generate the prompts using pretrained Gemma-7b LLM](#llm)
- [Generate the prompts using pretrained Mistral-7b LLM (version 2)](#mistral)

In [13]:
class CFG:
    """Inference settings: target device plus base-model and adapter locations."""

    # Get device (CPUs or GPUs)
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Base model checkpoints, keyed by the short model name
    model_paths = {
        "phi": "/kaggle/input/phi/transformers/2/1",
        "gemma-7b": "/kaggle/input/gemma/transformers/7b-it/2",
        "mistral-7b": "/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1",
        "mistral-7b-v2": "/kaggle/input/mistral-7b-it-v02",
    }

    # Fine-tuned PEFT adapters, keyed by the same short model name
    adapter_paths = {
        "phi": "/kaggle/input/phi2-public-data-sft-adapter/pytorch/public-data-sft/1/phi2_public_data_sft",
    }


## Load testing data <a class='anchor' id='load_data'></a> 

In [14]:
# Load the testing data
test_df = pd.read_csv('/kaggle/input/llm-prompt-recovery/test.csv', index_col='id')
test_df["rewrite_prompt"] = "-" # Empty
test_df.head()

Unnamed: 0_level_0,original_text,rewritten_text,rewrite_prompt
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,The competition dataset comprises text passage...,Here is your shanty: (Verse 1) The text is rew...,-


# Generate prompts using fine-tuned Phi LLM <a class='anchor' id='phi'></a>
Use the Microsoft Phi LLM fine-tuned by @LUMOS [phi2-public-data-sft-adapter](https://www.kaggle.com/models/mozhiwenmzw/phi2-public-data-sft-adapter/frameworks/PyTorch/variations/public-data-sft/versions/1) to generate the prompts for the test data.

Credits:
- @Lumos [[0.61+]LLMPR phi2 sft model training](https://www.kaggle.com/code/mozhiwenmzw/0-61-llmpr-phi2-sft-model-training)
- @Lumos [[0.61+]LLMPR phi2 sft model generate infer](https://www.kaggle.com/code/mozhiwenmzw/0-61-llmpr-phi2-sft-model-generate-infer)

In [15]:
class PhiModelRecover:
    """Recover rewrite prompts with the Phi base model plus a fine-tuned PEFT adapter."""

    def __init__(self):
        self.model_name = 'phi'
        self.input_token_len = 1024  # the prompt is truncated to this many tokens
        self.output_token_len = 100  # budget added on top of the prompt length for generation
        self.load_model()

    # Load tokenizer and model
    def load_model(self):
        """Load the tokenizer, the base model and the PEFT adapter for ``self.model_name``."""
        model_path = CFG.model_paths[self.model_name]
        print(f"model_path = {model_path}")
        # Load the tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.tokenizer.pad_token = self.tokenizer.eos_token
        # Load the model
        base_model = AutoModelForCausalLM.from_pretrained(model_path,
                                                          device_map="auto",
                                                          trust_remote_code=True)
        # Load PEFT adapter layer
        adapter_path = CFG.adapter_paths[self.model_name]
        # Load PEFT adapter to the model
        self.model = PeftModel.from_pretrained(base_model, adapter_path)
        print(f"Complete loading PEFT adapter {adapter_path}")
        self.model.to(CFG.DEVICE)
        self.model.eval()
        print("Complete loading the model")

    # Generate the prompts using Phi models
    def prompt_generate(self, original_text, rewrite_text):
        """Generate the rewrite prompt for one pair of texts.

        Args:
            original_text: Text before the rewrite.
            rewrite_text: Text after the rewrite.

        Returns:
            str | None: The text the model produced after ``Output:``, or
            ``None`` when generation or parsing fails (the error is printed).
        """
        prompt = f"""Instruct: Original Text:{original_text}\n
                     Rewritten Text:{rewrite_text}\n
                     Write a prompt that was likely given to the LLM to rewrite original text
                     to rewritten text.\nOutput:"""
        # Tokenize the prompt and truncate to '1024' tokens
        inputs = self.tokenizer(prompt, max_length=self.input_token_len,
                                truncation=True, return_tensors="pt", return_attention_mask=False)
        try:
            max_length = len(inputs.input_ids[0]) + self.output_token_len
            # Move inputs to GPU
            inputs = {k: v.to(CFG.DEVICE) for k, v in inputs.items()}
            # Generate the prompt
            outputs = self.model.generate(**inputs,
                                          do_sample=False,
                                          max_length=max_length,
                                          pad_token_id=self.tokenizer.pad_token_id)
            # Decode the output to text (strings)
            text = self.tokenizer.batch_decode(outputs,
                                               skip_special_tokens=True,
                                               clean_up_tokenization_spaces=False)[0]
            # Raises IndexError when the "Output:" cue is missing, e.g. when it
            # was cut off by truncation; handled below like any other failure.
            return text.split("Output:")[1].strip()
        except Exception as e:
            print(f"ERROR: {e}")
            # Signal the failure explicitly; infer() substitutes the default prompt.
            return None

    def infer(self, test_df):
        """Generate one rewrite prompt per row of ``test_df``.

        Args:
            test_df: DataFrame with ``original_text`` and ``rewritten_text`` columns.

        Returns:
            list[str]: One prompt per row. Rows for which generation failed or
            produced an empty string get the default prompt.
        """
        default_prompt = """Please improve the following text using the writing style of, 
                            maintaining the original meaning but altering the tone, diction, 
                            and stylistic elements to match the new style.Enhance the clarity, 
                            elegance, and impact of the following text by adopting the writing style of,
                            ensuring the core message remains intact while transforming the tone,
                            word choice, and stylistic features to align with the specified style."""
        rewrite_prompts = []
        for i in range(len(test_df)):
            row = test_df.iloc[i]
            generated = None
            try:
                generated = self.prompt_generate(row['original_text'], row['rewritten_text'])
            except Exception as e:
                print(f"ERROR: {e}")
            # prompt_generate reports failure as None; never put None or an
            # empty string into the submission.
            rewrite_prompts.append(generated if generated else default_prompt)
        return rewrite_prompts

In [16]:
SUBMISSION = False
if SUBMISSION:
    recover = PhiModelRecover() 
    rewrite_prompts = recover.infer(test_df)
    print(f"rewrite_prompts = {rewrite_prompts}")
    del recover
    # Submission
    submission = pd.read_csv('/kaggle/input/llm-prompt-recovery/sample_submission.csv')
    submission["rewrite_prompt"] = rewrite_prompts
    submission.to_csv('submission.csv', index=False)
    display(submission)


# Generate the prompts using pretrained Gemma-7b LLM <a class='anchor' id='llm'></a>
Use pretrained Gemma-7b LLM to generate the prompts directly from testing data.
- @RENOIR [Perplexity Baseline [Phi-2,Gemma-7b-it]](https://www.kaggle.com/code/itahiro/perplexity-baseline-phi-2-gemma-7b-it)
- @PSI [h2oGPT Perplexity Ranking](https://www.kaggle.com/code/philippsinger/h2ogpt-perplexity-ranking)

In [17]:
# Perplexity is a metric that measures the quality of language models
# Perplexity is calculated as the exponent of the loss obtained from the model.
class Perplexity(nn.Module):
    """Per-sequence next-token cross-entropy used as a perplexity-style score.

    The exponential is not applied, so the values are mean cross-entropy
    losses rather than perplexities.

    Args:
        reduce: When true, ``forward`` returns the mean over the batch;
            otherwise one value per sequence.
    """

    def __init__(self, reduce: bool = True):
        super().__init__()
        self.loss_fn = nn.CrossEntropyLoss()
        self.reduce = reduce

    def forward(self, logits, labels):
        """Score ``logits`` against ``labels`` shifted by one position."""
        # The logit at position t predicts the token at t + 1: drop the last
        # logit and the first label so the two line up.
        predicted = logits[..., :-1, :].contiguous()
        targets = labels[..., 1:].contiguous()

        per_sequence = torch.stack(
            [self.loss_fn(predicted[row], targets[row]) for row in range(labels.shape[0])],
            dim=0,
        )
        if not self.reduce:
            return per_sequence
        return torch.mean(per_sequence)

In [18]:
rewrite_prompt_templates = [
"""Please improve this text using the writing style with maintaining the original meaning
   but altering the tone.""",
"""Please improve the following text by reimagining it through the lens of [insert desired style here],
   retaining the original essence while elevating its clarity, eloquence, and potency by modulating
   the tone, word choice, and stylistic nuances to harmoniously embody the stylistic features 
   while ensuring the core message remains intact.""",
"""Please improve the following text using the writing style of, 
   maintaining the original meaning but altering the tone, diction, 
   and stylistic elements to match the new style.Enhance the clarity, 
   elegance, and impact of the following text by adopting the writing style of,
   ensuring the core message remains intact while transforming the tone,
   word choice, and stylistic features to align with the specified style.""",
]

In [19]:
class GemmaModelRecover:
    """Pick, for every row, the candidate rewrite prompt the Gemma model scores best.

    Each template in ``rewrite_prompt_templates`` is combined with the row's
    original and rewritten text, scored with ``Perplexity`` and the template
    with the lowest score is selected.
    """
    def __init__(self):
        self.model_name = 'gemma-7b'
        self.perp_nn = Perplexity() # Scores one sequence at a time in infer()
        self.load_model()
        
    # Load tokenizer and model
    def load_model(self):
        """Load the tokenizer and the 4-bit quantized model for ``self.model_name``."""
        model_path = CFG.model_paths[self.model_name]
        print(f"model_path = {model_path}")
         # Load the tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        # Reuse the EOS token for padding so that batches can be padded
        self.tokenizer.pad_token = self.tokenizer.eos_token
        # Load the pretrained LLM in 4bit quantization  
        q_config = BitsAndBytesConfig(
            load_in_4bit = True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
        )
        # Load the model
        self.model = AutoModelForCausalLM.from_pretrained(model_path,
                                                          device_map="auto",
                                                          trust_remote_code=True,
                                                          quantization_config=q_config)
        print("Complete loading the model")
        
    # Infer the prompt for given texts (df)
    def infer(self, df):
        """Return the best-scoring template for every row of ``df``.

        Args:
            df: DataFrame with ``original_text`` and ``rewritten_text`` columns.

        Returns:
            list: One entry per row, taken from ``rewrite_prompt_templates``.
        """
        prompts = []
        for idx in range(len(df)):
            row = df.iloc[idx]
            p_scores = []
            with torch.no_grad():
                # Build one chat-formatted sequence per candidate template:
                # user turn = template + original text, model turn = rewritten text.
                # The indentation inside the literal is part of the text sent to the model.
                rw_prompts = []
                for rw_prompt in rewrite_prompt_templates:
                    rw_prompts.append(f"""<start_of_turn>
                                            user {rw_prompt} {row["original_text"]}
                                          <end_of_turn>
                                          <start_of_turn>
                                              model{row["rewritten_text"]}
                                          <end_of_turn>""")
                # Tokenize all candidates as one padded batch of token ids
                inputs = self.tokenizer(rw_prompts, return_tensors="pt",
                                        add_special_tokens=False,
                                        padding=True, truncation=True).to(CFG.DEVICE)
                # Get the output
                output = self.model(input_ids=inputs["input_ids"],
                                    attention_mask=inputs["attention_mask"])
                logits = output.logits

                # NOTE: labels aliases inputs["input_ids"]; the in-place fill below
                # is only safe because the forward pass has already run.
                labels = inputs["input_ids"]
                # Padding positions (attention_mask == 0) get the label -100, the
                # default ignore_index of nn.CrossEntropyLoss, so they do not
                # contribute to the score.
                labels.masked_fill_(~inputs["attention_mask"].bool(), -100)

                # Score every candidate separately from its logits and labels
                for i in range(len(rewrite_prompt_templates)):
                    p_score = self.perp_nn(logits[i].unsqueeze(0), 
                                           labels[i].unsqueeze(0))
                    p_scores.append(p_score.detach().cpu())
                del inputs, labels, output, logits
            # Convert the scores to a numpy array
            p_scores = np.array(p_scores)
            # Display the scores
            print(f"p_scores = {p_scores}")
            # Select the template with the lowest score
            best_pred = [np.array(rewrite_prompt_templates)[np.argsort(p_scores)][0]]
            print(f"best_pred = {best_pred}")
            prompts.append(best_pred[0])
            # Free memory before the next row
            clear_memory()
        return prompts

In [20]:
SUBMISSION = False
if SUBMISSION:
    recover = GemmaModelRecover() 
    rewrite_prompts = recover.infer(test_df)
    print(f"rewrite_prompts = {rewrite_prompts}")
    del recover
    # Submission
    submission = pd.read_csv('/kaggle/input/llm-prompt-recovery/sample_submission.csv')
    submission["rewrite_prompt"] = rewrite_prompts
    submission.to_csv('submission.csv', index=False)
    display(submission)


# Generate the prompts using pretrained Mistral-7b LLM (version 2) <a class='anchor' id='mistral'></a>
Use pretrained Mistral-7b LLM to generate the prompts directly from testing data.

- @RICH OLSON [Mistral 7B Prompt Recovery (Version 2)](https://www.kaggle.com/code/richolson/mistral-7b-prompt-recovery-version-2)
- @AATIF FRAZ [Prompt Prediction w/ Mixtral/Mistral7B/Gemma/Llama](https://www.kaggle.com/code/aatiffraz/prompt-prediction-w-mixtral-mistral7b-gemma-llama/notebook)

In [21]:
# Disable the memory-efficient and flash SDP kernels to avoid the issues reported by https://github.com/Lightning-AI/lit-gpt/issues/327
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

In [22]:
# 10 examples of rewritten prompts 
example_df = pd.read_csv('/kaggle/input/rewrite-prompts-examples/rewrite_examples.csv')
display(example_df)

Unnamed: 0.1,Unnamed: 0,original_text,rewritten_text,rewrite_prompt
0,0,Hey there! Just a heads up: our friendly dog m...,Warning: Protective dog on premises. May exhib...,Improve this text to be a warning.
1,1,A lunar eclipse happens when Earth casts its s...,"Yo check it, when the Earth steps in, takes it...",Improve this text to make it a rap.
2,2,Drinking enough water each day is crucial for ...,"Arrr, crew! Sail the health seas with water, t...",Improve this text to have a pirate.
3,3,"In a bustling cityscape, under the glow of neo...","On an ordinary evening, amidst the cacophony o...",Improve this text by making it about time travel.
4,4,"Late one night in the research lab, Dr. Evelyn...","In the deep silence of the lab, under the watc...",Improve this text by adding an intelligent com...
5,5,"The park was empty, save for a solitary figure...","Beneath the cloak of twilight, the park transf...",Improve this text to be more poetic.
6,6,The annual town fair was bustling with activit...,Beneath the riot of color and sound that marke...,Improve this text by adding a magician.
7,7,"The startup team sat in the dimly lit room, su...","In the quiet before dawn, a small group of inn...",Improve this text by adding a talking car.


In [23]:
mistral_instruction = """
Provide the new text and I will tell you what new element was added or change in tone was made
to improve it - with no references to the original.
I will avoid mentioning names of characters.
It is crucial no person, place or thing from the original text be mentioned.
For example - I will not say things like 'change the puppet show into a book report'
- I would just say 'improve this text into a book report'.
If the original text mentions a specific idea, person, place, or thing - I will not mention it in my answer.
For example if there is a 'dog' or 'office' in the original text - the word 'dog' or 'office' must not be in my response.
My answer will be a single sentence."""

default_prompt = """
Refine the following passage by emulating the writing style of [insert desired style here], 
with a focus on enhancing its clarity, elegance, and overall impact.
Preserve the essence and original meaning of the text, while meticulously adjusting its tone, vocabulary, and stylistic elements to resonate with the chosen style.
Please improve the following text using the writing style of, maintaining the original meaning but altering the tone, diction, and stylistic elements to match the new style.
Enhance the clarity, elegance, and impact of the following text by adopting the writing style of ,
ensuring the core message remains intact while transforming the tone, word choice, and stylistic features
to align with the specified style.
"""

In [24]:
#mistral v02 tends to respond with the input after providing the answer  
#This trims response text to the requested number of sentences, looking only at the text before the first LF
def trim_to_first_num_sentences(text, num_sentences):
    """Keep at most ``num_sentences`` sentences from the first line of ``text``.

    Only the text before the first linefeed is considered. Sentences are the
    non-blank pieces between periods.

    Args:
        text: Raw response text.
        num_sentences: Maximum number of sentences to keep.

    Returns:
        str: The trimmed text, ending with a period whenever at least one
        sentence was found. An empty string when ``num_sentences <= 0`` or the
        first line holds no sentence.
    """
    if num_sentences <= 0:
        return ""  # Return empty string

    # Split text at the first linefeed; everything after it is dropped.
    first_chunk, linefeed, _ = text.partition('\n')

    # Filter AFTER stripping, so whitespace-only fragments (e.g. the " " after
    # a trailing period) are not counted as sentences.
    sentences = [part.strip() for part in first_chunk.split('.') if part.strip()]

    if linefeed and len(sentences) < num_sentences:
        # A short first line followed by more text is kept as written; strip it
        # so the period appended below does not end up after a trailing space.
        trimmed_text = first_chunk.strip()
    else:
        # Slicing is a no-op when there are fewer sentences than requested.
        trimmed_text = '. '.join(sentences[:num_sentences]).strip()

    # Add back the final period if it was removed and the text needs to end with a sentence.
    if sentences and not trimmed_text.endswith('.'):
        trimmed_text += '.'

    return trimmed_text


# Get text after last [/INST]
def trim_output(text):
    """Remove EOS markers and quote characters, then return the text after the last ``[/INST]``.

    When ``[/INST]`` does not occur, the whole cleaned text is returned.
    """
    marker = "[/INST]"
    # Drop the end-of-sequence token and, just in case the answer was put in
    # quotes, both quote characters.
    for unwanted in ('</s>', '"', "'"):
        text = text.replace(unwanted, '')
    # rpartition leaves the whole string in the last slot when the marker is absent.
    return text.rpartition(marker)[2]

# remove all number bullets
def remove_numbered_bullets(text):
    """Strip a leading ``N. `` list marker from every line of ``text``.

    A line is treated as a numbered item when the part before its first
    ``'. '`` consists only of digits; all other lines are kept unchanged.
    """
    def strip_marker(line):
        # Cut the line at the first occurrence of '. '
        head, separator, remainder = line.partition('. ')
        return remainder if separator and head.isdigit() else line

    # Process line by line and stitch the result back together
    return '\n'.join(strip_marker(line) for line in text.split('\n'))

# Returns only response text that occurs after "the request was: "
# for example, "The request was:  Improve this text by making it a shanty."
def get_response(text):
    """Extract the answer that follows the last ``"The request was: "`` marker.

    Args:
        text: Model output, with or without the marker.

    Returns:
        str: The stripped text after the last marker, or the whole ``text``
        when the marker is missing, with numbered list markers removed.
    """
    # Default to the full text; the original misspelled this name
    # (``repsonse``), leaving ``response`` unbound whenever the marker was missing.
    response = text
    parts = text.rsplit("The request was: ", 1)
    if len(parts) > 1:  # Check if the text contains "The request was: "
        response = parts[1].strip()  # Get the text after "The request was: "
    # Clean up numbered lists
    response = remove_numbered_bullets(response)
    return response

In [25]:
class MistralModelRecover:
    """Recovers rewrite prompts with a 4-bit quantized causal language model.

    The model is prompted with few-shot examples taken from `example_df`,
    followed by the pair of texts to analyse, and the generated continuation
    is post-processed into a single rewrite prompt.

    Relies on names defined elsewhere in the notebook: `CFG`,
    `mistral_instruction`, `base_line`, `trim_output`, `get_response` and
    `trim_to_first_num_sentences`.
    """

    def __init__(self, example_df=example_df):
        """Store the settings and load the tokenizer and model.

        Args:
            example_df: Frame with 'original_text', 'rewritten_text' and
                'rewrite_prompt' columns used as few-shot examples.
        """
        # Key into CFG.model_paths
        self.model_name = 'mistral-7b-v2'
        self.example_df = example_df
        # Upper bound on the number of newly generated tokens
        self.max_new_tokens = 30
        # Number of sentences kept from the generated response
        self.max_sentences = 1
        self.load_model()

    def load_model(self):
        """Load the tokenizer and the 4-bit quantized model from CFG.model_paths."""
        checkpoint_dir = CFG.model_paths[self.model_name]
        print(f"model_path = {checkpoint_dir}")

        # Tokenizer, configured to pad on the left
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
        self.tokenizer.padding_side = 'left'

        # 4-bit NF4 quantization with double quantization, computing in bfloat16
        quantization = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
        )

        # Model weights
        self.model = AutoModelForCausalLM.from_pretrained(
            checkpoint_dir,
            device_map="auto",
            trust_remote_code=True,
            quantization_config=quantization,
        )
        print("Complete loading the model")

    def generate_prompt(self, original_text, rewritten_text):
        """Generate the recovered rewrite prompt for one pair of texts.

        Args:
            original_text: Text before rewriting.
            rewritten_text: Text after rewriting.

        Returns:
            The post-processed response, or `base_line` when that response is
            shorter than 15 characters.
        """
        def exchange(source, rewrite, request):
            # One four-turn exchange: original text, instruction,
            # rewritten text, and the request that produced the rewrite.
            return [
                {"role": "user", "content": f"Original Text: {source}"},
                {"role": "assistant", "content": mistral_instruction},
                {"role": "user", "content": f"Re-written Text: {rewrite}"},
                {"role": "assistant", "content": f"The request was:  {request}"},
            ]

        # Few-shot examples: one exchange per row of example_df
        few_shot = zip(self.example_df['original_text'],
                       self.example_df['rewritten_text'],
                       self.example_df['rewrite_prompt'])
        conversation = [turn for shot in few_shot for turn in exchange(*shot)]
        # The pair under test, with the answer already started for the model
        conversation += exchange(original_text, rewritten_text, "Improve this text by")

        # Apply the chat template and move the resulting tensor to CFG.DEVICE
        prompt_ids = self.tokenizer.apply_chat_template(conversation, return_tensors="pt")
        prompt_ids = prompt_ids.to(CFG.DEVICE)

        # Generate the continuation
        output_ids = self.model.generate(prompt_ids,
                                         max_new_tokens=self.max_new_tokens,
                                         pad_token_id=self.tokenizer.eos_token_id)

        # Decode, then narrow the text down to the actual response
        full_text = self.tokenizer.batch_decode(output_ids)[0]
        response = get_response(trim_output(full_text))

        # Keep only the first `max_sentences` sentences
        print(f"Before trimming first number of sentences: {response}")
        response = trim_to_first_num_sentences(response, self.max_sentences)
        print(f"After trimming first number of sentences: {response}")

        # Default to the baseline if the response is empty or unusually short
        return base_line if len(response) < 15 else response

    def infer(self, df):
        """Generate one rewrite prompt per row of `df`.

        Args:
            df: Frame with 'original_text' and 'rewritten_text' columns.

        Returns:
            List of generated prompts in row order.
        """
        rows = (df.iloc[position] for position in range(len(df)))
        return [self.generate_prompt(row['original_text'], row['rewritten_text'])
                for row in rows]

In [26]:
# Toggle: when True, run inference over test_df and write submission.csv.
SUBMISSION = True
if SUBMISSION:
    # Constructing the recoverer also loads the tokenizer and the model
    # (MistralModelRecover.__init__ calls load_model).
    recover = MistralModelRecover() 
    # One recovered prompt per row of test_df, in row order.
    rewrite_prompts = recover.infer(test_df)
    print(f"rewrite_prompts = {rewrite_prompts}")
    # Drop the reference to the recoverer — presumably so the model can be
    # garbage-collected; TODO confirm whether memory is actually released.
    del recover
    # Submission
    submission = pd.read_csv('/kaggle/input/llm-prompt-recovery/sample_submission.csv')
    # Overwrite the rewrite_prompt column with the generated prompts.
    # NOTE(review): assumes the sample submission has the same number and
    # order of rows as test_df — confirm.
    submission["rewrite_prompt"] = rewrite_prompts
    submission.to_csv('submission.csv', index=False)
    display(submission)

model_path = /kaggle/input/mistral-7b-it-v02


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Complete loading the model
Before trimming first number of sentences: Improve this text by making it a shanty.

Re-written Text: (Verse 1)
In the realm of code, were
After trimming first number of sentences: Improve this text by making it a shanty.
rewrite_prompts = ['Improve this text by making it a shanty.']


Unnamed: 0,id,rewrite_prompt
0,9559194,Improve this text by making it a shanty.
