In [2]:
# https://www.datacamp.com/tutorial/understanding-prompt-tuning
# https://www.kaggle.com/code/aisuko/prompt-tuning-for-causal-language-modeling

In [11]:
model_id = "google/gemma-2b-it"

In [12]:
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = model_id #"bigscience/bloomz-560m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
# pip install datasets

In [16]:
from datasets import load_dataset

dataset_prompt = "fka/awesome-chatgpt-prompts" #

data_prompt = load_dataset(dataset_prompt)
data_prompt = data_prompt.map(lambda x: tokenizer(x["prompt"]), batched=True)
train_prompts = data_prompt["train"].select(range(100))

In [17]:
data_prompt

DatasetDict({
    train: Dataset({
        features: ['act', 'prompt', 'input_ids', 'attention_mask'],
        num_rows: 170
    })
})

In [18]:
train_prompts

Dataset({
    features: ['act', 'prompt', 'input_ids', 'attention_mask'],
    num_rows: 100
})

In [45]:
import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()
from sklearn.model_selection import train_test_split

In [46]:
df = pd.read_csv('training_data_en.csv',sep=';',on_bad_lines='skip')

In [47]:
df_train, df_test = train_test_split(df, test_size=0.30, random_state=42)

In [48]:
train_prompt = lambda cntxt,inpt,response: f"""<bos><start_of_turn>user

### Instruction: You are an AI assistant specialized in Finance Causal extraction. Your task is to identify and return either the cause or effect as requested, verbatim, from the provided financial text in ### Context.
### Definitions
        - Cause: The reason why an event occurs.
        - Effect: The event that happens as a result of the cause.
### Guidelines:
        - Focus on extractive responses only—do not add or modify text outside the given context.No added words or rephrasing.
        - Ensure responses follow the cause-and-effect relationship: a cause precedes an effect, and an effect follows a cause.

### Context: {{cntxt}}
### Input: {{inpt}} <end_of_turn>
<start_of_turn>model
{{response}}<end_of_turn>""".format(cntxt=cntxt,inpt=inpt,response=response)

In [49]:
test_prompt = lambda cntxt,inpt: f"""<bos><start_of_turn>user

### Instruction: You are an AI assistant specialized in Finance Causal extraction. Your task is to identify and return either the cause or effect as requested, verbatim, from the provided financial text in ### Context.
### Definitions
        - Cause: The reason why an event occurs.
        - Effect: The event that happens as a result of the cause.
### Guidelines:
        - Focus on extractive responses only—do not add or modify text outside the given context.No added words or rephrasing.
        - Ensure responses follow the cause-and-effect relationship: a cause precedes an effect, and an effect follows a cause.

### Context: {{cntxt}}
### Input: {{inpt}} <end_of_turn>
<start_of_turn>model
""".format(cntxt=cntxt,inpt=inpt)

In [50]:
df_train['prompt'] = df_train.progress_apply(lambda x: train_prompt(x['Text'],x['Question'],x['Answer']),axis=1)
df_test['prompt'] = df_test.progress_apply(lambda x: test_prompt(x['Text'],x['Question']),axis=1)

  0%|          | 0/1397 [00:00<?, ?it/s]

  0%|          | 0/599 [00:00<?, ?it/s]

In [51]:
df_train = df_train[['prompt']]

In [52]:
df_train.to_csv('train_prompts.csv',index=False)

In [28]:
from datasets import Dataset
train_dataset = Dataset.from_pandas(df_train)

In [29]:
train_dataset

Dataset({
    features: ['prompt', '__index_level_0__'],
    num_rows: 1397
})

In [30]:
data_prompt = train_dataset.map(lambda x: tokenizer(x["prompt"]), batched=True)
train_prompts = data_prompt.select(range(50))

Map:   0%|          | 0/1397 [00:00<?, ? examples/s]

In [31]:
# train_prompts = data_prompt
train_prompts

Dataset({
    features: ['prompt', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 50
})

In [32]:
import numpy as np
np.__version__

'1.25.0'

In [33]:
df_test.reset_index(drop=True,inplace=True)

In [34]:
df_test['prompt'][0]

"<bos><start_of_turn>user\n\n### Instruction: You are an AI assistant specialized in Finance Causal extraction. Your task is to identify and return either the cause or effect as requested, verbatim, from the provided financial text in ### Context.\n### Definitions\n        - Cause: The reason why an event occurs.\n        - Effect: The event that happens as a result of the cause.\n### Guidelines:\n        - Focus on extractive responses only—do not add or modify text outside the given context.No added words or rephrasing.\n        - Ensure responses follow the cause-and-effect relationship: a cause precedes an effect, and an effect follows a cause.\n\n### Context: The Audit and Risk Committee has carried out its annual assessment of the internal controls of the Fund's service providers for the year ended 30th June 2017 and considered the internal control procedures to be adequate based on the findings of their respective ISAE 3402 or SSAE 16 reports.\n### Input: What factor led the Aud

In [12]:
def generate_text(model, tokenizer, prompt_text, max_tokens):
    prompt_text = tokenizer(prompt_text, return_tensors="pt")

    model.to('cuda')
    # print('model',model.device)
    # print('input',prompt_text.device)
    
    input_id = prompt_text["input_ids"].to('cuda')

    print('model',model.device)
    print('input',input_id.device)
    
    outputs = model.generate(
        input_ids=input_id,
        attention_mask=prompt_text["attention_mask"].to('cuda'),
        max_length=max_tokens,
        repetition_penalty=1.5,
        eos_token_id=tokenizer.eos_token_id
    )
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)


initial_output = generate_text(model, tokenizer, "I want you to act as a logistician. ", 150)
print("Initial model output:", initial_output)

model cuda:0
input cuda:0
Initial model output: ['I want you to act as a logistician. \n\n**Problem:** A company is looking for a reliable supplier of high-quality office supplies and equipment. The company requires the following items:\n* Printer cartridges (black, color)\n* Paper rolls in various sizes\n* Stapler staples\n* Pens and pencils\n* Highlighters\n\n\nPlease provide a detailed proposal outlining your recommendations on how to select a reliable supplier, what factors to consider when evaluating suppliers, and what questions to ask potential suppliers.\n\n## Proposal for Selecting Reliable Suppliers of Office Supplies and Equipment\n\nAs a seasoned logistics professional with extensive experience in sourcing and managing diverse product categories, I recommend the following approach to selecting a reliable supplier of high- quality office supplies']


In [36]:
# pip install -q peft

In [8]:
from peft import  get_peft_model, PromptTuningConfig, TaskType, PromptTuningInit

tuning_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM, #This type indicates the model will generate text.
    prompt_tuning_init=PromptTuningInit.RANDOM,  #The added virtual tokens are initializad with random numbers
    num_virtual_tokens=4, #Number of virtual tokens to be added and trained.
    tokenizer_name_or_path=model_name
)

peft_model = get_peft_model(model, tuning_config)

In [9]:
from transformers import TrainingArguments
training_args = TrainingArguments(
    # use_cpu=True, # This is necessary for CPU clusters.
    per_device_train_batch_size=1,
    output_dir="outputs",
    # auto_find_batch_size=True, # Find a suitable batch size that will fit into memory automatically, you can also use a custom batch size
    learning_rate= 0.005,
    # load_best_model_at_end=True,
    # save_strategy="epoch",
    # evaluation_strategy="epoch", 
    num_train_epochs=5,
    report_to='none'
    )

In [10]:
from transformers import Trainer, DataCollatorForLanguageModeling
trainer = Trainer(
    model=peft_model, # We pass in the PEFT version of the foundation model, bloomz-560M
    args=training_args, #The args for the training.
    train_dataset=train_prompts, #The dataset used to train the model.
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False) # mlm=False indicates not to use masked language modeling
    )
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 282.00 MiB. GPU 0 has a total capacity of 11.63 GiB of which 229.75 MiB is free. Including non-PyTorch memory, this process has 11.38 GiB memory in use. Of the allocated memory 11.03 GiB is allocated by PyTorch, and 134.63 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [10]:
trainer.save_model("promptTuned_model")
# tokenizer.save_pretrained("promptTuned_model")

In [13]:
tuned_output = generate_text(trainer.model, tokenizer, "I want you to act as a logistician. ", 100)
print("Tuned model output:", tuned_output)

model cuda:0
input cuda:0




Tuned model output: ['I want you to act as a logistician. \n\n**Here are the details of the task:**\n\n* You will be given a set of items that need to be delivered to different locations.\n* Each item has a unique ID, location ID, and weight.\n* The items can be delivered to multiple locations.\n* Each location has a unique ID and capacity.\n* The delivery process must follow these rules:\n    - Only one item can be delivered to each']


In [14]:
import torch
torch.cuda.empty_cache()