In [1]:
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, GenerationConfig
import torch

  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(


In [2]:
torch.cuda.is_available()
torch.cuda.current_device()

0

# Load dataset

In [3]:
df = pd.read_csv("../datasets/kaggle_metadata_parse_by_tags.csv")
df.head()

Unnamed: 0,title,subtitle,description,keyword 1,keyword 2,keyword 3,keyword 4,keyword 5
0,Eye Gaze,Simulated and real datasets of eyes looking in...,# Context\nThe main reason for making this dat...,arts and entertainment,earth and nature,social science,image,eyes and vision
1,Military Aircraft Detection Dataset,military aircraft images with aircraft type an...,## Overview\nThis dataset is designed for obje...,arts and entertainment,military,aviation,computer vision,classification
2,Bhagavad Gita Dataset,All verses in Sanskrit with their Hindi and En...,#Context\nThe Bhagavad Gita (Sanskrit: भगवद् ग...,religion and belief systems,linguistics,nlp,text,translation
3,Bin Baz Fatwas,Main Source: https://github.com/Alsarmad/binba...,"**Dataset Description**\nThe ""Fatwaas from Bin...",religion and belief systems,nlp,text,text-to-text generation,arabic
4,Nepali Cheers Liquor store product details,Alcoholic Beverages sold in one of a online li...,Data scraped from Nepali Online Liquor selling...,alcohol,python,nepali,,


In [4]:
ds = Dataset.from_pandas(df)
ds

Dataset({
    features: ['title', 'subtitle', 'description', 'keyword 1', 'keyword 2', 'keyword 3', 'keyword 4', 'keyword 5'],
    num_rows: 4033
})

# Setup dataset

In [5]:
MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token, tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


(None, '<|eot_id|>')

In [6]:
generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
print(generation_config)

GenerationConfig {
  "bos_token_id": 128000,
  "do_sample": true,
  "eos_token_id": [
    128001,
    128009
  ],
  "max_length": 4096,
  "temperature": 0.6,
  "top_p": 0.9
}



In [7]:
def gen_batches_train():
    for sample in iter(ds):
        # Extract instruction and input from the sample
        system_prompt = """Task: As a leading editor of a global company, you have been assigned a key task: conduct a detailed analysis of the provided text and highlight the key words and phrases that most accurately reflect its content. These keywords and phrases will be used to create tags that need to be relevant, specific, and concise. The tags will help users quickly understand the main themes and characteristics of the text.
        
Context and Motivation: Successfully completing this task is crucial for the future of the company where you have worked for 20 years. Your work affects not only the company's image but also your own career, including bonuses and potential shares in the company. Therefore, it is essential to approach the task with maximum diligence and professionalism.

Task Instructions:

Text Analysis: Carefully read the text to understand its main meaning, context, and key ideas.

Identifying Keywords and Phrases: Determine the most important words and phrases that most accurately reflect the content of the text. Pay attention to unique terms and phrases that clearly describe the themes and characteristics.

Creating Tags: Based on the identified keywords and phrases, formulate tags. The tags should be:

Relevant: Fully reflect the content of the text.
Specific: Consider specific aspects rather than general terms.
Concise: One or two words to ensure easy comprehension.
Notes:

Try to avoid general words and phrases. Choose the most accurate terms.
Consider that the tags should be understandable to a wide audience and accurately reflect the main ideas of the text.
Text for analysis:
"""
        input_text = f"Title: {sample['title']}"
        if sample['subtitle'] != '':
            input_text += f"\nSubtitle: {sample['subtitle']}"
        input_text += f"\nDescription: {sample['description']}"
        out_text = f"{sample['keyword 1']}"
        if sample['keyword 2'] != '':
            out_text += f", {sample['keyword 2']}"
        if sample['keyword 3'] != '':
            out_text += f", {sample['keyword 3']}"
        if sample['keyword 4'] != '':
            out_text += f", {sample['keyword 4']}"
        if sample['keyword 5'] != '':
            out_text += f", {sample['keyword 5']}"
        formatted_prompt = None 
            
        formatted_prompt = tokenizer.apply_chat_template([{
                "role": "system",
                "content": system_prompt
            }, {
                "role": "user",
                "content": input_text
            }, {
                "role": "assistant",
                "content": out_text
            }], tokenize=False, add_generation_prompt=False)
        
        yield {'text': formatted_prompt}

next(gen_batches_train())

{'text': "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nTask: As a leading editor of a global company, you have been assigned a key task: conduct a detailed analysis of the provided text and highlight the key words and phrases that most accurately reflect its content. These keywords and phrases will be used to create tags that need to be relevant, specific, and concise. The tags will help users quickly understand the main themes and characteristics of the text.\n        \nContext and Motivation: Successfully completing this task is crucial for the future of the company where you have worked for 20 years. Your work affects not only the company's image but also your own career, including bonuses and potential shares in the company. Therefore, it is essential to approach the task with maximum diligence and professionalism.\n\nTask Instructions:\n\nText Analysis: Carefully read the text to understand its main meaning, context, and key ideas.\n\nIdentifying Keywords and Phr

# Prepare model

In [8]:
model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME, 
        device_map={"": 0}, 
        torch_dtype=torch.bfloat16,
    )

Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.04s/it]


In [9]:
from peft import LoraConfig, TaskType, get_peft_model

peft_config = LoraConfig(
        lora_alpha=32,
        lora_dropout=0.1,
        r=8,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    )

In [10]:
tokenizer.pad_token = '<|begin_of_text|>'

# Training

In [11]:
training_arguments = TrainingArguments(
    output_dir='./saiga_results_en',
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    optim="adamw_torch",
    save_steps=100,
    logging_steps=5,
    learning_rate=3e-4,
    fp16=False,
    bf16=True,
    num_train_epochs=1,
    report_to="none"
)

train_gen = Dataset.from_generator(gen_batches_train)
tokenizer.padding_side = "right"

In [12]:
from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    train_dataset=train_gen,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=1024,
    tokenizer=tokenizer,
    args=training_arguments,
)

In [13]:
trainer.train()

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
5,2.2013
10,1.1293
15,0.8178
20,1.2345
25,0.9353
30,0.9655
35,1.1201
40,1.0527
45,1.0252
50,1.0873


TrainOutput(global_step=4033, training_loss=0.9481115591842307, metrics={'train_runtime': 2367.7027, 'train_samples_per_second': 1.703, 'train_steps_per_second': 1.703, 'total_flos': 1.2239279181833011e+17, 'train_loss': 0.9481115591842307, 'epoch': 1.0})

In [14]:
peft_model_id="./saiga_lora2_en"
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)

('./saiga_lora2_en/tokenizer_config.json',
 './saiga_lora2_en/special_tokens_map.json',
 './saiga_lora2_en/tokenizer.json')

In [15]:
from peft import PeftModel

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto",torch_dtype=torch.bfloat16)

model = PeftModel.from_pretrained(model, model_id=peft_model_id, config=peft_config)

model = model.merge_and_unload()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.16it/s]


In [16]:
def test(question):
    system_prompt = """Task: As a leading editor of a global company, you have been assigned a key task: conduct a detailed analysis of the provided text and highlight the key words and phrases that most accurately reflect its content. These keywords and phrases will be used to create tags that need to be relevant, specific, and concise. The tags will help users quickly understand the main themes and characteristics of the text.
        
Context and Motivation: Successfully completing this task is crucial for the future of the company where you have worked for 20 years. Your work affects not only the company's image but also your own career, including bonuses and potential shares in the company. Therefore, it is essential to approach the task with maximum diligence and professionalism.

Task Instructions:

Text Analysis: Carefully read the text to understand its main meaning, context, and key ideas.

Identifying Keywords and Phrases: Determine the most important words and phrases that most accurately reflect the content of the text. Pay attention to unique terms and phrases that clearly describe the themes and characteristics.

Creating Tags: Based on the identified keywords and phrases, formulate tags. The tags should be:

Relevant: Fully reflect the content of the text.
Specific: Consider specific aspects rather than general terms.
Concise: One or two words to ensure easy comprehension.
Notes:

Try to avoid general words and phrases. Choose the most accurate terms.
Consider that the tags should be understandable to a wide audience and accurately reflect the main ideas of the text.
Text for analysis:
"""
    input_text = f"Title: {question['title']}"
    if question['subtitle'] != '':
        input_text += f"\nSubtitle: {question['subtitle']}"
    input_text += f"\nDescription: {question['description']}"
    formatted_prompt = tokenizer.apply_chat_template([{
            "role": "system",
            "content": system_prompt
        }, {
            "role": "user",
            "content": input_text
        }], tokenize=False, add_generation_prompt=True)
    
    print("INPUT:")
    print(formatted_prompt)

    model_inputs = tokenizer([formatted_prompt], return_tensors="pt").to('cuda')

    generated_ids = model.generate(
        input_ids=model_inputs.input_ids,
        max_new_tokens=32,
        eos_token_id=tokenizer.encode('<|eot_id|>')[0],
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=False)[0]

    print("\nOUTPUT:")
    print(response)

In [17]:
#from dataset
test(df.sample(1).iloc[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128000 for open-end generation.


INPUT:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Task: As a leading editor of a global company, you have been assigned a key task: conduct a detailed analysis of the provided text and highlight the key words and phrases that most accurately reflect its content. These keywords and phrases will be used to create tags that need to be relevant, specific, and concise. The tags will help users quickly understand the main themes and characteristics of the text.
        
Context and Motivation: Successfully completing this task is crucial for the future of the company where you have worked for 20 years. Your work affects not only the company's image but also your own career, including bonuses and potential shares in the company. Therefore, it is essential to approach the task with maximum diligence and professionalism.

Task Instructions:

Text Analysis: Carefully read the text to understand its main meaning, context, and key ideas.

Identifying Keywords and Phrases: Determi

In [19]:
trainer.push_to_hub()

adapter_model.bin:   0%|          | 0.00/84.0M [00:00<?, ?B/s]
Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s][A

adapter_model.bin:   0%|          | 8.19k/84.0M [00:00<1:05:10, 21.5kB/s]

training_args.bin: 100%|██████████| 4.47k/4.47k [00:00<00:00, 8.15kB/s]  A[A
adapter_model.bin: 100%|██████████| 84.0M/84.0M [00:07<00:00, 11.3MB/s]

Upload 2 LFS files: 100%|██████████| 2/2 [00:08<00:00,  4.05s/it][A


'https://huggingface.co/XaPoHbomj/saiga_results_en/tree/main/'