In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer
from datasets import load_dataset

In [3]:
# Model id handed to AutoTokenizer.from_pretrained when the tokenizer is loaded.
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

In [4]:
# Load the "train" split of sarvamai/samvaad-hi-v1 and display its summary.
dataset = load_dataset("sarvamai/samvaad-hi-v1", split="train")
dataset

Dataset({
    features: ['messages'],
    num_rows: 101476
})

In [5]:
# Convert the dataset to a pandas DataFrame to preview its first 10 rows.
df = dataset.to_pandas()
df.head(10)

Unnamed: 0,messages
0,[{'content': 'Can you tell me about the 1988 R...
1,[{'content': 'Can you tell me more about the m...
2,[{'content': 'Mujhe Lavasa ke baare mein jaank...
3,[{'content': 'वैश्वीकरण भारत में स्वदेशी संस्क...
4,[{'content': 'Could you please tell me in Hind...
5,[{'content': 'मुझे एलामकुलम के बारे में ज्यादा...
6,[{'content': 'मेरा गुजारिश है कि आप हिंदी में ...
7,[{'content': 'मुझे जानकारी चाहिए करम बठ जी के ...
8,"[{'content': 'What are the Schouten Islands?',..."
9,[{'content': 'Tell me more about the Deodhar T...


In [6]:
# Load the tokenizer for `model_name`; its apply_chat_template method is used
# later in the notebook to render conversations.
# NOTE(review): trust_remote_code=True permits running code shipped with the
# model repo — confirm it is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [1]:

# data_point = [{'content': 'Can you tell me about the 1988 Rajya Sabha elections in English please?', 'role': 'user'}, {'content': "The 1988 Rajya Sabha elections were a significant event in Indian politics. The Rajya Sabha, which is the upper house of India's Parliament, sees periodic elections with members serving terms of six years. In 1988, elections took place in several states across India, electing new members to represent their respective constituencies.\n\nThe elected members were expected to serve from 1988 to 1994. However, there was a possibility for their term to end sooner due to circumstances such as resignation or death.\n\nAlso, apart from the main elections, specific special elections known as 'bye elections' also occurred in 1988. These are typically held to fill vacancies that arise during the tenure of the elected members.", 'role': 'assistant'}, {'content': 'Was it a norm for members to serve for a term of six years back then?', 'role': 'user'}, {'content': "Yes, the norm for the Rajya Sabha members was and continues to be to serve for a term of six years. This tenure is a standard part of India's democratic process and applies nationwide, regardless of the member's state or party affiliation. After the term ends, the members could choose to stand for re-election. Please note that although the term is for six years, a member's tenure may end before this period if they resign, pass away, or are otherwise unable to fulfill their obligations.", 'role': 'assistant'}, {'content': 'What is the purpose of the bye-elections?', 'role': 'user'}, {'content': 'By-elections, also known as special elections, are held to fill any political office that becomes vacant between the regular scheduled elections. 
The reasons for these vacancies can vary and include situations such as the death of an existing member, the resignation of a member, or the invalidation of an election.\n\nIn the context of the 1988 Rajya Sabha elections, bye-elections took place to fill the seats of any member who could not continue their term until 1994. The contestants for these bye-elections vary, and could include defeated candidates from the main election or new candidates from the respective political parties. This ensures that the democratic representation in the Rajya Sabha continues uninterrupted.', 'role': 'assistant'}]

In [7]:
import json
from tqdm import tqdm

# System prompt prepended to every conversation unless the caller overrides it.
_GAJA_SYSTEM_PROMPT = "You are Gaja, an AI assistant created by Cognitivelab and trained on top of Llama 3 Large language model(LLM), proficient in English and Hindi. You can respond in both languages based on the users request."

# Only these roles are copied from the source conversation.
_SUPPORTED_ROLES = ("user", "assistant")


def json_to_prompt(json_data, chat_tokenizer=None, system_prompt=_GAJA_SYSTEM_PROMPT):
    """Render one conversation with the tokenizer's chat template.

    A system message is placed first, followed by every "user" and
    "assistant" entry of ``json_data`` in their original order. Entries with
    any other role are skipped with a warning; previously they either raised
    an UnboundLocalError (when first in the conversation) or caused the
    preceding message to be appended a second time.

    :param json_data: iterable of dicts, each with "role" and "content" keys
    :param chat_tokenizer: object exposing ``apply_chat_template``; when None
        the notebook-level ``tokenizer`` is used
    :param system_prompt: content of the leading system message
    :return: tuple ``(text, messages)`` where ``text`` is the value returned
        by ``apply_chat_template(messages, tokenize=False,
        add_generation_prompt=False)`` and ``messages`` is the list of
        role/content dicts that was rendered
    :raises KeyError: if an entry lacks "content" or "role"
    """
    import warnings

    # Resolve the tokenizer lazily so the global is only required when no
    # tokenizer is injected.
    active_tokenizer = tokenizer if chat_tokenizer is None else chat_tokenizer

    messages = [{"role": "system", "content": system_prompt}]

    for entry in json_data:
        content = entry["content"]
        role = entry["role"]
        if role not in _SUPPORTED_ROLES:
            # Never reuse the message built for the previous entry.
            warnings.warn(
                f"json_to_prompt: skipping message with unsupported role {role!r}",
                stacklevel=2,
            )
            continue
        messages.append({"role": role, "content": content})

    tokenized_message = active_tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False,
    )

    return tokenized_message, messages


def generate_prompt(data_point):
    """Build the chat-formatted prompt for a single dataset row.

    :param data_point: mapping whose "messages" entry holds the conversation
    :return: tuple: the two values produced by ``json_to_prompt`` for that
        conversation (the rendered text and the list of messages)
    """
    conversation = data_point["messages"]
    rendered_text, chat_messages = json_to_prompt(conversation)
    return rendered_text, chat_messages


# Render every conversation once, collecting the chat-template text and the
# structured message list that produced it.
text_column = []
message_column = []

for data_point in tqdm(dataset, desc="Processing dataset"):
    text, chat_template = generate_prompt(data_point)
    text_column.append(text)
    message_column.append(chat_template)

# Columns to attach, in the order they are added. "text" intentionally carries
# the same values as "llama3_prompt_format".
generated_columns = {
    "llama3_prompt_format": text_column,
    "text": text_column,
    "prompt_template": message_column,
}

# `dataset` is rebound below, so on a second run of this cell the generated
# columns already exist and add_column would fail; drop them first.
stale_columns = [name for name in generated_columns if name in dataset.column_names]
if stale_columns:
    dataset = dataset.remove_columns(stale_columns)

for column_name, column_values in generated_columns.items():
    dataset = dataset.add_column(column_name, column_values)

# Display updated dataset
dataset

Processing dataset:   0%|          | 0/101476 [00:00<?, ?it/s]

Processing dataset: 100%|██████████| 101476/101476 [00:10<00:00, 9494.11it/s]


Dataset({
    features: ['messages', 'llama3_prompt_format', 'text', 'prompt_template'],
    num_rows: 101476
})

In [11]:
# Upload the dataset (with its added prompt columns) to the Hugging Face Hub
# under this repository id.
dataset.push_to_hub("CognitiveLab/Samvaad_Hindi_Hinglish_Llama3_Prompt_formate")

Uploading the dataset shards:   0%|          | 0/5 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/21 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/21 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/21 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/21 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/21 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/CognitiveLab/Samvaad_Hindi_Hinglish_Llama3_Prompt_formate/commit/51efdff6dcf929cc88143c3cd675286fbb75dc29', commit_message='Upload dataset', commit_description='', oid='51efdff6dcf929cc88143c3cd675286fbb75dc29', pr_url=None, pr_revision=None, pr_num=None)

In [9]:
# Small hand-written conversation (system, user, assistant) used to inspect
# what the tokenizer's chat template produces.
messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
    {"role": "assistant", "content": "I am llama3 the best LLM out there"}
]

In [10]:
# Render `messages` through the chat template as a string (tokenize=False),
# without appending a generation prompt, then display the result.
tokenized = tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=False
)

tokenized

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a pirate chatbot who always responds in pirate speak!<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI am llama3 the best LLM out there<|eot_id|>'