In [None]:
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0

# Format the train and test dataset as required by Amazon Nova for Amazon Bedrock FT

### Install the requirements

In [None]:
! pip install -r ../requirements.txt

### Import required packages

In [None]:
import os
import pandas as pd
import random
import json
import logging
from enum import Enum
from tqdm import tqdm
import boto3
from botocore.config import Config

In [None]:
# System prompt attached to every fine-tuning record (reused by the test cell).
prompt_list = "You are a bot that can handle different requests with tools."

# --- Train dataset ---
# Every non-blank line of the question bank is evaluated into a dict that is
# read below through the keys 'question', 'answer' and 'args'.
train_question_bank_path = "../assets/train_data.txt"
train_question_list = []
with open(train_question_bank_path) as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank/trailing lines instead of failing in eval()
        # NOTE(review): eval() executes whatever expression the line contains,
        # so this is only acceptable for a trusted file. If the lines are plain
        # literals, ast.literal_eval would be a safer drop-in replacement.
        train_question_list.append(eval(line))

# Build one record per question: the system prompt plus a user/assistant turn.
ft_sample_data_list = []
for data_i in train_question_list:
    # The assistant target keeps the original rendering: 'answer' becomes the
    # tool name and 'args' the parameters, formatted with Python's str().
    assistant_text = f"{{'name':{data_i['answer']}, 'parameters':{data_i['args']}}}"
    ft_sample_data_list.append(
        {
            "system": prompt_list,
            "messages": [
                {"role": "user", "content": data_i["question"].strip()},
                {"role": "assistant", "content": assistant_text},
            ],
        }
    )

output_path = "../assets/bedrock_nova_ft/train_ft.jsonl"
# DataFrame.to_json does not create missing directories, so make sure it exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# One JSON object per line (JSON Lines).
df_train = pd.DataFrame(ft_sample_data_list)
df_train.to_json(output_path, orient="records", lines=True)
        

In [None]:
# --- Test dataset ---
# Every non-blank line of the question bank is evaluated into a dict that is
# read below through the keys 'question', 'answer' and 'args'.
test_question_bank_path = "../assets/test_data.txt"
test_question_list = []
with open(test_question_bank_path) as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank/trailing lines instead of failing in eval()
        # NOTE(review): eval() executes whatever expression the line contains,
        # so this is only acceptable for a trusted file. If the lines are plain
        # literals, ast.literal_eval would be a safer drop-in replacement.
        test_question_list.append(eval(line))

# Build one record per question: the system prompt plus a user/assistant turn.
ft_sample_data_list = []
for data_i in test_question_list:
    # The assistant target keeps the original rendering: 'answer' becomes the
    # tool name and 'args' the parameters, formatted with Python's str().
    assistant_text = f"{{'name':{data_i['answer']}, 'parameters':{data_i['args']}}}"
    ft_sample_data_list.append(
        {
            "system": prompt_list,
            "messages": [
                {"role": "user", "content": data_i["question"].strip()},
                {"role": "assistant", "content": assistant_text},
            ],
        }
    )

# Written next to train_ft.jsonl, in the directory the later loading cell
# reads test_ft.jsonl from (previously this went to a different folder).
output_path = "../assets/bedrock_nova_ft/test_ft.jsonl"
# DataFrame.to_json does not create missing directories, so make sure it exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# One JSON object per line (JSON Lines).
df_test = pd.DataFrame(ft_sample_data_list)
df_test.to_json(output_path, orient="records", lines=True)
        


## Reformat the files to include the tool config and the appropriate tool-calling prompt in the 'messages'

### Setup tools

To properly train our model on tool usage we need to define our tool definitions. We can do so by defining functions with explicit typed inputs and structured docstrings. 

We are going to define 8 tools:
- weather_api_call
- stat_pull
- text_to_sql
- terminal
- wikipedia
- duckduckgo_results_json
- youtube_search
- pubmed_search

While we are defining 8 tools, we are only going to train our model on 7 of them. This is so that we can test out our performance on unseen tools after training.

In [None]:
import weather_api_call, stat_pull,terminal,text_to_sql,wikipidea,youtube_search, pubmed_search, duckduckgo_results_json


In [None]:
# Assemble the tool configuration from the spec of every imported tool module.
# The order of the modules determines the order of the entries under "tools".
tool_modules = [
    weather_api_call,
    stat_pull,
    terminal,
    text_to_sql,
    wikipidea,
    youtube_search,
    pubmed_search,
    duckduckgo_results_json,
]
tool_config = {"tools": [tool_module.get_tool_spec() for tool_module in tool_modules]}



In [None]:
# Display the assembled tool configuration for inspection.
tool_config

### Define the appropriate prompt template for tool calling 


In [None]:

# Instruction template for the user turn of every record. The {tool_config}
# and {question} placeholders are filled in later with str.replace() rather
# than str.format(), so the literal JSON braces in the example response
# format below do not need escaping.
promt_template = """
Given the following functions within <tools>, please respond with a JSON for a function call with its proper arguments that best answers the given prompt.
Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.Do not use variables. Donot give any explanations. 
ONLY output the resulting JSON structure and nothing else.Donot use the word 'json' anywhere in the result.

<tools>{tool_config}</tools>

Generate answer for the following question.
<question>{question}</question>
"""
# Serialize the tool configuration to a pretty-printed (2-space indent) JSON
# string; this is the text substituted for the {tool_config} placeholder.
formatted_tool_config = json.dumps(tool_config, indent=2)


## Format the train and test data to insert the tool use config

## Training data

Let's load our training data.

In [None]:
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Location of a UTF-8 encoded file with one JSON document per line.

    Returns:
        list: The parsed JSON value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            raw_line = raw_line.strip()
            if raw_line:  # a blank (e.g. trailing) line is skipped, not parsed
                records.append(json.loads(raw_line))
    return records


train_question_bank_path = "../assets/bedrock_nova_ft/train_ft.jsonl"
test_question_bank_path = "../assets/bedrock_nova_ft/test_ft.jsonl"

# Load both splits with the same reader so they are parsed identically.
train_question_list = load_jsonl(train_question_bank_path)
test_question_list = load_jsonl(test_question_bank_path)

In [None]:
# Report the dataset sizes, then look at one of our training examples.
# The example must be the LAST expression of the cell: Jupyter only renders
# the value of the final expression, so placing it before print() showed nothing.
print(f"number of train records : {len(train_question_list)}, number of test records : {len(test_question_list)} \n")
train_question_list[0]

### Create our formatted dataset

We are now going to apply our chat template to our dataset and preprocess our examples. Because we are training, we need to include the template, the inputs, and the expected answer.

In [None]:
def to_tool_use_record(question_dict):
    """Return a formatted copy of one record, leaving the input untouched.

    The copy differs from ``question_dict`` as follows:
      * ``system`` is wrapped as ``[{"text": <system string>}]``;
      * the first message's content becomes ``[{"text": <prompt>}]`` where
        the prompt is ``promt_template`` with ``{question}`` replaced by the
        original content and ``{tool_config}`` by ``formatted_tool_config``;
      * the second message's content is wrapped as ``[{"text": <content>}]``;
      * ``schemaVersion`` is set to ``"tooluse-dataset-2024"``.

    Building a new dict instead of mutating the loaded record keeps the cell
    idempotent: re-running it no longer double-wraps ``system`` or fails in
    ``str.replace`` because the content has already been turned into a list.

    Args:
        question_dict: A record with a ``system`` string and a ``messages``
            list whose first two entries hold string ``content`` values.

    Returns:
        dict: The formatted record.
    """
    formatted = dict(question_dict)
    formatted['system'] = [{"text": question_dict['system']}]

    # Copy each message dict so the loaded records are not modified.
    messages = [dict(message) for message in question_dict['messages']]
    question = messages[0]['content']
    # Substitute the question first, then the tool config (original order).
    prompt = promt_template.replace("{question}", question)
    prompt = prompt.replace("{tool_config}", formatted_tool_config)
    messages[0]['content'] = [{"text": prompt}]
    messages[1]['content'] = [{"text": messages[1]['content']}]
    formatted['messages'] = messages

    # Added last so the serialized key order matches the previous output.
    formatted['schemaVersion'] = "tooluse-dataset-2024"
    return formatted


train_input_text = [to_tool_use_record(question_dict) for question_dict in tqdm(train_question_list)]

test_input_text = [to_tool_use_record(question_dict) for question_dict in tqdm(test_question_list)]

In [None]:
# Inspect the first formatted training record to sanity-check its layout.
train_input_text[0]
 

In [None]:
# Write the formatted records to JSON Lines files (one JSON object per line).
# The files go next to the intermediate train_ft.jsonl / test_ft.jsonl files
# in ../assets/bedrock_nova_ft, instead of the unrelated
# ../../assets/bedrock_olympus_ft path used before, which no other cell
# creates or reads.
output_dir = "../assets/bedrock_nova_ft"
os.makedirs(output_dir, exist_ok=True)  # open(..., 'w') does not create directories

output_train_file_path = f"{output_dir}/formatted_train_ft.jsonl"
output_test_file_path = f"{output_dir}/formatted_test_ft.jsonl"

for output_file_path, records in (
    (output_train_file_path, train_input_text),
    (output_test_file_path, test_input_text),
):
    with open(output_file_path, 'w', encoding='utf-8') as file:
        for item in records:
            file.write(json.dumps(item) + '\n')  # Convert to JSON string and write to file


### Upload the datasets to S3 before fine-tuning through Amazon Bedrock