In [None]:
!pip install datasets

In [None]:
import json
import os.path
import pandas as pd
from datasets import Dataset
from datasets import load_dataset
from datasets import disable_caching

import boto3

import sagemaker
from sagemaker.s3 import S3Uploader


# Module-level flags read and written by prepare_dataset_sciq() and
# prepare_dataset_squad() to remember whether load_dataset() has already run.
dataset_loaded_sciq = False
dataset_loaded_squad = False

# Placeholder so the `dataset` global exists before a real dataset is loaded.
dataset = {'train': ["a", "b"], 'test': ["a", "b"]}

# Switch off caching in the Hugging Face `datasets` library.
disable_caching()

In [None]:
def print_dashes():
    """Print a horizontal rule made of 99 dashes."""
    print('-' * 99)

def translate_to_text(data, column_name="context"):
    """Join the distinct values of one column into a single text blob.

    Args:
        data: Anything ``pd.DataFrame`` accepts (list of records, dict of
            columns, an existing DataFrame, ...).
        column_name: Column whose values are de-duplicated and concatenated.

    Returns:
        The unique values of ``column_name`` in first-seen order, separated
        by ``" \\n"`` (a space followed by a newline).
    """
    frame = pd.DataFrame(data)
    unique_rows = frame.drop_duplicates(subset=[column_name])
    return " \n".join(unique_rows[column_name])

def safe_open_w(path):
    """Open ``path`` for text writing, creating missing parent directories.

    Args:
        path: Destination file path. May be a bare file name, in which case
            the file is created in the current working directory.

    Returns:
        A file object opened in ``'w'`` mode. The caller is responsible for
        closing it (typically via a ``with`` statement).
    """
    parent_dir = os.path.dirname(path)
    # os.makedirs('') raises FileNotFoundError, so only create a directory
    # when the path actually has a directory component.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    return open(path, 'w')

def write_to_file(input_str, file_path):
    """Write ``input_str`` to ``file_path``, replacing any existing content.

    The file is opened through ``safe_open_w`` and closed when the write
    finishes.
    """
    out = safe_open_w(file_path)
    with out:
        out.write(input_str)

def create_custom_template(template, file_path):
    """Serialize ``template`` as JSON into ``file_path`` and return it.

    Args:
        template: JSON-serializable object to store.
        file_path: Destination file, opened through ``safe_open_w``.

    Returns:
        The same ``template`` object that was passed in.
    """
    handle = safe_open_w(file_path)
    with handle:
        json.dump(template, handle)
    return template

def attach_cors_to_bucket(bucket_name):
    """Attach a permissive CORS configuration to an S3 bucket.

    A single rule is installed that allows all headers and origins, the
    PUT/GET/POST/DELETE/HEAD methods, and exposes a fixed list of response
    headers.

    Args:
        bucket_name: Name of the bucket to configure.

    Returns:
        The ``put_bucket_cors`` response on success, or ``None`` when the
        S3 call fails with a ``ClientError`` (the failure is printed, not
        raised, so callers can continue on a best-effort basis).
    """
    s3 = boto3.client('s3')

    cors_configuration = {
        'CORSRules': [
            {
                'ID': bucket_name + 'cors',
                'AllowedHeaders': ['*'],
                'AllowedMethods': ['PUT', 'GET', 'POST', 'DELETE', 'HEAD'],
                'AllowedOrigins': ['*'],
                'ExposeHeaders': ['ETag', 'x-amz-delete-marker', 'x-amz-server-side-encryption',
                                  'x-amz-request-id', 'x-amz-version-id', 'x-amz-id-2'],
            }
        ]
    }

    try:
        response = s3.put_bucket_cors(Bucket=bucket_name,
                                      CORSConfiguration=cors_configuration)
    # ClientError is reached through the client object because the name
    # `ClientError` itself is not imported anywhere in this file.
    except s3.exceptions.ClientError as e:
        print(f"Could not attach CORS configuration to bucket {bucket_name}: {e}")
        return None
    return response

def upload_workshop_dataset(dataset_name,
                            output_bucket = None,
                            local_path = "."):
    """Upload whichever prepared dataset files exist locally to S3.

    Files are looked up in ``local_path`` and uploaded beneath
    ``s3://<output_bucket>/datasets/<dataset_name>/``. Files that are not
    present locally are skipped silently.

    Args:
        dataset_name: Name used as the last component of the S3 prefix.
        output_bucket: Target bucket name. When ``None`` (the default), the
            SageMaker session's default bucket is resolved at call time.
        local_path: Directory holding the generated files.
    """
    # Resolve the default lazily: evaluating it in the signature would call
    # AWS once, when the function is defined, and freeze that result.
    if output_bucket is None:
        output_bucket = sagemaker.Session().default_bucket()

    attach_cors_to_bucket(output_bucket)  # requirement by Studio UI

    data_location = f"s3://{output_bucket}/datasets/" + dataset_name

    fine_tune_data_ist_location = f"{data_location}/fine_tuning/instruction_fine_tuning"
    fine_tune_data_daft_location = f"{data_location}/fine_tuning/domain_adaptation_fine_tuning"
    evaluation_data_location = f"{data_location}/evaluation/automatic"
    evaluation_data_small_location = f"{data_location}/evaluation/automatic_small"
    evaluation_data_hil_location = f"{data_location}/evaluation/hil"

    # (local file name, S3 destination, message before upload, message after upload)
    uploads = [
        ("template.json", fine_tune_data_ist_location,
         "Uploading custom template...",
         "Done"),
        ("dataset_finetune_ist.jsonl", fine_tune_data_ist_location,
         "Uploading instruction tuning dataset...",
         f"Fine-tuning ist data: {fine_tune_data_ist_location}"),
        ("dataset_finetune_daft.txt", fine_tune_data_daft_location,
         "Uploading domain adaptation tuning dataset...",
         f"Fine-tuning daft data: {fine_tune_data_daft_location}"),
        ("dataset_evaluation.jsonl", evaluation_data_location,
         "Uploading evaluation dataset...",
         f"Evaluation data: {evaluation_data_location}"),
        ("dataset_evaluation_small.jsonl", evaluation_data_small_location,
         "Uploading small evaluation dataset...",
         f"Evaluation data: {evaluation_data_small_location}"),
        ("dataset_evaluation_hil.jsonl", evaluation_data_hil_location,
         "Uploading HIL evaluation dataset...",
         f"Evaluation data: {evaluation_data_hil_location}"),
    ]

    for file_name, destination, start_message, done_message in uploads:
        local_file = f"{local_path}/{file_name}"
        if not os.path.isfile(local_file):
            continue
        print(start_message)
        S3Uploader.upload(local_file, destination)
        print(done_message)




def prepare_dataset_sciq():
    """Build the SciQ fine-tuning and evaluation files and upload them.

    Steps:
      1. Load the ``sciq`` dataset into the shared ``dataset`` global
         (reloaded whenever the global may hold a different dataset).
      2. Sample the training split and write, under ``./sciq/``:
         ``dataset_finetune_daft.txt``, ``dataset_finetune_ist.jsonl``,
         ``template.json``, ``dataset_evaluation.jsonl``,
         ``dataset_evaluation_small.jsonl`` and
         ``dataset_evaluation_hil.jsonl``.
      3. Upload the directory content through ``upload_workshop_dataset``.
    """
    dataset_name = 'sciq'

    # required for ipython shell
    global dataset_loaded_sciq
    global dataset_loaded_squad
    global dataset

    print(dataset_loaded_sciq)

    # Both prepare_* functions share the single `dataset` global. If the
    # SQuAD flag is set, the global currently holds SQuAD data, so it must be
    # reloaded even when SciQ was loaded at some earlier point.
    if not dataset_loaded_sciq or dataset_loaded_squad:
        dataset = load_dataset(dataset_name)
        dataset_loaded_sciq = True
        dataset_loaded_squad = False

    dataset_training_df = pd.DataFrame(dataset['train'])
    dataset_validation_df = pd.DataFrame(dataset['test'])

    # The budget is divided by the column count before sampling, i.e. it is
    # expressed in DataFrame cells (rows * columns, like `.size`), not rows.
    number_of_raws_training = 5000 # dataset_training_df.size
    dataset_training_df = dataset_training_df.sample(n=int(number_of_raws_training/len(dataset_training_df.columns)), random_state=42, ignore_index=True)

    #number_of_raws_validation = 2000 # dataset_training_df.size
    #dataset_validation_df = dataset_validation_df.sample(n=int(number_of_raws_validation/len(dataset_validation_df.columns)), random_state=42, ignore_index=True)

    print_dashes()
    print("Load data")
    print_dashes()
    print("Train dataset size " + str(dataset_training_df.size) + " with columns" + str(dataset_training_df.columns.to_list()) )
    print("Validation dataset size " + str(dataset_validation_df.size) + " with columns" + str(dataset_validation_df.columns.to_list()) )
    print_dashes()

    # Domain-adaptation corpus: the distinct `support` passages as plain text.
    print("\nCreate DAFT dataset")
    print_dashes()
    data_train_daft = translate_to_text(dataset_training_df, column_name='support')
    print("DAFT dataset example: " + data_train_daft[0:1000])
    print("Export DAFT dataset: dataset_finetune_daft.txt")
    write_to_file(data_train_daft, f"./{dataset_name}/dataset_finetune_daft.txt")
    print_dashes()

    print("\nCreate IST dataset")
    print_dashes()

    # Toggle between a prompt with the supporting passage and a question-only prompt.
    include_context = False

    if include_context:
        fields = ['support', 'question', 'correct_answer']
        template = {
            "prompt": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{question}\n\n### Input:\n{support}",
            "completion": "{correct_answer}"
        }

    else:
        fields = ['question', 'correct_answer']
        template = {
            "prompt": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction: Answer this question:\n{question}\n",
            "completion": "{correct_answer}"
        }

    dataset_train_ist_df = dataset_training_df[fields].copy()
    # str + Series concatenates element-wise, so this prints the first row
    # with the label prefixed to every field.
    print("IST dataframe example: " + dataset_train_ist_df.iloc[0])
    dataset_fine_tune_ist = Dataset.from_pandas(dataset_train_ist_df)
    print("IST dataset example: ",dataset_fine_tune_ist[0])
    print("Exoprt IST dataset: dataset_finetune_ist.jsonl")
    dataset_fine_tune_ist.to_json(f"./{dataset_name}/dataset_finetune_ist.jsonl",orient='records', lines=True)
    print_dashes()

    print("\nCreate prompt template and store to template.json")
    print_dashes()
    print(create_custom_template(template, f"./{dataset_name}/template.json"))
    print_dashes()

    print("\nCreate evaluation dataset")
    print_dashes()

    # Both prompt variants are always built: the with-context frame also
    # feeds the small and HIL evaluation sets further down.
    dataset_validation_with_context_df = dataset_validation_df.copy()
    dataset_validation_with_context_df["model_input"] = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n" + dataset_validation_with_context_df["question"] + "\n\n### Input:\n" + dataset_validation_with_context_df["support"]
    dataset_validation_with_context_df = dataset_validation_with_context_df[['model_input','correct_answer']].copy()
    dataset_validation_with_context_df = dataset_validation_with_context_df.rename(columns={"correct_answer": "target_output"})

    dataset_validation_no_context_df = dataset_validation_df.copy()
    dataset_validation_no_context_df["model_input"] = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction: Answer this question:\n"+ dataset_validation_no_context_df["question"]
    dataset_validation_no_context_df = dataset_validation_no_context_df[['model_input','correct_answer']].copy()
    dataset_validation_no_context_df = dataset_validation_no_context_df.rename(columns={"correct_answer": "target_output"})

    if include_context:
        print("Evaluation dataframe example: " + dataset_validation_with_context_df.iloc[0])
        dataset_evaluation = Dataset.from_pandas(dataset_validation_with_context_df)
        print("Evaluation dataset example: ", dataset_evaluation[0])
    else:
        print("Evaluation dataframe example: " + dataset_validation_no_context_df.iloc[0])
        dataset_evaluation = Dataset.from_pandas(dataset_validation_no_context_df)
        print("Evaluation dataset example: ", dataset_evaluation[0])
    print("Export Evaluation dataset: dataset_evaluation.jsonl")
    dataset_evaluation.to_json(f"./{dataset_name}/dataset_evaluation.jsonl")
    print_dashes()

    # making the evaluation_small dataset
    # NOTE(review): this always uses the with-context prompts, even when
    # include_context is False - confirm that this is intended.
    print("\nCreate evaluation small dataset")

    dataset_evaluation_small_df = dataset_validation_with_context_df.head(10)
    print("Evaluation dataframe small example: " + dataset_evaluation_small_df.iloc[0])
    dataset_evaluation_small = Dataset.from_pandas(dataset_evaluation_small_df)
    print("Evaluation dataset small example: ", dataset_evaluation_small[0])
    print("Export Evaluation dataset: dataset_evaluation_small.jsonl")
    dataset_evaluation_small.to_json(f"./{dataset_name}/dataset_evaluation_small.jsonl")
    print_dashes()

    # making the evaluation_hil dataset
    # Same first 10 rows, with the columns renamed to
    # `prompt` / `referenceResponse`.
    print("\nCreate evaluation HIL dataset")

    dataset_evaluation_hil_df = dataset_validation_with_context_df.head(10)
    dataset_evaluation_hil_df = dataset_evaluation_hil_df.rename(columns={"model_input": "prompt", "target_output": "referenceResponse"})
    print("Evaluation dataframe hil example: " + dataset_evaluation_hil_df.iloc[0])
    dataset_evaluation_hil = Dataset.from_pandas(dataset_evaluation_hil_df)
    print("Evaluation dataset hil example: ", dataset_evaluation_hil[0])
    print("Export Evaluation dataset: dataset_evaluation_hil.jsonl")
    dataset_evaluation_hil.to_json(f"./{dataset_name}/dataset_evaluation_hil.jsonl")
    print_dashes()

    upload_workshop_dataset(dataset_name = dataset_name, output_bucket = sagemaker.Session().default_bucket(), local_path = f"./{dataset_name}")

def prepare_dataset_squad():
    """Build the SQuAD fine-tuning and evaluation files and upload them.

    Steps:
      1. Load the ``squad`` dataset into the shared ``dataset`` global
         (reloaded whenever the global may hold a different dataset).
      2. From the ``train`` / ``validation`` splits write, under
         ``./squad/``: ``dataset_finetune_daft.txt``,
         ``dataset_finetune_ist.jsonl``, ``template.json``,
         ``dataset_evaluation.jsonl``, ``dataset_evaluation_small.jsonl``
         and ``dataset_evaluation_hil.jsonl``.
      3. Upload the directory content through ``upload_workshop_dataset``.
    """
    dataset_name = 'squad'

    # required for ipython shell
    global dataset_loaded_sciq
    global dataset_loaded_squad
    global dataset

    # Both prepare_* functions share the single `dataset` global. If the
    # SciQ flag is set, the global currently holds SciQ data, so it must be
    # reloaded even when SQuAD was loaded at some earlier point.
    if not dataset_loaded_squad or dataset_loaded_sciq:
        dataset = load_dataset(dataset_name)
        dataset_loaded_squad = True
        dataset_loaded_sciq = False

    dataset_training_df = pd.DataFrame(dataset['train'])
    dataset_validation_df = pd.DataFrame(dataset['validation'])

    #number_of_raws_training = 5000 # dataset_training_df.size
    #dataset_training_df = dataset_training_df.sample(n=int(number_of_raws_training/len(dataset_training_df.columns)), random_state=42, ignore_index=True)

    #number_of_raws_validation = 2000 # dataset_training_df.size
    #dataset_validation_df = dataset_validation_df.sample(n=int(number_of_raws_validation/len(dataset_validation_df.columns)), random_state=42, ignore_index=True)

    print_dashes()
    print("Load data")
    print_dashes()
    print("Train dataset size " + str(dataset_training_df.size) + " with columns" + str(dataset_training_df.columns.to_list()) )
    print("Validation dataset size " + str(dataset_validation_df.size) + " with columns" + str(dataset_validation_df.columns.to_list()) )
    print_dashes()

    # Domain-adaptation corpus: the distinct `context` passages as plain text.
    print("\nCreate DAFT dataset")
    print_dashes()
    data_train_daft = translate_to_text(dataset_training_df)
    print("DAFT dataset example: " + data_train_daft[0:1000])
    print("Exoprt DAFT dataset: dataset_finetune_daft.txt")
    write_to_file(data_train_daft, f"./{dataset_name}/dataset_finetune_daft.txt")
    print_dashes()

    print("\nCreate IST dataset")
    print_dashes()
    dataset_train_ist_df = dataset_training_df[['context', 'question', 'answers']].copy()
    # Keep only the first answer text as the training completion.
    dataset_train_ist_df['answers'] = dataset_train_ist_df['answers'].apply(lambda x: str(x["text"][0]))
    # str + Series concatenates element-wise, so this prints the first row
    # with the label prefixed to every field.
    print("IST dataframe example: " + dataset_train_ist_df.iloc[0])
    dataset_fine_tune_ist = Dataset.from_pandas(dataset_train_ist_df)
    print("IST dataset example: ",dataset_fine_tune_ist[0])
    print("Exoprt IST dataset: dataset_finetune_ist.jsonl")
    dataset_fine_tune_ist.to_json(f"./{dataset_name}/dataset_finetune_ist.jsonl")
    print_dashes()

    print("\nCreate prompt template and store to template.json")
    print_dashes()
    template = {
          "prompt": "Given the following context: {context}\n\nCould you answer this question: {question} ",
          "completion": "{answers}"
        }
    print(create_custom_template(template, f"./{dataset_name}/template.json"))
    print_dashes()

    print("\nCreate evaluation dataset")
    print_dashes()
    # NOTE(review): unlike the template above, this prompt has no space after
    # "context:" and no trailing space - confirm whether they should match.
    dataset_validation_df["model_input"] = "Given the following context:" + dataset_validation_df["context"] + "\n\nCould you answer this question: " + dataset_validation_df["question"]
    dataset_evaluation_df = dataset_validation_df[['model_input','answers']].copy()
    # All answer texts are joined into one target string separated by ' <OR> '.
    dataset_evaluation_df['answers'] = dataset_evaluation_df['answers'].apply(lambda x: str(' <OR> '.join(data for data in x["text"])))
    dataset_evaluation_df = dataset_evaluation_df.rename(columns={"answers": "target_output"})
    print("Evaluation dataframe example: " + dataset_evaluation_df.iloc[0])
    dataset_evaluation = Dataset.from_pandas(dataset_evaluation_df)
    print("Evaluation dataset example: ", dataset_evaluation[0])
    print("Exoprt Evaluation dataset: dataset_evaluation.jsonl")
    dataset_evaluation.to_json(f"./{dataset_name}/dataset_evaluation.jsonl")
    print_dashes()

    # Small evaluation set: the first 10 evaluation rows.
    print("\nCreate evaluation small dataset")
    dataset_evaluation_small_df = dataset_evaluation_df.head(10)
    print("Evaluation dataframe small example: " + dataset_evaluation_small_df.iloc[0])
    dataset_evaluation_small = Dataset.from_pandas(dataset_evaluation_small_df)
    print("Evaluation dataset small example: ", dataset_evaluation_small[0])
    print("Export Evaluation dataset: dataset_evaluation_small.jsonl")
    dataset_evaluation_small.to_json(f"./{dataset_name}/dataset_evaluation_small.jsonl")
    print_dashes()

    # making the evaluation_hil dataset
    # Same first 10 rows, with the columns renamed to
    # `prompt` / `referenceResponse`.
    print("\nCreate evaluation HIL dataset")

    dataset_evaluation_hil_df = dataset_evaluation_df.head(10)
    dataset_evaluation_hil_df = dataset_evaluation_hil_df.rename(columns={"model_input": "prompt", "target_output": "referenceResponse"})
    print("Evaluation dataframe hil example: " + dataset_evaluation_hil_df.iloc[0])
    dataset_evaluation_hil = Dataset.from_pandas(dataset_evaluation_hil_df)
    print("Evaluation dataset hil example: ", dataset_evaluation_hil[0])
    print("Export Evaluation dataset: dataset_evaluation_hil.jsonl")
    dataset_evaluation_hil.to_json(f"./{dataset_name}/dataset_evaluation_hil.jsonl")
    print_dashes()

    upload_workshop_dataset(dataset_name = dataset_name, output_bucket = sagemaker.Session().default_bucket(), local_path = f"./{dataset_name}")

# When executed as a script, prepare and upload only the SciQ dataset.
if '__main__' == __name__:
    prepare_dataset_sciq()