# Pre-process the data before running the customization job

### This notebook should work well with the Data Science 3.0 image, the Python 3 kernel, and an ml.t3.medium instance in SageMaker Studio Classic

## Import libraries

In [None]:
!pip install --upgrade sagemaker datasets 


## Prepare the data in the format required by Bedrock

In [None]:
import json
from datasets import load_dataset

# Load the training split of the Dolly dataset, which is the fine-tuning source
dolly_dataset = load_dataset("databricks/databricks-dolly-15k", split="train")

# Keep only the summarization examples; once filtered, the "category" column
# carries no further information, so it is dropped in the same step
summarization_dataset = dolly_dataset.filter(
    lambda row: row["category"] == "summarization"
).remove_columns("category")


def _to_prompt_completion(row):
    """Build the "prompt"/"completion" fields for one example.

    The prompt is the instruction followed by a single space and the context;
    the completion is the response, unchanged.
    """
    prompt_text = f"{row['instruction']} {row['context']}"
    return {"prompt": prompt_text, "completion": row["response"]}


# Add the prompt/completion columns expected by Bedrock to every example
# (the result is still a Dataset; the original columns are removed later)
modified_df = summarization_dataset.map(_to_prompt_completion)

# Per-row size limit for prompt + completion combined, intended to keep rows
# within the Bedrock customization max-token-length quota.
# NOTE: the check below counts characters, not tokens.
max_length_per_row = 15000

# Maximum number of rows kept in the training file (the dataset is truncated
# to this size further down). Presumably chosen to fit the Bedrock
# customization dataset-size quota -- confirm against current quotas.
max_row_count = 9000


def within_length(example):
    """Return True when prompt and completion together fit the per-row limit.

    Args:
        example: Mapping with string values under "prompt" and "completion".

    Returns:
        True if the combined character count of both fields is at most
        ``max_length_per_row``, otherwise False.
    """
    combined_length = len(example["prompt"]) + len(example["completion"])
    return combined_length <= max_length_per_row

# Keep only the examples accepted by within_length, then drop the source
# columns so that just "prompt" and "completion" remain in the output
modified_df = modified_df.filter(within_length).remove_columns(
    ["instruction", "context", "response"]
)

# Report how many rows passed the length filter (before any truncation)
num_rows = len(modified_df)
print(f"Number of rows: {num_rows}")

# Cap the dataset at the maximum supported size by keeping the first
# max_row_count rows
if num_rows > max_row_count:
    modified_df = modified_df.select(range(max_row_count))

# Write the dataset as JSON Lines (one record per line) to "train.jsonl"
# in the local working directory
modified_df.to_json("train.jsonl", orient="records", lines=True)

## Upload training data to S3

In [None]:
import boto3
from sagemaker import Session

# Build the AWS SDK objects first, then hand them to the SageMaker session
boto_session = boto3.session.Session()
sagemaker_client = boto3.client('sagemaker')
sagemaker_runtime_client = boto3.client('runtime.sagemaker')

session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_runtime_client=sagemaker_runtime_client,
)

# S3 resource from the AWS SDK (not used by the upload below; kept in case
# later cells rely on it)
s3 = boto3.resource('s3')

# Local path of the training file written in the previous step
train_data_path = 'train.jsonl'

# Upload the file under the 'PreProcessed' key prefix. No bucket is passed,
# so the session picks it (per the SageMaker SDK docs, its default bucket).
s3_train_data = session.upload_data(path=train_data_path, key_prefix='PreProcessed')

# Show where the file ended up
print(f"Uploaded {train_data_path} to {s3_train_data}")
