diff --git a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_01_foundation_model_playground/01.01_search_and_deploy_huggingface_llm.ipynb b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_01_foundation_model_playground/01.01_search_and_deploy_huggingface_llm.ipynb index 07345a8..8a2744d 100644 --- a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_01_foundation_model_playground/01.01_search_and_deploy_huggingface_llm.ipynb +++ b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_01_foundation_model_playground/01.01_search_and_deploy_huggingface_llm.ipynb @@ -15,7 +15,7 @@ "id": "fcbb5d61-8a0b-47d9-a7c5-0c041c82b8bf", "metadata": {}, "source": [ - "# 🚀 Deploy `deepseek-ai/DeepSeek-R1-Distill-Llama-8B` on Amazon SageMaker" + "# 🚀 Deploy `Qwen/Qwen3-4B-Instruct-2507` on Amazon SageMaker" ] }, { @@ -23,6 +23,8 @@ "id": "dd210e90-21e1-4f03-a08e-c3fba9aa6979", "metadata": {}, "source": [ + "## Prerequisites\n", + "\n", "To start off, let's install some packages to help us through the notebooks. **Restart the kernel after packages have been installed.**" ] }, @@ -57,6 +59,14 @@ "get_ipython().kernel.do_shutdown(True)" ] }, + { + "cell_type": "markdown", + "id": "a947367a-bea3-498a-9548-d6e6e08f0d10", + "metadata": {}, + "source": [ + "***" + ] + }, { "cell_type": "code", "execution_count": null, @@ -66,9 +76,9 @@ "source": [ "import os\n", "import sagemaker\n", - "from sagemaker.djl_inference import DJLModel\n", - "from ipywidgets import Dropdown\n", - "\n", + "import boto3\n", + "import shutil\n", + "from sagemaker.config import load_sagemaker_config\n", "import sys\n", "sys.path.append(os.path.dirname(os.getcwd()))\n", "\n", @@ -78,24 +88,12 @@ " print_dialog,\n", " format_messages,\n", " write_eula\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8b53f21c-3a65-44fc-b547-712d971cd652", - "metadata": {}, - "outputs": [], - "source": [ - "import boto3\n", - "import shutil\n", - "import sagemaker\n", - "from sagemaker.config import load_sagemaker_config\n", + ")\n", "\n", "sagemaker_session = sagemaker.Session()\n", "s3_client = boto3.client('s3')\n", "\n", + "region = sagemaker_session.boto_session.region_name\n", "bucket_name = sagemaker_session.default_bucket()\n", "default_prefix = sagemaker_session.default_bucket_prefix\n", "configs = load_sagemaker_config()\n", @@ -103,6 +101,7 @@ "session = sagemaker.Session()\n", "role = sagemaker.get_execution_role()\n", "\n", + "\n", "print(f\"Execution Role: {role}\")\n", "print(f\"Default S3 Bucket: {bucket_name}\")" ] @@ -130,11 +129,14 @@ "metadata": {}, "outputs": [], "source": [ - "inference_image_uri = sagemaker.image_uris.retrieve(\n", - " framework=\"djl-lmi\", \n", - " region=session.boto_session.region_name, \n", - " version=\"0.29.0\"\n", - ")\n", + "# commenting until LMI 0.33.0 available via SageMaker SDK\n", + "# inference_image_uri = sagemaker.image_uris.retrieve(\n", + "# framework=\"djl-lmi\", \n", + "# region=session.boto_session.region_name, \n", + "# version=\"0.33.0\"\n", + "# )\n", + "\n", + "inference_image_uri = f\"763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.33.0-lmi15.0.0-cu128\"\n", "pretty_print_html(f\"using image to host: {inference_image_uri}\")" ] }, @@ -153,7 +155,7 @@ "metadata": {}, "outputs": [], "source": [ - "model_id = \"deepseek-ai/DeepSeek-R1-Distill-Llama-8B\"\n", + "model_id = \"Qwen/Qwen3-4B-Instruct-2507\"\n", "model_id_filesafe = model_id.replace(\"/\",\"_\")\n", "\n", "use_local_model = True #set to false for the training job to 
download from HF, otherwise True will download locally" @@ -225,7 +227,7 @@ "metadata": {}, "outputs": [], "source": [ - "model_name = \"DeepSeek-R1-Distill-Llama-8B\"\n", + "model_name = \"Qwen3-4B-Instruct-2507\"\n", "\n", "lmi_model = sagemaker.Model(\n", " image_uri=inference_image_uri,\n", @@ -242,12 +244,15 @@ "metadata": {}, "outputs": [], "source": [ - "base_endpoint_name = f\"{model_name}-endpoint\"\n", + "from sagemaker.utils import name_from_base\n", + "\n", + "endpoint_name = f\"{model_name}-endpoint\"\n", + "BASE_ENDPOINT_NAME = name_from_base(endpoint_name)\n", "\n", "predictor = lmi_model.deploy(\n", " initial_instance_count=1, \n", " instance_type=\"ml.g5.2xlarge\",\n", - " endpoint_name=base_endpoint_name\n", + " endpoint_name=BASE_ENDPOINT_NAME\n", ")" ] }, @@ -258,30 +263,19 @@ "metadata": {}, "outputs": [], "source": [ - "base_prompt = f\"\"\"\n", - "<|begin_of_text|>\n", - "<|start_header_id|>system<|end_header_id|>\n", - "You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. \n", + "SYSTEM_PROMPT = f\"\"\"You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. \n", "Below is an instruction that describes a task, paired with an input that provides further context. \n", "Write a response that appropriately completes the request.\n", - "Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.\n", - "<|eot_id|><|start_header_id|>user<|end_header_id|>\n", - "{{question}}<|eot_id|>\n", - "<|start_header_id|>assistant<|end_header_id|>\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6b37e7f1-730c-4b31-aa3b-55e2009f8f04", - "metadata": {}, - "outputs": [], - "source": [ - "prompt = base_prompt.format(\n", - " question=\"A 3-week-old child has been diagnosed with late onset perinatal meningitis, and the CSF culture shows gram-positive bacilli. What characteristic of this bacterium can specifically differentiate it from other bacterial agents?\"\n", - ")\n", + "Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.\"\"\"\n", + "\n", + "USER_PROMPT = \"A 3-week-old child has been diagnosed with late onset perinatal meningitis, and the CSF culture shows gram-positive bacilli. 
What characteristic of this bacterium can specifically differentiate it from other bacterial agents?\"\n", "\n", - "print(prompt)" + "messages = [\n", + " {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n", + " {\"role\": \"user\", \"content\": USER_PROMPT},\n", + "]\n", + "\n", + "messages" ] }, { @@ -292,35 +286,44 @@ "outputs": [], "source": [ "predictor = sagemaker.Predictor(\n", - " endpoint_name=base_endpoint_name,\n", + " endpoint_name=BASE_ENDPOINT_NAME,\n", " sagemaker_session=sagemaker_session,\n", " serializer=sagemaker.serializers.JSONSerializer(),\n", " deserializer=sagemaker.deserializers.JSONDeserializer(),\n", ")\n", "\n", "response = predictor.predict({\n", - "\t\"inputs\": prompt,\n", + "\t\"messages\": messages,\n", " \"parameters\": {\n", " \"temperature\": 0.2,\n", " \"top_p\": 0.9,\n", " \"return_full_text\": False,\n", - " \"max_new_tokens\": 1024,\n", - " \"stop\": ['<|eot_id|>']\n", + " \"max_new_tokens\": 1024\n", " }\n", "})\n", "\n", - "response = response[\"generated_text\"].split(\"<|eot_id|>\")[0]\n", + "response[\"choices\"][0][\"message\"][\"content\"]" + ] + }, + { + "cell_type": "markdown", + "id": "165c8660-ee18-411f-9d8a-8032c6171d77", + "metadata": {}, + "source": [ + "### Store variables\n", "\n", - "response" + "Save the endpoint name for use later" ] }, { "cell_type": "code", "execution_count": null, - "id": "dbfc37bb-dc1f-4ba7-9948-6e482c1c86b0", + "id": "0ed6ca9e-705c-4d01-9118-110b86caaef6", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "%store BASE_ENDPOINT_NAME" + ] } ], "metadata": { diff --git a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_01_foundation_model_playground/scripts/requirements.txt b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_01_foundation_model_playground/scripts/requirements.txt index 43a0ec8..0220b35 100644 --- a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_01_foundation_model_playground/scripts/requirements.txt +++ b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_01_foundation_model_playground/scripts/requirements.txt @@ -1,16 +1,16 @@ -transformers==4.48.2 +transformers==4.53.0 peft==0.14.0 accelerate==1.3.0 bitsandbytes==0.45.1 datasets==3.2.0 evaluate==0.4.3 huggingface_hub[hf_transfer]==0.33.4 -mlflow +mlflow==2.22.2 safetensors>=0.4.5 -sagemaker==2.239.0 +sagemaker==2.252.0 sagemaker-mlflow==0.1.0 sentencepiece==0.2.0 scikit-learn==1.6.1 tokenizers>=0.21.0 trl==0.9.6 -py7zr \ No newline at end of file +py7zr==1.0.0 \ No newline at end of file diff --git a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_02_customize_foundation_model/02.01_finetune_deepseekr1.ipynb b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_02_customize_foundation_model/02.01_finetune_Qwen3-4B-instruct.ipynb similarity index 77% rename from workshops/fine-tuning-with-sagemakerai-and-bedrock/task_02_customize_foundation_model/02.01_finetune_deepseekr1.ipynb rename to workshops/fine-tuning-with-sagemakerai-and-bedrock/task_02_customize_foundation_model/02.01_finetune_Qwen3-4B-instruct.ipynb index cb15d2b..12617b1 100644 --- a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_02_customize_foundation_model/02.01_finetune_deepseekr1.ipynb +++ b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_02_customize_foundation_model/02.01_finetune_Qwen3-4B-instruct.ipynb @@ -85,11 +85,21 @@ "s3_client = boto3.client('s3')\n", "\n", "sagemaker_session = sagemaker.Session()\n", + "\n", + "region = sagemaker_session.boto_session.region_name\n", "bucket_name = 
sagemaker_session.default_bucket()\n", "default_prefix = sagemaker_session.default_bucket_prefix\n", "configs = load_sagemaker_config()" ] }, + { + "cell_type": "markdown", + "id": "9e64f9c1-8c48-455c-a8a7-ae3793d79cbe", + "metadata": {}, + "source": [ + "If you have your own MLflow tracking server, update the `TrackingServerName` value below to enable experiment tracking." + ] + }, { "cell_type": "code", "execution_count": null, @@ -123,7 +133,7 @@ "import os\n", "\n", "os.environ[\"mlflow_uri\"] = mlflow_tracking_server_uri\n", - "os.environ[\"mlflow_experiment_name\"] = \"deepseek-r1-distill-llama-8b-sft\"" + "os.environ[\"mlflow_experiment_name\"] = \"Qwen3-4B-Instruct-2507-sft\"" ] }, { @@ -156,12 +166,11 @@ "from datasets import load_dataset\n", "import pandas as pd\n", "\n", - "dataset = load_dataset(\"FreedomIntelligence/medical-o1-reasoning-SFT\", \"en\")\n", + "num_samples = 100\n", "\n", - "df = pd.DataFrame(dataset['train'])\n", - "df = df[:100]\n", + "full_dataset = load_dataset(\"FreedomIntelligence/medical-o1-reasoning-SFT\", \"en\", split=f\"train[:{num_samples}]\")\n", "\n", - "df.head()" + "full_dataset[0]" ] }, { @@ -171,12 +180,12 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.model_selection import train_test_split\n", - "\n", - "train, test = train_test_split(df, test_size=0.1, random_state=42, shuffle=True)\n", + "train_test_split_datasets = full_dataset.train_test_split(test_size=0.1, seed=42)\n", + "train_dataset = train_test_split_datasets[\"train\"]\n", + "test_dataset = train_test_split_datasets[\"test\"]\n", "\n", - "print(\"Number of train elements: \", len(train))\n", - "print(\"Number of test elements: \", len(test))" + "print(f\"Number of train elements: {len(train_dataset)}\")\n", + "print(f\"Number of test elements: {len(test_dataset)}\")" ] }, { @@ -204,28 +213,23 @@ }, "outputs": [], "source": [ - "# custom instruct prompt start\n", - "prompt_template = f\"\"\"\n", - "<|begin_of_text|>\n", - "<|start_header_id|>system<|end_header_id|>\n", - "You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. \n", + "SYSTEM_PROMPT = \"\"\"You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. \n", "Below is an instruction that describes a task, paired with an input that provides further context. 
\n", "Write a response that appropriately completes the request.\n", - "Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.\n", - "<|eot_id|><|start_header_id|>user<|end_header_id|>\n", - "{{question}}<|eot_id|>\n", - "<|start_header_id|>assistant<|end_header_id|>\n", - "{{complex_cot}}\n", + "Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.\"\"\"\n", "\n", - "{{answer}}\n", - "<|eot_id|>\n", - "\"\"\"\n", "\n", "# template dataset to add prompt to each sample\n", - "def template_dataset(sample):\n", - " sample[\"text\"] = prompt_template.format(question=sample[\"Question\"],\n", - " complex_cot=sample[\"Complex_CoT\"],\n", - " answer=sample[\"Response\"])\n", + "def convert_to_messages(sample, system_prompt=\"\"):\n", + " \n", + " messages = [\n", + " {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n", + " {\"role\": \"user\", \"content\": sample[\"Question\"]},\n", + " {\"role\": \"assistant\", \"content\": f\"{sample[\"Complex_CoT\"]}\\n\\n{sample[\"Response\"]}\"}\n", + " ]\n", + "\n", + " sample[\"messages\"] = messages\n", + " \n", " return sample" ] }, @@ -251,19 +255,14 @@ }, "outputs": [], "source": [ - "from datasets import Dataset, DatasetDict\n", "from random import randint\n", "\n", - "train_dataset = Dataset.from_pandas(train)\n", - "test_dataset = Dataset.from_pandas(test)\n", + "train_dataset = train_dataset.map(convert_to_messages, remove_columns=list(full_dataset.features), fn_kwargs={\"system_prompt\": SYSTEM_PROMPT})\n", + "test_dataset = test_dataset.map(convert_to_messages, remove_columns=list(full_dataset.features), fn_kwargs={\"system_prompt\": SYSTEM_PROMPT})\n", "\n", - "dataset = DatasetDict({\"train\": train_dataset, \"test\": test_dataset})\n", - "\n", - "train_dataset = dataset[\"train\"].map(template_dataset, remove_columns=list(dataset[\"train\"].features))\n", - "\n", - "print(train_dataset[randint(0, len(dataset))][\"text\"])\n", - "\n", - "test_dataset = dataset[\"test\"].map(template_dataset, remove_columns=list(dataset[\"test\"].features))" + "#grab a sample from the training and test sets\n", + "print(f\"Train Sample:\\n{train_dataset[randint(0, len(train_dataset)-1)]}\\n\\n\")\n", + "print(f\"Test Sample:\\n{test_dataset[randint(0, len(test_dataset)-1)]}\\n\\n\")" ] }, { @@ -315,33 +314,18 @@ { "cell_type": "code", "execution_count": null, - "id": "db59858b-e895-4877-8c96-264e152c25cc", + "id": "f79dbcf4-7ff2-4a6d-9bf9-5c27832fd5b1", "metadata": {}, "outputs": [], "source": [ - "import matplotlib.pyplot as plt\n", + "from utils import plot_length_distribution\n", "\n", - "def plot_data_lengths(tokenized_train_dataset, tokenized_validation_dataset):\n", - " lengths1 = [len(x[\"text\"].split()) for x in tokenized_train_dataset]\n", - " lengths2 = [len(x[\"text\"].split()) for x in tokenized_validation_dataset]\n", - " lengths = lengths1 + lengths2\n", - " \n", - " plt.figure(figsize=(10,6))\n", - " plt.hist(lengths, bins=20, alpha=0.7, color=\"blue\")\n", - " plt.xlabel(\"prompt lengths\")\n", - " plt.ylabel(\"Frequency\")\n", - " plt.title(\"Distribution of lengths of input_ids\")\n", - " plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "26cb0f5d-85df-4c18-a3a7-a62ecb010c3c", - "metadata": {}, - "outputs": [], - "source": [ - "plot_data_lengths(train_dataset, test_dataset)" + "plot_length_distribution(\n", + " 
train_dataset=train_dataset,\n", + " validation_dataset=test_dataset,\n", + " bins=20,\n", + " figsize=(10, 6)\n", + ")" ] }, { @@ -367,7 +351,7 @@ "source": [ "We are now ready to fine-tune our model. We will use the [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) from transfomers to fine-tune our model. We prepared a script [train.py](./scripts/train.py) which will loads the dataset from disk, prepare the model, tokenizer and start the training.\n", "\n", - "For configuration we use `TrlParser`, that allows us to provide hyperparameters in a `yaml` file. This yaml will be uploaded and provided to Amazon SageMaker similar to our datasets. Below is the config file for fine-tuning the model on `ml.g5.12xlarge`. We are saving the config file as `args.yaml` and upload it to S3." + "For configuration we use `TrlParser`, that allows us to provide hyperparameters in a `yaml` file. This yaml will be uploaded and provided to Amazon SageMaker similar to our datasets. Below is the config file for fine-tuning the model on `ml.g5.2xlarge`. We are saving the config file as `args.yaml` and upload it to S3." ] }, { @@ -377,7 +361,7 @@ "metadata": {}, "outputs": [], "source": [ - "model_id = \"deepseek-ai/DeepSeek-R1-Distill-Llama-8B\"\n", + "model_id = \"Qwen/Qwen3-4B-Instruct-2507\"\n", "model_id_filesafe = model_id.replace(\"/\",\"_\")\n", "\n", "use_local_model = True #set to false for the training job to download from HF, otherwise True will download locally" @@ -527,7 +511,7 @@ "metadata": {}, "outputs": [], "source": [ - "instance_type = \"ml.p3.2xlarge\" # Override the instance type if you want to get a different container version\n", + "instance_type = \"ml.g5.2xlarge\" # Override the instance type if you want to get a different container version\n", "\n", "instance_type" ] @@ -584,7 +568,7 @@ "compute_configs = Compute(\n", " instance_type=instance_type,\n", " instance_count=1,\n", - " keep_alive_period_in_seconds=0,\n", + " keep_alive_period_in_seconds=3600,\n", " volume_size_in_gb=50\n", ")\n", "\n", @@ -714,77 +698,11 @@ }, "outputs": [], "source": [ - "job_prefix = f\"train-{model_id.split('/')[-1].replace('.', '-')}-sft-script\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7ed118e7-1c80-4392-8ea5-147b63fc2f03", - "metadata": {}, - "outputs": [], - "source": [ - "def get_last_job_name(job_name_prefix):\n", - " sagemaker_client = boto3.client('sagemaker')\n", - "\n", - " matching_jobs = []\n", - " next_token = None\n", - "\n", - " while True:\n", - " # Prepare the search parameters\n", - " search_params = {\n", - " 'Resource': 'TrainingJob',\n", - " 'SearchExpression': {\n", - " 'Filters': [\n", - " {\n", - " 'Name': 'TrainingJobName',\n", - " 'Operator': 'Contains',\n", - " 'Value': job_name_prefix\n", - " },\n", - " {\n", - " 'Name': 'TrainingJobStatus',\n", - " 'Operator': 'Equals',\n", - " 'Value': \"Completed\"\n", - " }\n", - " ]\n", - " },\n", - " 'SortBy': 'CreationTime',\n", - " 'SortOrder': 'Descending',\n", - " 'MaxResults': 100\n", - " }\n", - "\n", - " # Add NextToken if we have one\n", - " if next_token:\n", - " search_params['NextToken'] = next_token\n", - "\n", - " # Make the search request\n", - " search_response = sagemaker_client.search(**search_params)\n", - "\n", - " # Filter and add matching jobs\n", - " matching_jobs.extend([\n", - " job['TrainingJob']['TrainingJobName'] \n", - " for job in search_response['Results']\n", - " if job['TrainingJob']['TrainingJobName'].startswith(job_name_prefix)\n", - " ])\n", - "\n", - 
" # Check if we have more results to fetch\n", - " next_token = search_response.get('NextToken')\n", - " if not next_token or matching_jobs: # Stop if we found at least one match or no more results\n", - " break\n", - "\n", - " if not matching_jobs:\n", - " raise ValueError(f\"No completed training jobs found starting with prefix '{job_name_prefix}'\")\n", - "\n", - " return matching_jobs[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18f4e9bd-de61-4806-b314-6bcf988a2c86", - "metadata": {}, - "outputs": [], - "source": [ + "import sys\n", + "from utils import get_last_job_name\n", + "\n", + "job_prefix = f\"train-{model_id.split('/')[-1].replace('.', '-')}-sft-script\"\n", + "\n", "job_name = get_last_job_name(job_prefix)\n", "\n", "job_name" @@ -832,13 +750,15 @@ "metadata": {}, "outputs": [], "source": [ - "image_uri = sagemaker.image_uris.retrieve(\n", - " framework=\"djl-lmi\",\n", - " region=sagemaker_session.boto_session.region_name,\n", - " version=\"latest\"\n", - ")\n", + "# commenting until LMI 0.33.0 is available via SageMaker SDK\n", + "# inference_image_uri = sagemaker.image_uris.retrieve(\n", + "# framework=\"djl-lmi\", \n", + "# region=session.boto_session.region_name, \n", + "# version=\"0.33.0\"\n", + "# )\n", "\n", - "image_uri" + "inference_image_uri = f\"763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.33.0-lmi15.0.0-cu128\"\n", + "print(f\"using image to host: {inference_image_uri}\")" ] }, { @@ -866,7 +786,7 @@ " model_data=f\"s3://{bucket_name}/{job_prefix}/{job_name}/output/model.tar.gz\"\n", "\n", "model = Model(\n", - " image_uri=image_uri,\n", + " image_uri=inference_image_uri,\n", " model_data=model_data,\n", " role=sagemaker.get_execution_role(),\n", " env={\n", @@ -890,7 +810,12 @@ "metadata": {}, "outputs": [], "source": [ - "endpoint_name = f\"{model_id.split('/')[-1].replace('.', '-')}-sft-djl\"" + "from sagemaker.utils import name_from_base\n", + "\n", + "endpoint_name = f\"{model_id.split('/')[-1].replace('.', '-')}-sft\"\n", + "TUNED_ENDPOINT_NAME = name_from_base(endpoint_name)\n", + "\n", + "TUNED_ENDPOINT_NAME" ] }, { @@ -911,7 +836,7 @@ "outputs": [], "source": [ "predictor = model.deploy(\n", - " endpoint_name=endpoint_name,\n", + " endpoint_name=TUNED_ENDPOINT_NAME,\n", " initial_instance_count=instance_count,\n", " instance_type=instance_type,\n", " container_startup_health_check_timeout=health_check_timeout,\n", @@ -927,16 +852,6 @@ "#### Predict" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "a61094a1-1fa9-495f-8343-aa27d9c4ba2f", - "metadata": {}, - "outputs": [], - "source": [ - "endpoint_name = f\"{model_id.split('/')[-1].replace('.', '-')}-sft-djl\"" - ] - }, { "cell_type": "code", "execution_count": null, @@ -945,7 +860,7 @@ "outputs": [], "source": [ "predictor = sagemaker.Predictor(\n", - " endpoint_name=endpoint_name,\n", + " endpoint_name=TUNED_ENDPOINT_NAME,\n", " sagemaker_session=sagemaker_session,\n", " serializer=sagemaker.serializers.JSONSerializer(),\n", " deserializer=sagemaker.deserializers.JSONDeserializer(),\n", @@ -955,63 +870,69 @@ { "cell_type": "code", "execution_count": null, - "id": "1d5b03ad-a4aa-4107-be89-5d1160cd2d01", + "id": "57cdcea8-ecea-4a20-b178-a4e13feb24a3", "metadata": {}, "outputs": [], "source": [ - "base_prompt = f\"\"\"\n", - "<|begin_of_text|>\n", - "<|start_header_id|>system<|end_header_id|>\n", - "You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. 
\n", + "SYSTEM_PROMPT = f\"\"\"You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. \n", "Below is an instruction that describes a task, paired with an input that provides further context. \n", "Write a response that appropriately completes the request.\n", - "Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.\n", - "<|eot_id|><|start_header_id|>user<|end_header_id|>\n", - "{{question}}<|eot_id|>\n", - "<|start_header_id|>assistant<|end_header_id|>\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a19bb461-4995-465a-a5ae-0c13c067180d", - "metadata": {}, - "outputs": [], - "source": [ - "prompt = base_prompt.format(\n", - " question=\"A 3-week-old child has been diagnosed with late onset perinatal meningitis, and the CSF culture shows gram-positive bacilli. What characteristic of this bacterium can specifically differentiate it from other bacterial agents?\"\n", - ")\n", + "Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.\"\"\"\n", + "\n", + "USER_PROMPT = \"A 3-week-old child has been diagnosed with late onset perinatal meningitis, and the CSF culture shows gram-positive bacilli. What characteristic of this bacterium can specifically differentiate it from other bacterial agents?\"\n", "\n", - "print(prompt)" + "messages = [\n", + " {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n", + " {\"role\": \"user\", \"content\": USER_PROMPT},\n", + "]\n", + "\n", + "messages" ] }, { "cell_type": "code", "execution_count": null, - "id": "1586b889-6554-4d75-b295-e7dc99673cf0", + "id": "56c8adba-20ff-4a85-9446-0adf0f14157b", "metadata": {}, "outputs": [], "source": [ "response = predictor.predict({\n", - "\t\"inputs\": prompt,\n", + "\t\"messages\": messages,\n", " \"parameters\": {\n", " \"temperature\": 0.2,\n", " \"top_p\": 0.9,\n", " \"return_full_text\": False,\n", - " \"max_new_tokens\": 1024,\n", - " \"stop\": ['<|eot_id|>']\n", + " \"max_new_tokens\": 1024\n", " }\n", "})\n", "\n", - "response = response[\"generated_text\"].split(\"<|eot_id|>\")[0]\n", + "response[\"choices\"][0][\"message\"][\"content\"]" + ] + }, + { + "cell_type": "markdown", + "id": "d9250763-9b18-4601-b083-62bd3cd34724", + "metadata": {}, + "source": [ + "### Store variables\n", "\n", - "response" + "Save the endpoint name for use later" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed5e25cf-6421-4b04-9329-5bc53a2f24e8", + "metadata": {}, + "outputs": [], + "source": [ + "%store TUNED_ENDPOINT_NAME" ] }, { "cell_type": "code", "execution_count": null, - "id": "bb550225-a5b8-4695-bd89-f785ba547f90", + "id": "c10e2414-bd15-4555-b1a3-9a4d8c31f067", "metadata": {}, "outputs": [], "source": [] diff --git a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_02_customize_foundation_model/scripts/requirements.txt b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_02_customize_foundation_model/scripts/requirements.txt index 06e0d14..6d003dd 100644 --- a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_02_customize_foundation_model/scripts/requirements.txt +++ b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_02_customize_foundation_model/scripts/requirements.txt @@ -1,20 +1,20 @@ -transformers==4.50.2 +transformers==4.52.2 peft==0.14.0 accelerate==1.3.0 bitsandbytes==0.45.1 datasets==3.2.0 evaluate==0.4.3 
huggingface_hub[hf_transfer]==0.33.4 -mlflow +mlflow==2.22.2 safetensors>=0.5.2 -sagemaker==2.244.0 +sagemaker==2.252.0 sagemaker-mlflow==0.1.0 sentencepiece==0.2.0 scikit-learn==1.6.1 tokenizers>=0.21.0 -trl==0.9.6 -psutil -py7zr -pynvml -xtarfile -rouge-score \ No newline at end of file +trl==0.18.0 +psutil==7.1.0 +py7zr==1.0.0 +pynvml==13.0.1 +xtarfile==0.2.1 +rouge-score==0.1.2 \ No newline at end of file diff --git a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_02_customize_foundation_model/scripts/train.py b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_02_customize_foundation_model/scripts/train.py index 8f1fcb9..120d584 100644 --- a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_02_customize_foundation_model/scripts/train.py +++ b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_02_customize_foundation_model/scripts/train.py @@ -1,27 +1,21 @@ import os import datetime -from typing import Dict, Optional, Tuple +from typing import Dict, Optional from dataclasses import dataclass, field -from functools import partial -from itertools import chain from accelerate import Accelerator -import bitsandbytes as bnb from huggingface_hub import snapshot_download from datasets import load_dataset import mlflow -from mlflow.models import infer_signature import torch -import transformers from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, set_seed -from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model, prepare_model_for_kbit_training +from peft import AutoPeftModelForCausalLM, LoraConfig, prepare_model_for_kbit_training -from trl.commands.cli_utils import TrlParser -from trl import SFTTrainer +from trl import SFTTrainer, TrlParser from sagemaker.s3 import S3Downloader import subprocess @@ -110,7 +104,7 @@ def download_model(model_name): if model_name.startswith("s3://"): print(f"Downloading model from S3: {model_name}") - subprocess.run(['aws', 's3', 'cp', model_name, destination, '--recursive']) + subprocess.run(['aws', 's3', 'cp', model_name, destination, '--recursive', '--quiet']) else: print(f"Downloading model from HF: {model_name}") snapshot_download(repo_id=model_name, local_dir=destination) @@ -118,35 +112,6 @@ def download_model(model_name): print(f"Model {model_name} downloaded under {destination}") -def group_texts(examples, block_size=2048): - """ - Groups a list of tokenized text examples into fixed-size blocks for language model training. - - Args: - examples (dict): A dictionary where keys are feature names (e.g., "input_ids") and values - are lists of tokenized sequences. - block_size (int, optional): The size of each chunk. Defaults to 2048. - - Returns: - dict: A dictionary containing the grouped chunks for each feature. An additional "labels" key - is included, which is a copy of the "input_ids" key. - """ - # Concatenate all texts. - concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} - total_length = len(concatenated_examples[list(examples.keys())[0]]) - # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can - # customize this part to your needs. - if total_length >= block_size: - total_length = (total_length // block_size) * block_size - # Split by chunks of max_len. 
- result = { - k: [t[i : i + block_size] for i in range(0, total_length, block_size)] - for k, t in concatenated_examples.items() - } - result["labels"] = result["input_ids"].copy() - return result - - def set_custom_env(env_vars: Dict[str, str]) -> None: """ Set custom environment variables. @@ -176,8 +141,26 @@ def set_custom_env(env_vars: Dict[str, str]) -> None: for key, value in env_vars.items(): print(f" {key}: {value}") +def load_data(training_data_location, test_data_location): + # Load datasets + train_ds = load_dataset( + "json", + data_files=os.path.join(training_data_location, "dataset.json"), + split="train" + ) -def train(script_args, training_args, train_ds, test_ds): + if script_args.test_dataset_path: + test_ds = load_dataset( + "json", + data_files=os.path.join(test_data_location, "dataset.json"), + split="train" + ) + else: + test_ds = None + + return train_ds, test_ds + +def train(script_args, training_args): set_seed(training_args.seed) mlflow_enabled = ( @@ -209,19 +192,21 @@ def train(script_args, training_args, train_ds, test_ds): # Set Tokenizer pad Token tokenizer.pad_token = tokenizer.eos_token - # tokenize and chunk dataset - lm_train_dataset = train_ds.map( - lambda sample: tokenizer(sample["text"]), remove_columns=list(train_ds.features) - ) + # # tokenize and chunk dataset + # lm_train_dataset = train_ds.map( + # lambda sample: tokenizer(sample["text"]), remove_columns=list(train_ds.features) + # ) - if test_ds is not None: - lm_test_dataset = test_ds.map( - lambda sample: tokenizer(sample["text"]), remove_columns=list(train_ds.features) - ) + # if test_ds is not None: + # lm_test_dataset = test_ds.map( + # lambda sample: tokenizer(sample["text"]), remove_columns=list(train_ds.features) + # ) - print(f"Total number of test samples: {len(lm_test_dataset)}") - else: - lm_test_dataset = None + # print(f"Total number of test samples: {len(lm_test_dataset)}") + # else: + # lm_test_dataset = None + + train_ds, test_ds = load_data(script_args.train_dataset_path, script_args.test_dataset_path) accelerator.wait_for_everyone() @@ -283,7 +268,7 @@ def train(script_args, training_args, train_ds, test_ds): ) if training_args.fsdp is None and training_args.fsdp_config is None: - model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing) + model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing) if training_args.gradient_checkpointing: model.gradient_checkpointing_enable() @@ -291,7 +276,7 @@ def train(script_args, training_args, train_ds, test_ds): if training_args.gradient_checkpointing: model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False}) - config = LoraConfig( + peft_config = LoraConfig( r=script_args.lora_r, lora_alpha=script_args.lora_alpha, target_modules="all-linear", @@ -300,33 +285,22 @@ def train(script_args, training_args, train_ds, test_ds): task_type="CAUSAL_LM" ) - model = get_peft_model(model, config) - print(f"max_seq_length: {script_args.max_seq_length}") + + print("Disabling checkpointing and setting up logging") + training_args.save_strategy="no" + training_args.logging_strategy="steps" + training_args.logging_steps=1 + training_args.log_on_each_node=False + training_args.ddp_find_unused_parameters=False trainer = SFTTrainer( model=model, - train_dataset=lm_train_dataset, - eval_dataset=lm_test_dataset if lm_test_dataset is not None else None, - max_seq_length=script_args.max_seq_length, - 
args=transformers.TrainingArguments( - per_device_train_batch_size=training_args.per_device_train_batch_size, - per_device_eval_batch_size=training_args.per_device_eval_batch_size, - gradient_accumulation_steps=training_args.gradient_accumulation_steps, - logging_strategy="steps", - logging_steps=1, - log_on_each_node=False, - num_train_epochs=training_args.num_train_epochs, - learning_rate=training_args.learning_rate, - bf16=training_args.bf16, - fp16=training_args.fp16, - ddp_find_unused_parameters=False, - save_strategy="no", - output_dir="outputs", - **trainer_configs - ), - callbacks=None, - data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False), + args=training_args, + train_dataset=train_ds, + eval_dataset=test_ds if test_ds is not None else None, + processing_class=tokenizer, + peft_config=peft_config ) if trainer.accelerator.is_main_process: @@ -334,15 +308,15 @@ def train(script_args, training_args, train_ds, test_ds): if mlflow_enabled: print("MLflow tracking under ", script_args.mlflow_experiment_name) - with mlflow.start_run(run_name=os.environ.get("MLFLOW_RUN_NAME", None)) as run: - train_dataset_mlflow = mlflow.data.from_pandas(train_ds.to_pandas(), name="train_dataset") - mlflow.log_input(train_dataset_mlflow, context="train") + mlflow.start_run(run_name=os.environ.get("MLFLOW_RUN_NAME", None)) + train_dataset_mlflow = mlflow.data.from_pandas(train_ds.to_pandas(), name="train_dataset") + mlflow.log_input(train_dataset_mlflow, context="train") - if test_ds is not None: - test_dataset_mlflow = mlflow.data.from_pandas(test_ds.to_pandas(), name="test_dataset") - mlflow.log_input(test_dataset_mlflow, context="test") + if test_ds is not None: + test_dataset_mlflow = mlflow.data.from_pandas(test_ds.to_pandas(), name="test_dataset") + mlflow.log_input(test_dataset_mlflow, context="test") - trainer.train() + trainer.train() else: trainer.train() @@ -355,7 +329,7 @@ def train(script_args, training_args, train_ds, test_ds): # merge adapter weights with base model and save # save int 4 model - trainer.model.save_pretrained(output_dir, safe_serialization=False) + trainer.save_model(output_dir) if accelerator.is_main_process: # clear memory @@ -381,35 +355,16 @@ def train(script_args, training_args, train_ds, test_ds): print("saving merged model...") model.save_pretrained( - training_args.output_dir, safe_serialization=True, max_shard_size="2GB" + training_args.output_dir, + safe_serialization=True ) else: print(f"merge adapter weights: {script_args.merge_weights}") - trainer.model.save_pretrained(training_args.output_dir, safe_serialization=True) + trainer.save_model(training_args.output_dir) if accelerator.is_main_process: tokenizer.save_pretrained(training_args.output_dir) - # if mlflow_enabled: - # # Model registration in MLFlow - # print("MLflow model registration under ", script_args.mlflow_experiment_name) - - # params = { - # "top_p": 0.9, - # "temperature": 0.2, - # "max_new_tokens": 2048, - # } - # signature = infer_signature("inputs", "generated_text", params=params) - - # mlflow.transformers.log_model( - # transformers_model={"model": model, "tokenizer": tokenizer}, - # signature=signature, - # artifact_path="model", # This is a relative path to save model files within MLflow run - # model_config=params, - # task="text-generation", - # registered_model_name=f"model-{os.environ.get('MLFLOW_RUN_NAME', '').split('Fine-tuning-')[-1]}" - # ) - accelerator.wait_for_everyone() @@ -438,22 +393,5 @@ def train(script_args, training_args, train_ds, test_ds): 
set_custom_env({"MLFLOW_RUN_NAME": f"Fine-tuning-{formatted_datetime}"}) set_custom_env({"MLFLOW_EXPERIMENT_NAME": script_args.mlflow_experiment_name}) - - # Load datasets - train_ds = load_dataset( - "json", - data_files=os.path.join(script_args.train_dataset_path, "dataset.json"), - split="train" - ) - - if script_args.test_dataset_path: - test_ds = load_dataset( - "json", - data_files=os.path.join(script_args.test_dataset_path, "dataset.json"), - split="train" - ) - else: - test_ds = None - # launch training - train(script_args, training_args, train_ds, test_ds) + train(script_args, training_args) diff --git a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_02_customize_foundation_model/utils.py b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_02_customize_foundation_model/utils.py new file mode 100644 index 0000000..a64c9e9 --- /dev/null +++ b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_02_customize_foundation_model/utils.py @@ -0,0 +1,109 @@ +import matplotlib.pyplot as plt +from typing import List, Dict +import boto3 + +def calculate_message_lengths(dataset: List[Dict]) -> List[int]: + """ + Calculate the length of content/text for each element in the dataset. + + Args: + dataset: List of dictionaries containing messages or text + + Returns: + List of word counts for each element + """ + try: + # First try to process as messages format + return [sum(len(msg["content"].split()) + for msg in element["messages"]) + for element in dataset] + except KeyError: + # Fallback to direct text/content format + key = "content" if "content" in dataset[0] else "text" + return [len(element[key].split()) for element in dataset] + +def plot_length_distribution(train_dataset: List[Dict], + validation_dataset: List[Dict], + bins: int = 20, + figsize: tuple = (10, 6)) -> None: + """ + Plot the distribution of text lengths from training and validation datasets. 
+ + Args: + train_dataset: Training dataset + validation_dataset: Validation dataset + bins: Number of histogram bins + figsize: Figure size as (width, height) + """ + # Calculate lengths for both datasets + train_lengths = calculate_message_lengths(train_dataset) + val_lengths = calculate_message_lengths(validation_dataset) + combined_lengths = train_lengths + val_lengths + + # Create and configure the plot + plt.figure(figsize=figsize) + plt.hist(combined_lengths, + bins=bins, + alpha=0.7, + color="blue") + + # Set labels and title + plt.xlabel("Prompt Lengths (words)") + plt.ylabel("Frequency") + plt.title("Distribution of Input Lengths") + + plt.show() + + +def get_last_job_name(job_name_prefix): + sagemaker_client = boto3.client('sagemaker') + + matching_jobs = [] + next_token = None + + while True: + # Prepare the search parameters + search_params = { + 'Resource': 'TrainingJob', + 'SearchExpression': { + 'Filters': [ + { + 'Name': 'TrainingJobName', + 'Operator': 'Contains', + 'Value': job_name_prefix + }, + { + 'Name': 'TrainingJobStatus', + 'Operator': 'Equals', + 'Value': "Completed" + } + ] + }, + 'SortBy': 'CreationTime', + 'SortOrder': 'Descending', + 'MaxResults': 100 + } + + # Add NextToken if we have one + if next_token: + search_params['NextToken'] = next_token + + # Make the search request + search_response = sagemaker_client.search(**search_params) + + # Filter and add matching jobs + matching_jobs.extend([ + job['TrainingJob']['TrainingJobName'] + for job in search_response['Results'] + if job['TrainingJob']['TrainingJobName'].startswith(job_name_prefix) + ]) + + # Check if we have more results to fetch + next_token = search_response.get('NextToken') + if not next_token or matching_jobs: # Stop if we found at least one match or no more results + break + + if not matching_jobs: + raise ValueError(f"No completed training jobs found starting with prefix '{job_name_prefix}'") + + return matching_jobs[0] \ No newline at end of file diff --git a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_03_foundation_model_evaluation/03.01_foundation_model_evaluation_lighteval.ipynb b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_03_foundation_model_evaluation/03.01_foundation_model_evaluation_lighteval.ipynb index d26d622..a3bfd8b 100644 --- a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_03_foundation_model_evaluation/03.01_foundation_model_evaluation_lighteval.ipynb +++ b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_03_foundation_model_evaluation/03.01_foundation_model_evaluation_lighteval.ipynb @@ -8,7 +8,15 @@ "# Comparing Model Performance after Fine-Tuning\n", "In this example, we will take the pre-existing SageMaker endpoints that you deployed in previous exercises and use them to generate data that can be leveraged for quality comparison. This data can be used to take a quantitative approach to judge the efficacy of fine-tuning your models.\n", "\n", - "This example will run through samples of the medical-o1-reasoning dataset (paper here) on the Hugging Face data hub to generate summaries of earnings calls transcripts and use the [lighteval](https://huggingface.co/docs/lighteval/index) from Hugging Face for analysis on those summaries." + "This example will run through samples of the medical-o1-reasoning dataset (FreedomIntelligence/medical-o1-reasoning-SFT) on the Hugging Face data hub for medical Q&A and use the [lighteval](https://huggingface.co/docs/lighteval/index) from Hugging Face for analysis." 
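
Before generating anything, it can help to have an intuition for what ROUGE actually measures: word n-gram (ROUGE-1/2) and longest-common-subsequence (ROUGE-L) overlap between a model prediction and a reference answer. The sketch below is purely illustrative and is not part of this notebook's code path — it assumes the standalone `rouge-score` package is available (it is pinned in the task 2 training requirements, but not in this task's requirements file), rather than the `lighteval` wrappers used later on.

```python
# Illustrative only: ROUGE scores n-gram (ROUGE-1/2) and longest-common-subsequence
# (ROUGE-L) overlap between a prediction and a reference answer.
# Assumes the standalone rouge-score package is installed; the notebook itself
# uses lighteval's ROUGE wrapper instead.
from rouge_score import rouge_scorer

reference = (
    "Listeria monocytogenes shows characteristic tumbling motility at room "
    "temperature, which differentiates it from other gram-positive bacilli."
)
prediction = (
    "The organism is Listeria monocytogenes; its tumbling motility at 20-25 degrees "
    "Celsius helps distinguish it from other gram-positive rods."
)

scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
scores = scorer.score(reference, prediction)

for name, score in scores.items():
    # Each Score holds precision, recall and F-measure for that ROUGE variant.
    print(f"{name}: P={score.precision:.3f} R={score.recall:.3f} F1={score.fmeasure:.3f}")
```

A higher F-measure simply means the prediction shares more wording with the reference; the comparison later in this notebook aggregates these scores across the test samples for both the base and fine-tuned endpoints.
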
+ ] + }, + { + "cell_type": "markdown", + "id": "3d5d5ff2-dda1-450e-a098-976986747f62", + "metadata": {}, + "source": [ + "## Prerequisites" ] }, { @@ -20,8 +28,7 @@ }, "outputs": [], "source": [ - "# Install the required packages and restart the kernel\n", - "%pip install -Uq datasets==3.5.0 pandas==2.2.3 matplotlib==3.10.1 numpy==1.26.4 boto3==1.37.1 tqdm==4.67.1 lighteval[math]==0.9.2" + "%pip install -r ./scripts/requirements.txt" ] }, { @@ -55,14 +62,12 @@ "import json\n", "import time\n", "import boto3\n", + "import sagemaker\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from tqdm.notebook import tqdm\n", "from datasets import load_dataset\n", - "import torch\n", - "import torchvision\n", - "import transformers\n", "\n", "# Import LightEval metrics\n", "from lighteval.metrics.metrics_sample import ROUGE, Doc" @@ -73,28 +78,37 @@ "id": "5b1341fb-a37e-4f9f-9d3f-32233d58427f", "metadata": {}, "source": [ - "#### Update the base model and fine-tuned model endpoints with the names of the endpoints you previously created. \n", - "You can find these in **SageMaker Studio > Deployments > Endpoints**" + "#### Fetch the saved endpoint names from previous sections, or set them manually by uncommenting the code below. " ] }, { "cell_type": "code", "execution_count": null, - "id": "821e1176-f2af-4e7f-9273-48b2d67e22a7", + "id": "64d1557b-06dc-4b9b-8a4e-bd543f37a868", "metadata": {}, "outputs": [], "source": [ - "# Initialize the SageMaker client\n", - "sm_client = boto3.client('sagemaker-runtime')\n", + "%store -r BASE_ENDPOINT_NAME\n", + "%store -r TUNED_ENDPOINT_NAME\n", "\n", - "# Configure the SageMaker endpoint names\n", - "BASE_MODEL_ENDPOINT = \"DeepSeek-R1-Distill-Llama-8B-endpoint\" # Update with Base model endpoint name\n", - "FINETUNED_MODEL_ENDPOINT = \"DeepSeek-R1-Distill-Llama-8B-sft-djl\" # Update with Fine-tuned model endpoint name\n", + "#BASE_ENDPOINT_NAME = \"\"\n", + "#TUNED_ENDPOINT_NAME = \"\"\n", "\n", + "print(f\"Base Endpoint: {BASE_ENDPOINT_NAME}\")\n", + "print(f\"Tuned Endpoint: {TUNED_ENDPOINT_NAME}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "821e1176-f2af-4e7f-9273-48b2d67e22a7", + "metadata": {}, + "outputs": [], + "source": [ "# Define the model to evaluate\n", "model_to_evaluate = {\n", - " \"name\": \"Fine-tuned DeepSeek-R1-Distill-Llama-8B\", \n", - " \"endpoint\": FINETUNED_MODEL_ENDPOINT\n", + " \"name\": \"Fine-tuned Model\", \n", + " \"endpoint\": TUNED_ENDPOINT_NAME\n", "}" ] }, @@ -132,41 +146,39 @@ "print(\"Response:\\n\", sample[\"Response\"], \"\\n\\n====\\n\")" ] }, - { - "cell_type": "markdown", - "id": "9a1a5e2c-39e9-4d51-a394-b666ffde44f2", - "metadata": {}, - "source": [ - "#### Next, we will create functions to interact with the SageMaker endpoints, define metrics we want to calculate (ROUGE), and define how to evaluate the models with the medical-o1-reasoning dataset. " - ] - }, { "cell_type": "code", "execution_count": null, - "id": "957d8b1e-0761-4f00-ae6d-dc0c9530e46b", + "id": "4f8dc243-5b54-454e-ab78-47455591a166", "metadata": {}, "outputs": [], "source": [ - "#This function allows you to interact with a deployed SageMaker endpoint to get predictions from the DeepSeek model\n", - "def invoke_sagemaker_endpoint(payload, endpoint_name):\n", - " \"\"\"\n", - " Invoke a SageMaker endpoint with the given payload.\n", + "SYSTEM_PROMPT = \"\"\"You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. 
\n", + "Below is an instruction that describes a task, paired with an input that provides further context. \n", + "Write a response that appropriately completes the request.\n", + "Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.\"\"\"\n", + "\n", + "\n", + "# template dataset to add prompt to each sample\n", + "def convert_to_messages(sample, system_prompt=\"\", include_answer=True):\n", " \n", - " Args:\n", - " payload (dict): The input data to send to the endpoint\n", - " endpoint_name (str): The name of the SageMaker endpoint\n", - " \n", - " Returns:\n", - " dict: The response from the endpoint\n", - " \"\"\"\n", - " response = sm_client.invoke_endpoint(\n", - " EndpointName=endpoint_name,\n", - " ContentType='application/json',\n", - " Body=json.dumps(payload)\n", - " )\n", + " messages = [\n", + " {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n", + " {\"role\": \"user\", \"content\": sample[\"Question\"]},\n", + " ]\n", + "\n", + " if include_answer:\n", + " messages.append({\"role\": \"assistant\", \"content\": f\"{sample[\"Complex_CoT\"]}\\n\\n{sample[\"Response\"]}\"})\n", " \n", - " response_body = response['Body'].read().decode('utf-8')\n", - " return json.loads(response_body)" + " return messages" + ] + }, + { + "cell_type": "markdown", + "id": "9a1a5e2c-39e9-4d51-a394-b666ffde44f2", + "metadata": {}, + "source": [ + "#### Next, we will create functions to interact with the SageMaker endpoints, define metrics we want to calculate (ROUGE), and define how to evaluate the models with the medical-o1-reasoning dataset. " ] }, { @@ -231,7 +243,7 @@ "metadata": {}, "outputs": [], "source": [ - "def generate_summaries_with_model(endpoint_name, dataset):\n", + "def generate_summaries_with_model(predictor, dataset):\n", " \"\"\"\n", " Generate summaries using a model deployed on SageMaker.\n", " \n", @@ -245,23 +257,12 @@ " predictions = []\n", " \n", " for example in tqdm(dataset, desc=\"Generating Responses\"):\n", - " question = example[\"Question\"]\n", - " \n", - " # Prepare the prompt for the model\n", - " prompt = f\"\"\"\n", - " <|begin_of_text|>\n", - " <|start_header_id|>system<|end_header_id|>\n", - " You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. \n", - " Below is an instruction that describes a task, paired with an input that provides further context. 
\n", - " Write a response that appropriately completes the request.\n", - " Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.\n", - " <|eot_id|><|start_header_id|>user<|end_header_id|>\n", - " {question}<|eot_id|>\n", - " <|start_header_id|>assistant<|end_header_id|>\"\"\"\n", + "\n", + " messages = convert_to_messages(example, system_prompt=SYSTEM_PROMPT, include_answer=False)\n", " \n", " # Payload for SageMaker endpoint\n", " payload = {\n", - " \"inputs\": prompt,\n", + " \"messages\": messages,\n", " \"parameters\": {\n", " \"max_new_tokens\": 512,\n", " \"top_p\": 0.9,\n", @@ -272,18 +273,18 @@ " \n", " # Call the model endpoint\n", " try:\n", - " response = invoke_sagemaker_endpoint(payload, endpoint_name)\n", + " response = predictor.predict(payload)\n", " \n", " # Extract the generated text\n", " if isinstance(response, list):\n", - " prediction = response[0].get('generated_text', '').strip()\n", + " prediction = response[\"choices\"][0][\"message\"][\"content\"].strip()\n", " elif isinstance(response, dict):\n", - " prediction = response.get('generated_text', '').strip()\n", + " prediction = response[\"choices\"][0][\"message\"][\"content\"].strip()\n", " else:\n", " prediction = str(response).strip\n", "\n", "\n", - " prediction = prediction.split(\"<|eot_id|>\")[0]\n", + " #prediction = prediction.split(\"<|eot_id|>\")[0]\n", " # Clean up the generated text\n", " #if \"Summary:\" in prediction:\n", " # prediction = prediction.split(\"Summary:\", 1)[1].strip()\n", @@ -317,6 +318,13 @@ " \"\"\"\n", " model_name = model_config[\"name\"]\n", " endpoint_name = model_config[\"endpoint\"]\n", + "\n", + " predictor = sagemaker.Predictor(\n", + " endpoint_name=endpoint_name,\n", + " sagemaker_session=sagemaker.Session(),\n", + " serializer=sagemaker.serializers.JSONSerializer(),\n", + " deserializer=sagemaker.deserializers.JSONDeserializer(),\n", + " )\n", " \n", " print(f\"\\nEvaluating model: {model_name} on endpoint: {endpoint_name}\")\n", " \n", @@ -325,7 +333,7 @@ " \n", " # Generate summaries\n", " print(\"\\nGenerating Responses...\")\n", - " predictions = generate_summaries_with_model(endpoint_name, dataset)\n", + " predictions = generate_summaries_with_model(predictor, dataset)\n", " \n", " # Calculate automated metrics using LightEval\n", " print(\"\\nCalculating evaluation metrics with LightEval...\")\n", @@ -355,7 +363,7 @@ "id": "080aa020-0aaf-438d-b0cb-dd503d248feb", "metadata": {}, "source": [ - "#### In this section, we evaluate the performance of both our base model (DeepSeek-R1-Distill-Llama-8B) and our fine-tuned model on the medical-o1-reasoning dataset using ROUGE metrics, which are standard for evaluating text summarization quality.\n", + "#### In this section, we evaluate the performance of both our base model (Qwen3-4B-Instruct-2507) and our fine-tuned model on the medical-o1-reasoning dataset using ROUGE metrics, which are standard for evaluating text summarization quality.\n", "\n", "The evaluation process:\n", "\n", @@ -382,8 +390,8 @@ "\n", "# Evaluate both models for comparison\n", "base_model_config = {\n", - " \"name\": \"Base DeepSeek-R1-Distill-Llama-8B\",\n", - " \"endpoint\": BASE_MODEL_ENDPOINT\n", + " \"name\": \"Base Model\",\n", + " \"endpoint\": BASE_ENDPOINT_NAME\n", "}\n", "\n", "# Evaluate base model\n", @@ -540,7 +548,7 @@ "source": [ "## Larger Training/Evaluation Results\n", "\n", - "If you were to train this model on 5000 samples and evaluate on 100 
test items, you would see the following results:\n", + "If you were to train **Qwen3-4B-Instruct-2507** on **5000** samples and evaluate on **100** test items (total training time 32 mins on an ml.g5.12xlarge instance), you would see the following results:\n", "\n", "![](./images/sft_5000_train_100_test_scores.png)\n", "\n", @@ -549,6 +557,14 @@ "![](images/sft_5000_train_100_test_compare.png)\n" ] }, + { + "cell_type": "markdown", + "id": "b8213dea-cfac-42c2-acca-1e9bcb32dd86", + "metadata": {}, + "source": [ + "## Detailed Comparison Between Models" + ] + }, { "cell_type": "code", "execution_count": null, @@ -557,7 +573,7 @@ "outputs": [], "source": [ "# Display example predictions from both models\n", - "num_examples = min(3, len(dataset))\n", + "num_examples = min(2, len(dataset))\n", "\n", "for i in range(num_examples):\n", " print(f\"\\nExample {i+1}:\")\n", @@ -581,6 +597,16 @@ " print(\"\\n\" + \"=\"*80)" ] }, + { + "cell_type": "markdown", + "id": "636dd59d-2ac8-4260-9f38-b84dc06360eb", + "metadata": {}, + "source": [ + "# Clean Up Endpoints\n", + "\n", + "Run the following code to clean up your base endpoint. It is no longer needed." + ] + }, { "cell_type": "code", "execution_count": null, @@ -591,7 +617,7 @@ "sagemaker_client = boto3.client('sagemaker')\n", "\n", "delete_base_response = sagemaker_client.delete_endpoint(\n", - " EndpointName=BASE_MODEL_ENDPOINT\n", + " EndpointName=BASE_ENDPOINT_NAME\n", ")\n", "\n", "print(delete_base_response)" @@ -605,7 +631,7 @@ "outputs": [], "source": [ "delete_basecfg_response = sagemaker_client.delete_endpoint_config(\n", - " EndpointConfigName=BASE_MODEL_ENDPOINT\n", + " EndpointConfigName=BASE_ENDPOINT_NAME\n", ")\n", "print(delete_basecfg_response)" ] @@ -613,7 +639,7 @@ { "cell_type": "code", "execution_count": null, - "id": "70d31ca0-4755-4511-a814-a0a409265fe1", + "id": "9bc5ebf7-d70c-4537-808a-9478a3eb88c0", "metadata": {}, "outputs": [], "source": [] diff --git a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_03_foundation_model_evaluation/images/sft_5000_train_100_test_bars.png b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_03_foundation_model_evaluation/images/sft_5000_train_100_test_bars.png index 22ceaa8..9b17bfd 100644 Binary files a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_03_foundation_model_evaluation/images/sft_5000_train_100_test_bars.png and b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_03_foundation_model_evaluation/images/sft_5000_train_100_test_bars.png differ diff --git a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_03_foundation_model_evaluation/images/sft_5000_train_100_test_compare.png b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_03_foundation_model_evaluation/images/sft_5000_train_100_test_compare.png index 1142dec..213a544 100644 Binary files a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_03_foundation_model_evaluation/images/sft_5000_train_100_test_compare.png and b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_03_foundation_model_evaluation/images/sft_5000_train_100_test_compare.png differ diff --git a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_03_foundation_model_evaluation/images/sft_5000_train_100_test_scores.png b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_03_foundation_model_evaluation/images/sft_5000_train_100_test_scores.png index 57aa945..14fc9a5 100644 Binary files 
a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_03_foundation_model_evaluation/images/sft_5000_train_100_test_scores.png and b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_03_foundation_model_evaluation/images/sft_5000_train_100_test_scores.png differ diff --git a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_03_foundation_model_evaluation/scripts/requirements.txt b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_03_foundation_model_evaluation/scripts/requirements.txt new file mode 100644 index 0000000..e219c91 --- /dev/null +++ b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_03_foundation_model_evaluation/scripts/requirements.txt @@ -0,0 +1,8 @@ +sagemaker==2.235.2 +datasets==4.1.1 +pandas==2.2.3 +matplotlib==3.10.6 +numpy==1.26.4 +boto3==1.40.47 +tqdm==4.67.1 +lighteval==0.10.0 \ No newline at end of file diff --git a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_04_responsible_ai/04.01_bedrock_guardrails_apply_guardrail_api.ipynb b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_04_responsible_ai/04.01_bedrock_guardrails_apply_guardrail_api.ipynb index ca65261..4412b40 100644 --- a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_04_responsible_ai/04.01_bedrock_guardrails_apply_guardrail_api.ipynb +++ b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_04_responsible_ai/04.01_bedrock_guardrails_apply_guardrail_api.ipynb @@ -5,7 +5,7 @@ "id": "f6bfca55-c771-4e26-a13b-3451f6bef06a", "metadata": {}, "source": [ - "# Applying Bedrock Guardrails to the DeepSeek R1 Distill Llama 8B Model" + "# Applying Bedrock Guardrails to the Qwen3-4B-Instruct Model" ] }, { @@ -41,6 +41,14 @@ "You can use the assessment results to design the experience on your generative AI application. Let's now walk through a code-sample" ] }, + { + "cell_type": "markdown", + "id": "bad4bb6c-9a91-4d44-9457-551293c20dd1", + "metadata": {}, + "source": [ + "## Prerequisites" + ] + }, { "cell_type": "code", "execution_count": null, @@ -50,8 +58,7 @@ }, "outputs": [], "source": [ - "#Start by installing the dependencies to ensure we have a recent version\n", - "%pip install -Uq boto3" + "%pip install -r ./scripts/requirements.txt" ] }, { @@ -73,6 +80,14 @@ "get_ipython().kernel.do_shutdown(True)" ] }, + { + "cell_type": "markdown", + "id": "e0db97c3-07e5-4219-9b78-6a60728437f2", + "metadata": {}, + "source": [ + "***" + ] + }, { "cell_type": "code", "execution_count": null, @@ -94,12 +109,16 @@ { "cell_type": "code", "execution_count": null, - "id": "ba78c7e2-d1dc-401e-92bc-7ab6c2f58ac1", + "id": "f2ced34f-91e4-4093-8b3e-1ce7a6600188", "metadata": {}, "outputs": [], "source": [ - "# Configure the SageMaker endpoint names\n", - "FINETUNED_MODEL_ENDPOINT = \"DeepSeek-R1-Distill-Llama-8B-sft-djl\" # Update with Fine-tuned model endpoint name" + "%store -r TUNED_ENDPOINT_NAME\n", + "\n", + "# set the endpoint name manually by uncommenting below\n", + "#TUNED_ENDPOINT_NAME = \"\" # Update with Fine-tuned model endpoint name\n", + "\n", + "print(f\"Tuned Endpoint: {TUNED_ENDPOINT_NAME}\")" ] }, { @@ -107,9 +126,9 @@ "id": "a3480e72-04ac-4caf-86bd-0fae2cb8fcc1", "metadata": {}, "source": [ - "### Important: Create a Guardrail First\n", + "## Create a guardrail\n", "\n", - "Before running the code to apply a guardrail, you need to create a guardrail in Amazon Bedrock. We will create a guardrail that blocks input prompts and output responses from the model providing fiduciary advice." 
+ "Before running the code to apply a guardrail, you need to create a guardrail in Amazon Bedrock. We will create a guardrail that blocks input prompts and output responses from the model providing medical advice and obfuscates PII data, in addition to blocking generally harmful content." ] }, { @@ -206,6 +225,14 @@ "print(f\"Version: {response['version']}\")" ] }, + { + "cell_type": "markdown", + "id": "6d0ac0bd-fe95-4d52-9261-70ca1bf13cdf", + "metadata": {}, + "source": [ + "Next, publish the draft of the guardrail so it can be used." + ] + }, { "cell_type": "code", "execution_count": null, @@ -224,8 +251,20 @@ "\n", "time.sleep(10)\n", "\n", - "guardrail_version=version_response['version']\n", - "guardrail_version" + "guardrail_version = version_response['version']\n", + "\n", + "print(f\"Guardrail published with version: {guardrail_version}\")" + ] + }, + { + "cell_type": "markdown", + "id": "3fa8e9d9-8036-4683-afcc-1d27ec240d57", + "metadata": {}, + "source": [ + "With a guardrail in place, you can now test its effectiveness. When processing input queries, no model is required and the `apply_guardrail` API can be invoked on the incoming request alone. The API will reply with an `action` field, showing whether the guardrail interviened which can be used to determine whether to send requests to the downstream LLM or to take some other action.\n", + "\n", + "### Example: Blocking Medical Advice\n", + "In this example, the input prompt requests guidance on cures for COVID-19. Since the guardrail being invoked is configured to flag anything asking for medical advice or non-verified medical content, this request is blocked." ] }, { @@ -246,16 +285,6 @@ " }\n", "]\n", "\n", - "# Here's an example of something that should pass\n", - "\n", - "#content = [\n", - " #{\n", - " # \"text\": {\n", - " # \"text\": \"What is the rate you offer for the AB503 Product?\"\n", - " # }\n", - " # }\n", - "#]\n", - "\n", "# Call the ApplyGuardrail API\n", "try:\n", " response = bedrock_runtime.apply_guardrail(\n", @@ -266,12 +295,12 @@ " )\n", " \n", " # Process the response\n", - " print(\"API Response:\")\n", - " print(json.dumps(response, indent=2))\n", + " #print(\"API Response:\")\n", + " #print(json.dumps(response, indent=2))\n", " \n", " # Check the action taken by the guardrail\n", " if response['action'] == 'GUARDRAIL_INTERVENED':\n", - " print(\"\\nGuardrail intervened. \\nOutput:\")\n", + " print(\"\\nGuardrail intervened. 
\\n\\nOutput:\")\n", " for output in response['outputs']:\n", " print(output['text'])\n", " else:\n", @@ -286,6 +315,37 @@ " print(\"No response available due to early exception.\")\n" ] }, + { + "cell_type": "markdown", + "id": "b9b944fb-0402-469d-b0c8-67afda4de22e", + "metadata": {}, + "source": [ + "### Example: Anonymization of PII in responses.\n", + "\n", + "The guardrail in this example is configured to look at a variety of PII related field and anonymize them:\n", + "- NAME\n", + "- EMAIL\n", + "- PHONE\n", + "- US_SOCIAL_SECURITY_NUMBER\n", + "- ADDRESS\n", + "- CA_HEALTH_NUMBER\n", + "- PASSWORD\n", + "- IP_ADDRESS\n", + "- CA_SOCIAL_INSURANCE_NUMBER\n", + "- CREDIT_DEBIT_CARD_NUMBER\n", + "- AGE\n", + "- US_BANK_ACCOUNT_NUMBER\n", + "\n", + "In the input content there are 3 entries:\n", + "- grounding source: ground truth context for the model to base its response on (simulated here)\n", + "- query: the user input query\n", + "- guard_content: the model output (simulated here)\n", + "\n", + "This guardrail is going to be applied on the model **output** in this example, which means it won't affect the inputs at all.\n", + "\n", + "A full list of all available types is available in the [Amazon Bedrock Documentation](https://docs.aws.amazon.com/bedrock/latest/userguide/guardrails-sensitive-filters.html)." + ] + }, { "cell_type": "code", "execution_count": null, @@ -328,12 +388,12 @@ " )\n", " \n", " # Process the response\n", - " print(\"API Response:\")\n", - " print(json.dumps(response, indent=2))\n", + " #print(\"API Response:\")\n", + " #print(json.dumps(response, indent=2))\n", " \n", " # Check the action taken by the guardrail\n", " if response['action'] == 'GUARDRAIL_INTERVENED':\n", - " print(\"\\nGuardrail intervened. Output:\")\n", + " print(\"\\nGuardrail intervened. \\n\\nOutput:\")\n", " for output in response['outputs']:\n", " print(output['text'])\n", " else:\n", @@ -374,21 +434,15 @@ "Let's walk through this with a code example that demonstrates this process" ] }, - { - "cell_type": "markdown", - "id": "da00340f-253e-451b-840e-ebd77165d740", - "metadata": {}, - "source": [ - "### For our examples today we will use a Self-Hosted SageMaker Model, but this could be any third-party model as well\n", - "\n", - "We will use the `DeepSeek-R1-Distill-Llama-8B` model that we deployed earlier on a SageMaker Endpoint. " - ] - }, { "cell_type": "markdown", "id": "540f2797-24ec-4361-bd09-20d31efb5509", "metadata": {}, "source": [ + "### These examples use SageMaker hosted model endpoint, but this could be any third-party model as well\n", + "\n", + "We will use the `Qwen3-4B-Instruct-2507` model that we deployed earlier on a SageMaker Endpoint. 
\n", + "\n", "### Incorporating the ApplyGuardrail API into our Self-Hosted Model\n", "\n", "---\n", @@ -448,8 +502,11 @@ "\n", " def generate_text(self, inputs: str, max_new_tokens: int = 256, temperature: float = 0.0) -> str:\n", " \"\"\"Generate text using the specified SageMaker endpoint.\"\"\"\n", + "\n", + " messages = [{\"role\": \"user\", \"content\": inputs}]\n", + " \n", " payload = {\n", - " \"inputs\": inputs,\n", + " \"messages\": messages,\n", " \"parameters\": {\n", " \"max_new_tokens\": max_new_tokens,\n", " \"temperature\": temperature,\n", @@ -458,7 +515,7 @@ " }\n", " \n", " response = self.predictor.predict(payload)\n", - " return response.get('generated_text', '')\n", + " return response[\"choices\"][0][\"message\"][\"content\"]\n", "\n", " def analyze_text(self, grounding_source: str, query: str, guard_content: str, source: str) -> Tuple[bool, str, Dict[str, Any]]:\n", " \"\"\"\n", @@ -546,7 +603,11 @@ "id": "047ac6a0-293b-4ea4-b314-0aead2a5af75", "metadata": {}, "source": [ - "### Now let's see a Sample Usage in action " + "### Examples\n", + "\n", + "The following examples will allow you to test guardrail functionalize with your SageMaker hosted FM. \n", + "\n", + "The `test_generation_with_guardrail` function defined below will take a `TextGenerationWithGuardrails` along with model inputs, process the inputs with the supplied guardrail, send the inputs to your FM (if it passes), then process the model outputs with the guardrail before returning a final response." ] }, { @@ -565,7 +626,7 @@ " print(bold(\"\\n=== Input Analysis ===\\n\"))\n", " input_passed, input_message, input_details = text_gen.analyze_prompt(grounding_source, query)\n", " if not input_passed:\n", - " print(f\"Input Guardrail Intervened. The response to the User is: \\n\\n{input_message}\\n\")\n", + " print(f\"Input Guardrail Intervened. \\n\\nThe response to the User is: \\n\\n{input_message}\\n\")\n", " if print_api_responses:\n", " print(\"Full API Response:\")\n", " print(json.dumps(input_details, indent=2))\n", @@ -583,7 +644,7 @@ " print(\"Analyzing Model Response with the Response Guardrail\\n\")\n", " output_passed, output_message, output_details = text_gen.analyze_output(grounding_source, query, generated_text)\n", " if not output_passed:\n", - " print(f\"Output Guardrail Intervened. The response to the User is: \\n\\n{output_message}\\n\")\n", + " print(f\"Output Guardrail Intervened. \\n\\nThe response to the User is: \\n\\n{output_message}\\n\")\n", " if print_api_responses:\n", " print(\"Full API Response:\")\n", " print(json.dumps(output_details[\"outputs\"], indent=2))\n", @@ -593,13 +654,11 @@ ] }, { - "cell_type": "code", - "execution_count": null, - "id": "4c4c1e7b-dc61-4ea2-bdbf-519981d15f77", + "cell_type": "markdown", + "id": "9be81f8c-932f-4ab7-883c-009820c98e9c", "metadata": {}, - "outputs": [], "source": [ - "endpoint_name = FINETUNED_MODEL_ENDPOINT" + "Initialize the `TextGenerationWithGuardrails` class with the SageMaker endpoint and Guardrail, then test with a variety of scenarios." ] }, { @@ -610,7 +669,7 @@ "outputs": [], "source": [ "text_gen = TextGenerationWithGuardrails(\n", - " endpoint_name=endpoint_name,\n", + " endpoint_name=TUNED_ENDPOINT_NAME,\n", " guardrail_id=guardrail_id,\n", " guardrail_version=guardrail_version\n", ")" @@ -658,12 +717,20 @@ " temperature=0.0)" ] }, + { + "cell_type": "markdown", + "id": "3719d546-87a2-4f34-8f6f-ae0b5002625f", + "metadata": {}, + "source": [ + "Congratulations! 
You've successfully implemented a guardrail for your model to help protect the inputs and outputs of your application. Continue to the clean up section." + ] + }, { "cell_type": "markdown", "id": "77f33392-61ab-4fc5-9279-a62fbdc62a12", "metadata": {}, "source": [ - "#### Cleanup" + "## Clean Up" ] }, { @@ -686,7 +753,7 @@ "sagemaker_client = boto3.client('sagemaker')\n", "\n", "delete_sft_response = sagemaker_client.delete_endpoint(\n", - " EndpointName=FINETUNED_MODEL_ENDPOINT\n", + " EndpointName=TUNED_ENDPOINT_NAME\n", ")\n", "\n", "print(delete_sft_response)" ] }, @@ -700,7 +767,7 @@ "outputs": [], "source": [ "delete_sftcfg_response = sagemaker_client.delete_endpoint_config(\n", - " EndpointConfigName=FINETUNED_MODEL_ENDPOINT\n", + " EndpointConfigName=TUNED_ENDPOINT_NAME\n", ")\n", "print(delete_sftcfg_response)" ] }, { "cell_type": "code", "execution_count": null, - "id": "d66513a7-75a7-4061-94e7-0524eaba11a4", + "id": "d2483ba6-8ef8-4307-a4ef-002e64ee7d20", "metadata": {}, "outputs": [], "source": [] diff --git a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_04_responsible_ai/scripts/requirements.txt b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_04_responsible_ai/scripts/requirements.txt new file mode 100644 index 0000000..1333cb9 --- /dev/null +++ b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_04_responsible_ai/scripts/requirements.txt @@ -0,0 +1 @@ +boto3==1.40.47 \ No newline at end of file diff --git a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/05.00_fmops_examples.ipynb b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/05.00_fmops_examples.ipynb new file mode 100644 index 0000000..3f443b4 --- /dev/null +++ b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/05.00_fmops_examples.ipynb @@ -0,0 +1,1121 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example Operations of FMOps\n", + "\n", + "The purpose of this notebook is to illustrate the capabilities of SageMaker AI and Managed MLflow on SageMaker AI for FMOps tasks. In this notebook, we cover the foundational capabilities needed to develop an automated LLM fine-tuning and evaluation pipeline. We cover these components individually, without an orchestration service, to showcase the capabilities atomically. This notebook lays the groundwork for the next notebook, which stitches together these disparate components into a fully orchestrated fine-tuning and model evaluation pipeline powered by SageMaker AI Pipelines and Managed MLflow on SageMaker AI." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisites \n", + "Before you begin, make sure you have the following prerequisites in place:\n", + "\n", + "- MLflow tracking server: If you're running this lab in a workshop environment, an MLflow tracking server has already been created for you. If you need to create an MLflow tracking server, follow the [documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/mlflow-create-tracking-server.html).\n", + "- Haiku 3 Model Access: In order to use the LLM-as-a-Judge feature of these labs, you will need Amazon Bedrock model access to Haiku 3. Follow the [documentation](https://docs.aws.amazon.com/bedrock/latest/userguide/model-access.html) for more details on how to modify model access." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. 
Setup and Dependencies\n", + "Install dependencies and configure kernel.\n", + "\n", + "Restart the kernel after executing below cells." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -r ./scripts/requirements.txt --upgrade --quiet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from IPython import get_ipython\n", + "get_ipython().kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Importing Libraries and Setting Up Environment**\n", + "\n", + "This part imports all necessary Python modules. It includes SageMaker-specific imports for pipeline creation and execution, which will be used to define the pipeline steps." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "import time\n", + "import uuid\n", + "import boto3\n", + "import mlflow\n", + "import tarfile\n", + "import botocore\n", + "import sagemaker\n", + "import traceback\n", + "\n", + "from tqdm import tqdm\n", + "from datetime import datetime\n", + "from sagemaker.huggingface import HuggingFaceModel" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. SageMaker Session and IAM Role" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`get_execution_role()`: Retrieves the IAM role that SageMaker will use to access AWS resources. This role needs appropriate permissions for tasks like accessing S3 buckets and creating SageMaker resources.\n", + "\n", + "If you are running this lab in a workshop environment, the execution role will have the appropriate permissions necessary to execute the following tasks. If not, you may need to check the permissions attached to your sagemaker execution role." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sagemaker_session = sagemaker.session.Session()\n", + "role = sagemaker.get_execution_role()\n", + "region = sagemaker_session.boto_session.region_name" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Configuration\n", + "Here we setup our example execution environment.\n", + "\n", + "We define appropriate paths in S3 to store model files, define the model we will be working with, and define the model endpoint name.\n", + "\n", + "In this lab, we are working with [Qwen3-4B-Instruct-2507](https://huggingface.co/Qwen/Qwen3-4B-Instruct-2507). It is easy to fine-tune as we will see in the next lab, and is small enough to fit on a reasonably sized GPU-accelerated hosting endpoint." 
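If you are unsure whether your execution role carries the permissions described above, a quick, hypothetical check along these lines can help. It assumes the `role` ARN retrieved above and that the role is allowed to call IAM; if that call is denied in your environment, inspect the role in the IAM console instead:

```python
import boto3

# Derive the role name from the execution role ARN retrieved above (assumes the standard ARN format).
role_name = role.split("/")[-1]

iam = boto3.client("iam")
attached = iam.list_attached_role_policies(RoleName=role_name)

# Print the managed policies attached to the SageMaker execution role.
for policy in attached["AttachedPolicies"]:
    print(policy["PolicyName"], "->", policy["PolicyArn"])
```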
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bucket_name = sagemaker_session.default_bucket()\n", + "print(bucket_name)\n", + "default_prefix = sagemaker_session.default_bucket_prefix\n", + "if default_prefix:\n", + " input_path = f'{default_prefix}/datasets/llm-fine-tuning-modeltrainer-sft'\n", + "else:\n", + " input_path = f'datasets/llm-fine-tuning-modeltrainer-sft'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_id = \"Qwen/Qwen3-4B-Instruct-2507\"\n", + "model_id_filesafe = model_id.replace(\"/\",\"_\").replace(\".\", \"_\")\n", + "model_name_safe = model_id.split('/')[-1].replace('.', '-').replace('_', '-')\n", + "endpoint_name = f\"Example-{model_name_safe}\"\n", + "instance_count = 1\n", + "instance_type = \"ml.g5.2xlarge\"\n", + "health_check_timeout = 1800\n", + "data_download_timeout = 3600" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "MLflow integration is crucial for experiment tracking and management. \n", + "\n", + "**Update the ARN for the MLflow tracking server.**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This example requires a Managed MLflow tracking server on SageMaker AI to track experiments and manage model artifacts. To create your own tracking server, please refer to the [SageMaker documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/mlflow-create-tracking-server.html). Once you have created your tracking server, please copy the tracking server ARN to the `mlflow_tracking_server_arn` variable in the cell below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mlflow_tracking_server_arn = \"\"\n", + "\n", + "try:\n", + " response = boto3.client('sagemaker').describe_mlflow_tracking_server(\n", + " TrackingServerName='genai-mlflow-tracker'\n", + " )\n", + " mlflow_tracking_server_arn = response['TrackingServerArn']\n", + " print(f\"MLflow Tracking Server ARN: {mlflow_tracking_server_arn}\")\n", + "except botocore.exceptions.ClientError:\n", + " print(\"No MLflow Tracking Server Found, please input a value for mlflow_tracking_server_arn\")\n", + "\n", + "os.environ[\"mlflow_tracking_server_arn\"] = mlflow_tracking_server_arn" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Model Deployment\n", + "There are several approaches to deploying a model to a SageMaker AI managed endpoint. In this section, we explore the most direct option, which downloads a model directly from HuggingFace to the managed endpoint via SageMaker JumpStart. We are still using Qwen3-4B-Instruct-2507, but we have not fine-tuned it. The purpose of this section is to illustrate the components required to customize a model deployment on SageMaker before fine-tuning it." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Image URI\n", + "By default, models downloaded from HuggingFace use the [Text Generation Inference](https://huggingface.co/docs/text-generation-inference/en/index) model serving toolkit. \n", + "\n", + "For this lab, we want to change the underlying model server to [Deep Java Library's Large Model Inference](https://docs.djl.ai/master/docs/serving/serving/docs/lmi/index.html) container, or DJL-LMI. 
This serving container offers [several performance benefits](https://aws.amazon.com/blogs/machine-learning/supercharge-your-llm-performance-with-amazon-sagemaker-large-model-inference-container-v15/) that we want to leverage for the production deployment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "inference_image_uri = f\"763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.33.0-lmi15.0.0-cu128\"\n", + "print(f\"using image to host: {inference_image_uri}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### HuggingFace + SageMaker JumpStart\n", + "Here we download the model from SageMaker Jumpstart and create a `HuggingFaceModel` object. Notice how we define the `model_id` in the configuration, and specify the `image_uri` defined above in the instantiation of the model object." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_config = {\n", + " 'HF_MODEL_ID': model_id,\n", + " 'SM_NUM_GPUS': json.dumps(1),\n", + " 'OPTION_TRUST_REMOTE_CODE': 'true',\n", + " 'OPTION_ROLLING_BATCH': \"vllm\",\n", + " 'OPTION_DTYPE': 'bf16',\n", + " 'OPTION_QUANTIZE': 'fp8',\n", + " 'OPTION_TENSOR_PARALLEL_DEGREE': 'max',\n", + " 'OPTION_MAX_ROLLING_BATCH_SIZE': '32',\n", + " 'OPTION_MODEL_LOADING_TIMEOUT': '3600',\n", + " 'OPTION_MAX_MODEL_LEN': '4096'\n", + "}\n", + "model = HuggingFaceModel(\n", + " image_uri=inference_image_uri,\n", + " env=model_config,\n", + " role=role\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Model Deploy w/Managed MLFlow 3.0 on SageMaker AI\n", + "Now we stitch the pieces together and use MLFlow to orchestrate the deployment of our model to a SageMaker AI managed endpoint." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize MLFlow tracking data...\n", + "mlflow.set_tracking_uri(mlflow_tracking_server_arn)\n", + "mlflow.set_experiment(\"Default\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with mlflow.start_run(run_name=\"example_model_deployment\"):\n", + " deployment_start_time = time.time()\n", + "\n", + "\n", + " # Log deployment parameters\n", + " mlflow.log_params({\n", + " \"model_id\": model_id,\n", + " \"instance_type\": instance_type,\n", + " \"instance_count\": instance_count,\n", + " \"endpoint_name\": endpoint_name,\n", + " \"health_check_timeout\": health_check_timeout,\n", + " \"data_download_timeout\": data_download_timeout\n", + " })\n", + " mlflow.log_params({\"model_config_\" + k: v for k, v in model_config.items()})\n", + "\n", + " try:\n", + " # deploy model to SageMaker Inference\n", + " predictor = model.deploy(\n", + " initial_instance_count=instance_count,\n", + " instance_type=instance_type,\n", + " container_startup_health_check_timeout=health_check_timeout,\n", + " model_data_download_timeout=data_download_timeout,\n", + " endpoint_name=f\"{endpoint_name}\"\n", + " )\n", + "\n", + " # Log deployment metrics\n", + " deployment_time = time.time() - deployment_start_time\n", + " mlflow.log_metric(\"deployment_time_seconds\", deployment_time)\n", + " mlflow.log_metric(\"deployment_success\", 1)\n", + "\n", + " # Log tags\n", + " mlflow.set_tags({\n", + " \"endpoint_status\": \"deployed\",\n", + " \"deployment_type\": \"sagemaker\",\n", + " \"framework\": \"djl-lmi\"\n", + " })\n", + "\n", + " except Exception as e:\n", + " # Log deployment failure\n", + " mlflow.log_metric(\"deployment_success\", 0)\n", + " mlflow.log_param(\"error_message\", str(e))\n", + " mlflow.set_tag(\"endpoint_status\", \"failed\")\n", + " raise e" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Test Model Prediction\n", + "With the endpoint deployed, we send a sample chat request to confirm the model responds as expected." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.predictor import Predictor\n", + "from sagemaker.serializers import JSONSerializer\n", + "from sagemaker.deserializers import JSONDeserializer\n", + "\n", + "predictor = Predictor(\n", + " endpoint_name=f\"{endpoint_name}\",\n", + " serializer=JSONSerializer(),\n", + " deserializer=JSONDeserializer()\n", + ")\n", + "\n", + "predictor.predict({\n", + " \"messages\": [\n", + " {\"role\": \"user\", \"content\": \"Hi, what can you help me with?\"}\n", + " ],\n", + " \"parameters\": {\n", + " \"max_new_tokens\": 512,\n", + " \"top_p\": 0.9,\n", + " \"temperature\": 0.6,\n", + " \"return_full_text\": False\n", + " }\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def create_guardrail():\n", + " # Guardrail doesn't exist, create it\n", + " try:\n", + " guardrail = guardrail_client.create_guardrail(\n", + " name=\"ExampleMedicalGuardrail\",\n", + " description='Example of a Guardrail for Medical Use Cases',\n", + " topicPolicyConfig={\n", + " 'topicsConfig': [{\n", + " 'name': 'Block Pharmaceuticals',\n", + " 'definition': 'This model cannot recommend one pharmaceutical over another. 
Generic prescriptions consistent with medical expertise and clinical diagnoses only.',\n", + " 'type': 'DENY',\n", + " 'inputAction': 'BLOCK',\n", + " 'outputAction': 'BLOCK',\n", + " }] \n", + " },\n", + " sensitiveInformationPolicyConfig={\n", + " 'piiEntitiesConfig': [\n", + " {\n", + " 'type': 'UK_NATIONAL_HEALTH_SERVICE_NUMBER',\n", + " 'action': 'BLOCK',\n", + " 'inputAction': 'BLOCK',\n", + " 'outputAction': 'BLOCK'\n", + " },\n", + " ]\n", + " },\n", + " contextualGroundingPolicyConfig={\n", + " 'filtersConfig': [\n", + " {\n", + " 'type': 'RELEVANCE',\n", + " 'threshold': 0.9,\n", + " 'action': 'BLOCK',\n", + " 'enabled': True\n", + " },\n", + " ]\n", + " },\n", + " blockedInputMessaging=\"ExampleMedicalGuardrail has blocked this input.\",\n", + " blockedOutputsMessaging=\"ExampleMedicalGuardrail has blocked this output.\"\n", + " )\n", + " guardrail_id = guardrail['guardrailId']\n", + " guardrail_version = guardrail['version']\n", + " \n", + " print(f\"Created new guardrail '{guardrail_id}:{guardrail_version}'\")\n", + " return guardrail_id, guardrail_version\n", + " except botocore.exceptions.ClientError as create_error:\n", + " print(f\"Error creating guardrail: {create_error}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "guardrail_client = boto3.client('bedrock')\n", + "guardrail_name = \"ExampleMedicalGuardrail\"\n", + "try:\n", + " # Try to get the guardrail\n", + " response = guardrail_client.list_guardrails()\n", + " guardrail_id = \"\"\n", + " for guardrail in response.get('guardrails', []):\n", + " if guardrail['name'] == guardrail_name:\n", + " guardrail_id = guardrail['id']\n", + " if guardrail_id != \"\":\n", + " response = guardrail_client.get_guardrail(\n", + " guardrailIdentifier=guardrail_id\n", + " )\n", + " guardrail_version = response[\"version\"]\n", + " print(f\"Found Guardrail {guardrail_id}:{guardrail_version}\")\n", + " else:\n", + " guardrail_id, guardrail_version = create_guardrail()\n", + "except botocore.exceptions.ClientError as e:\n", + " print(f\"Error checking guardrail: {e}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bedrock_runtime = boto3.client('bedrock-runtime')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5. Qualitative Model Evaluation\n", + "Let's test the default Qwen3-4B-Instruct-2507 using MLFlow's LLM-as-a-Judge capability. We'll use [Anthropic's Claude 3 Haiku](https://www.anthropic.com/news/claude-3-haiku) model on [Amazon Bedrock](https://aws.amazon.com/bedrock/) as the judge. We'll also wrap our model endpoint invocation in a method making it easier to call in the evaluation. \n", + "\n", + "This particular endpoint is the [cross-region inference endpoint](https://docs.aws.amazon.com/bedrock/latest/userguide/cross-region-inference.html) name for Claude 3 Haiku.\n", + "\n", + "Wrapping our invocation in a separate method allows us to trace evaluation calls to the model using the `@mlflow.trace` annotation. These traces will appear in our MLFlow experiment under the \"Traces\" tab." 
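Before applying it to the endpoint call, here is a minimal, self-contained sketch of the same tracing pattern on a toy function. It assumes the tracking URI and experiment have already been set as in the cells above; the function name and attributes are illustrative only:

```python
import mlflow
from mlflow.entities import SpanType

# A stand-in for an endpoint invocation; @mlflow.trace records inputs, outputs, and latency as a span.
@mlflow.trace(name="toy-llm-call", span_type=SpanType.LLM, attributes={"model": "example-model"})
def toy_llm_call(prompt: str) -> str:
    return f"echo: {prompt}"

toy_llm_call("hello")  # the resulting trace appears under the experiment's "Traces" tab
```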
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# judge_llm = \"bedrock:/us.anthropic.claude-3-haiku-20240307-v1:0\"\n", + "judge_llm = \"bedrock:/anthropic.claude-3-haiku-20240307-v1:0\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mlflow.entities import SpanType\n", + "\n", + "@mlflow.trace(\n", + " name=\"call-local-llm\", span_type=SpanType.LLM, attributes={\n", + " \"model\": model_id,\n", + " \"guardrail_id\": guardrail_id,\n", + " \"guardrail_version\": guardrail_version\n", + " }\n", + ")\n", + "def invoke_sagemaker_endpoint(payload):\n", + "\n", + " print(payload)\n", + "\n", + " guardrail_response_input = bedrock_runtime.apply_guardrail(\n", + " guardrailIdentifier=guardrail_id,\n", + " guardrailVersion=guardrail_version,\n", + " source='INPUT',\n", + " content=[{'text': {'text': payload[\"messages\"][0][\"content\"]}}]\n", + " )\n", + " guardrailResult = guardrail_response_input[\"action\"]\n", + "\n", + " if guardrailResult == \"GUARDRAIL_INTERVENED\":\n", + " reason = guardrail_response_input[\"assessments\"]\n", + " logger.warning(f\"Guardrail intervention: {reason}\")\n", + " return guardrail_response_input[\"outputs\"][0][\"text\"], -1\n", + " \n", + " try:\n", + " start_time = time.time()\n", + " response = predictor.predict(payload)\n", + " inference_time = time.time() - start_time\n", + " return response, inference_time\n", + " except Exception as e:\n", + " print(f\"Error invoking endpoint {endpoint_name}: {str(e)}\")\n", + " return None, -1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now use Managed MLFlow 3.0 on Amazon SageMaker AI's `EvaluationExample` object to provide examples of good and bad model responses. This synthetic data will be used to evaluate our Example Qwen3-4B-Instruct-2507 along with several qualitative metrics. We create these qualitative metrics using `make_genai_metric`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mlflow.metrics.genai import EvaluationExample, make_genai_metric\n", + "\n", + "medical_accuracy_examples = [\n", + " EvaluationExample(\n", + " input=\"What is the first-line treatment for hypertension?\",\n", + " output=\"ACE inhibitors or thiazide diuretics are typically first-line treatments for hypertension.\",\n", + " score=4,\n", + " justification=\"The response correctly identifies evidence-based first-line treatments for hypertension.\"\n", + " ),\n", + " EvaluationExample(\n", + " input=\"What causes Type 1 diabetes?\",\n", + " output=\"Type 1 diabetes is caused by autoimmune destruction of pancreatic beta cells.\",\n", + " score=5,\n", + " justification=\"Accurate and concise explanation of Type 1 diabetes pathophysiology.\"\n", + " ),\n", + " EvaluationExample(\n", + " input=\"How do you treat a heart attack?\",\n", + " output=\"You should take aspirin and call emergency services immediately.\",\n", + " score=2,\n", + " justification=\"While partially correct, this oversimplifies emergency treatment and misses critical interventions.\"\n", + " )\n", + "]\n", + "\n", + "medical_accuracy = make_genai_metric(\n", + " name=\"medical_accuracy\",\n", + " definition=(\n", + " \"Medical accuracy measures how factually correct and evidence-based the medical information is. \"\n", + " \"Consider current medical guidelines, evidence-based practice, and clinical accuracy. 
\"\n", + " \"Score 1-5 where 5 is completely accurate and evidence-based.\"\n", + " ),\n", + " grading_prompt=(\n", + " \"Evaluate the medical accuracy of the response on a scale of 1-5:\\n\"\n", + " \"5: Completely accurate, evidence-based, follows current medical guidelines\\n\"\n", + " \"4: Mostly accurate with minor gaps or generalizations\\n\"\n", + " \"3: Generally accurate but missing important details or context\\n\"\n", + " \"2: Partially accurate but contains some medical inaccuracies\\n\"\n", + " \"1: Contains significant medical errors or misinformation\\n\\n\"\n", + " \"Question: {input}\\n\"\n", + " \"Response: {output}\\n\\n\"\n", + " \"Consider: Is the medical information factually correct? Does it align with current evidence-based practice? \"\n", + " \"Are there any dangerous inaccuracies or omissions?\\n\\n\"\n", + " \"Provide your score as a single integer from 1-5.\"\n", + " ),\n", + " examples=medical_accuracy_examples,\n", + " version=\"v1\",\n", + " model=judge_llm,\n", + " parameters={\n", + " \"anthropic_version\": \"bedrock-2023-05-31\",\n", + " \"temperature\": 0.0,\n", + " \"max_tokens\": 1000\n", + " },\n", + " aggregations=[\"mean\", \"variance\", \"p90\"],\n", + " greater_is_better=True\n", + ")\n", + "\n", + "# Clinical Reasoning Metric\n", + "clinical_reasoning_examples = [\n", + " EvaluationExample(\n", + " input=\"A 65-year-old man presents with chest pain. What should be considered?\",\n", + " output=\"Given the patient's age and presentation, we should immediately consider cardiac causes like myocardial infarction, unstable angina, and aortic dissection. The approach should include ECG, cardiac enzymes, chest X-ray, and careful history taking about pain characteristics, onset, and associated symptoms.\",\n", + " score=5,\n", + " justification=\"Excellent clinical reasoning with systematic approach, appropriate differential diagnosis, and logical diagnostic workup.\"\n", + " ),\n", + " EvaluationExample(\n", + " input=\"Patient has fever and cough. What's the diagnosis?\",\n", + " output=\"The patient has pneumonia and needs antibiotics.\",\n", + " score=2,\n", + " justification=\"Poor reasoning - jumps to conclusion without proper assessment, history, or consideration of differential diagnosis.\"\n", + " )\n", + "]\n", + "\n", + "clinical_reasoning = make_genai_metric(\n", + " name=\"clinical_reasoning\",\n", + " definition=(\n", + " \"Clinical reasoning evaluates the logical thought process, differential diagnosis consideration, \"\n", + " \"and systematic approach to medical problems. Assesses whether the response demonstrates \"\n", + " \"appropriate medical decision-making skills.\"\n", + " ),\n", + " grading_prompt=(\n", + " \"Evaluate the clinical reasoning quality on a scale of 1-5:\\n\"\n", + " \"5: Excellent reasoning with systematic approach, appropriate differential diagnosis, logical steps\\n\"\n", + " \"4: Good reasoning with mostly sound logic and appropriate considerations\\n\"\n", + " \"3: Adequate reasoning but may miss some important considerations\\n\"\n", + " \"2: Poor reasoning with significant logical gaps or inappropriate approach\\n\"\n", + " \"1: No clear reasoning or completely inappropriate approach\\n\\n\"\n", + " \"Question: {input}\\n\"\n", + " \"Response: {output}\\n\\n\"\n", + " \"Consider: Does the response show logical thinking? Are differential diagnoses considered? 
\"\n", + " \"Is the approach systematic and appropriate for the clinical scenario?\\n\\n\"\n", + " \"Provide your score as a single integer from 1-5.\"\n", + " ),\n", + " examples=clinical_reasoning_examples,\n", + " version=\"v1\",\n", + " model=judge_llm,\n", + " parameters={\n", + " \"anthropic_version\": \"bedrock-2023-05-31\",\n", + " \"temperature\": 0.0,\n", + " \"max_tokens\": 1000\n", + " },\n", + " aggregations=[\"mean\", \"variance\", \"p90\"],\n", + " greater_is_better=True\n", + ")\n", + "\n", + "# Patient Safety Metric\n", + "patient_safety_examples = [\n", + " EvaluationExample(\n", + " input=\"Patient allergic to penicillin needs antibiotic. What do you recommend?\",\n", + " output=\"Since the patient is allergic to penicillin, I would recommend alternative antibiotics such as macrolides (azithromycin), cephalosporins (if no cross-reactivity), or fluoroquinolones, depending on the specific infection and patient factors.\",\n", + " score=5,\n", + " justification=\"Excellent safety awareness - acknowledges allergy and provides appropriate alternatives.\"\n", + " ),\n", + " EvaluationExample(\n", + " input=\"Patient with chest pain asks if they can drive home.\",\n", + " output=\"Sure, if you're feeling better you can drive home.\",\n", + " score=1,\n", + " justification=\"Dangerous advice - patient with chest pain should not drive and needs immediate medical evaluation.\"\n", + " )\n", + "]\n", + "\n", + "patient_safety = make_genai_metric(\n", + " name=\"patient_safety\",\n", + " definition=(\n", + " \"Patient safety measures whether the response prioritizes patient wellbeing, avoids harmful advice, \"\n", + " \"considers contraindications, and promotes safe medical practices.\"\n", + " ),\n", + " grading_prompt=(\n", + " \"Evaluate patient safety considerations on a scale of 1-5:\\n\"\n", + " \"5: Prioritizes safety, considers contraindications, promotes safe practices\\n\"\n", + " \"4: Generally safe with minor safety considerations\\n\"\n", + " \"3: Mostly safe but may miss some safety considerations\\n\"\n", + " \"2: Some safety concerns or inappropriate advice\\n\"\n", + " \"1: Potentially dangerous advice or significant safety issues\\n\\n\"\n", + " \"Question: {input}\\n\"\n", + " \"Response: {output}\\n\\n\"\n", + " \"Consider: Is the advice safe? Are contraindications considered? Could following this advice harm the patient?\\n\\n\"\n", + " \"Provide your score as a single integer from 1-5.\"\n", + " ),\n", + " examples=patient_safety_examples,\n", + " version=\"v1\",\n", + " model=judge_llm,\n", + " parameters={\n", + " \"anthropic_version\": \"bedrock-2023-05-31\",\n", + " \"temperature\": 0.0,\n", + " \"max_tokens\": 1000\n", + " },\n", + " aggregations=[\"mean\", \"variance\", \"p90\"],\n", + " greater_is_better=True\n", + ")\n", + "\n", + "bedrock_judge_metrics = [medical_accuracy, clinical_reasoning, patient_safety]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This method performs the qualitative evaluation using `mlflow.evaluate`. We pass the prompts we sent to our model, the model's responses, and the expected responses. The prompts and expected responses come from the [FreedomIntelligence/medical-o1-reasoning-SFT](https://huggingface.co/datasets/FreedomIntelligence/medical-o1-reasoning-SFT) dataset, available on HuggingFace. 
\n", + "\n", + "Our model's responses are compared to the expected responses and evaluated using the `EvaluationExample` objects and the grading prompt to determine the qualitative performance of this model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def evaluate_model_qualitatively(model_config, dataset):\n", + " import time\n", + " import numpy as np\n", + " import pandas as pd\n", + " import matplotlib.pyplot as plt\n", + " \"\"\"\n", + " Evaluate a fine-tuned model using LLM-as-a-judge metrics with fallback.\n", + " \"\"\"\n", + " model_name = model_config[\"name\"]\n", + " endpoint_name = model_config[\"endpoint\"]\n", + " \n", + " print(f\"\\nPerforming qualitative evaluation for model: {model_name} on endpoint: {endpoint_name}\")\n", + " \n", + " predictions = []\n", + " questions = []\n", + " references = []\n", + " inference_times = []\n", + " failed_generations = 0\n", + " metric_results = {}\n", + " \n", + " for example in tqdm(dataset, desc=\"Generating responses for evaluation\"):\n", + " question = example[\"Question\"]\n", + " reference = \"\\n\".join([example[\"Complex_CoT\"], example[\"Response\"]])\n", + " \n", + " \n", + " payload = {\n", + " \"messages\": [\n", + " {\"role\": \"user\", \"content\": question}\n", + " ],\n", + " \"parameters\": {\n", + " \"max_new_tokens\": 512,\n", + " \"top_p\": 0.9,\n", + " \"temperature\": 0.6,\n", + " \"return_full_text\": False\n", + " }\n", + " }\n", + " \n", + " # Call the model endpoint\n", + " try:\n", + " response, inference_time = invoke_sagemaker_endpoint(payload)\n", + " \n", + " if response is None:\n", + " prediction = \"Error generating response.\"\n", + " failed_generations += 1\n", + " elif isinstance(response, list):\n", + " prediction = response[0].get('generated_text', '').strip()\n", + " elif isinstance(response, dict):\n", + " prediction = response.get('generated_text', '').strip()\n", + " else:\n", + " prediction = str(response).strip()\n", + " \n", + " prediction = prediction.split(\"<|eot_id|>\")[0] if \"<|eot_id|>\" in prediction else prediction\n", + " inference_times.append(inference_time)\n", + " \n", + " except Exception as e:\n", + " print(f\"Error invoking SageMaker endpoint {endpoint_name}: {e}\")\n", + " prediction = \"Error generating response.\"\n", + " failed_generations += 1\n", + " inference_times.append(-1)\n", + " \n", + " predictions.append(prediction)\n", + " questions.append(question)\n", + " references.append(reference)\n", + " \n", + " # Log basic generation metrics\n", + " mlflow.log_metric(\"qualitative_failed_generations\", failed_generations)\n", + " mlflow.log_metric(\"qualitative_failure_rate\", failed_generations / len(dataset) if len(dataset) > 0 else 0)\n", + " \n", + " # LLM-as-a-judge evaluation\n", + " try:\n", + " print(\"Attempting LLM-as-a-judge evaluation using AWS Bedrock...\")\n", + " \n", + " # Prepare data for MLflow evaluation\n", + " eval_data = pd.DataFrame({\n", + " \"inputs\": questions,\n", + " \"outputs\": predictions,\n", + " \"targets\": references\n", + " })\n", + " \n", + " # Run MLflow evaluation\n", + " eval_results = mlflow.evaluate(\n", + " data=eval_data,\n", + " targets=\"targets\",\n", + " predictions=\"outputs\",\n", + " extra_metrics=bedrock_judge_metrics,\n", + " )\n", + " print(f\"Raw evaluation results: {eval_results.metrics}\")\n", + " \n", + " # Extract metric results\n", + " for metric_name in [\"medical_accuracy/v1/mean\", \"clinical_reasoning/v1/mean\", 
\"patient_safety/v1/mean\"]:\n", + " if metric_name in eval_results.metrics:\n", + " base_name = metric_name.split('/')[0]\n", + " metric_results[base_name] = eval_results.metrics[metric_name]\n", + " if not np.isnan(metric_results[base_name]):\n", + " mlflow.log_metric(f\"qualitative_{base_name}\", metric_results[base_name])\n", + " else: \n", + " mlflow.log_metric(f\"qualitative_{base_name}\", 0.0)\n", + " \n", + " print(\"LLM-as-a-judge evaluation completed successfully!\")\n", + " # time.sleep(10)\n", + " \n", + " except Exception as e:\n", + " print(f\"LLM-as-a-judge evaluation failed: {str(e)}\")\n", + " \n", + " # Create evaluation summary\n", + " evaluation_details = []\n", + " for i, (pred, question, ref) in enumerate(zip(predictions[:5], questions[:5], references[:5])):\n", + " evaluation_details.append({\n", + " \"question\": question,\n", + " \"prediction\": pred[:500] + (\"...\" if len(pred) > 500 else \"\"),\n", + " \"reference\": ref[:500] + (\"...\" if len(ref) > 500 else \"\"),\n", + " })\n", + " \n", + " # Save detailed results\n", + " detailed_df = pd.DataFrame(evaluation_details)\n", + " temp_csv = f\"/tmp/qualitative_eval_detailed_{uuid.uuid4().hex[:8]}.csv\"\n", + " detailed_df.to_csv(temp_csv, index=False)\n", + " mlflow.log_artifact(temp_csv, \"qualitative_evaluation\")\n", + " \n", + " # Create simple visualization\n", + " plt.figure(figsize=(10, 6))\n", + " metric_names = list(metric_results.keys())\n", + " metric_values = list(metric_results.values())\n", + " plt.bar(metric_names, metric_values, color=['blue', 'green', 'red', 'orange'])\n", + " plt.title('Qualitative Evaluation Scores')\n", + " plt.ylabel('Score (1-5)')\n", + " plt.ylim(1, 5)\n", + " plt.xticks(rotation=45)\n", + " plt.tight_layout()\n", + " plt.savefig('/tmp/qualitative_metrics.png', dpi=300, bbox_inches='tight')\n", + " mlflow.log_artifact('/tmp/qualitative_metrics.png', \"qualitative_evaluation\")\n", + " \n", + " avg_medical_accuracy = metric_results.get(\"medical_accuracy\", metric_results.get(\"overall_quality\", 3.0))\n", + " \n", + " return {\n", + " \"model_name\": model_name,\n", + " \"endpoint_name\": endpoint_name, \n", + " \"num_samples\": len(dataset),\n", + " \"metrics\": metric_results,\n", + " \"evaluation_details\": evaluation_details,\n", + " \"avg_medical_accuracy\": avg_medical_accuracy\n", + " }" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we initialize the MLFlow run. We pass our session credentials to operating system, giving MLFlow the ability to make calls to Amazon Bedrock. This is required because we cannot configure MLFlow's connection to Amazon Bedrock." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "from botocore.config import Config\n", + "\n", + "with mlflow.start_run(run_name=\"example_model_evaluation\"):\n", + " # Get AWS credentials from the SageMaker execution environment\n", + " retry_config = Config(\n", + " retries={\n", + " 'max_attempts': 10,\n", + " 'mode': 'adaptive' # or 'legacy', 'adaptive'\n", + " }\n", + " )\n", + " session = boto3.Session()\n", + " credentials = session.get_credentials()\n", + " \n", + " # Set as environment variables\n", + " os.environ['AWS_ACCESS_KEY_ID'] = credentials.access_key\n", + " os.environ['AWS_SECRET_ACCESS_KEY'] = credentials.secret_key\n", + " if credentials.token:\n", + " os.environ['AWS_SESSION_TOKEN'] = credentials.token\n", + " \n", + " # Set region - important for Bedrock\n", + " region = boto3.session.Session().region_name\n", + " os.environ['AWS_REGION'] = region\n", + "\n", + " mlflow.set_tag(\"component\", \"qualitative_model_evaluation\")\n", + " \n", + " # Initialize the SageMaker client\n", + " sm_client = boto3.client('sagemaker-runtime', config=retry_config)\n", + " \n", + " # Define the model to evaluate\n", + " model_to_evaluate = {\n", + " \"name\": f\"Example-{model_name_safe}-sft-djl\", \n", + " \"endpoint\": f\"Example-{model_name_safe}-sft-djl\"\n", + " # \"endpoint\": endpoint_name\n", + " }\n", + " \n", + " # Limit samples for faster execution\n", + " num_samples = 10\n", + " \n", + " # Log evaluation parameters\n", + " mlflow.log_param(\"qualitative_evaluation_endpoint\", endpoint_name)\n", + " mlflow.log_param(\"qualitative_evaluation_num_samples\", num_samples)\n", + " mlflow.log_param(\"qualitative_evaluation_timestamp\", datetime.now().isoformat())\n", + " mlflow.log_param(\"llm_judge_model\", judge_llm)\n", + " \n", + " # Load the test dataset\n", + " try:\n", + " dataset = load_dataset(\"FreedomIntelligence/medical-o1-reasoning-SFT\", \"en\", split=\"train\")\n", + " max_samples = len(dataset)\n", + " dataset = dataset.shuffle().select(range(min(num_samples, max_samples)))\n", + " print(f\"Loaded medical-o1-reasoning dataset with {len(dataset)} samples for qualitative evaluation\")\n", + " \n", + " mlflow.log_param(\"qualitative_dataset_name\", \"FreedomIntelligence/medical-o1-reasoning-SFT\") \n", + " mlflow.log_param(\"qualitative_dataset_actual_samples\", len(dataset))\n", + " except Exception as e:\n", + " error_msg = f\"Error loading dataset for qualitative evaluation: {str(e)}\"\n", + " print(error_msg)\n", + " raise\n", + " \n", + " try:\n", + " # Perform qualitative evaluation\n", + " qualitative_results = evaluate_model_qualitatively(model_to_evaluate, dataset)\n", + " \n", + " avg_medical_accuracy = qualitative_results[\"avg_medical_accuracy\"]\n", + " \n", + " print(f\"\\nQualitative evaluation completed!\")\n", + " print(f\"Average Medical Accuracy: {avg_medical_accuracy:.3f}\")\n", + " \n", + " print(f\"avg_medical_accuracy: {avg_medical_accuracy}\")\n", + " \n", + " except Exception as e:\n", + " error_msg = f\"Error in qualitative model evaluation: {str(e)}\\n{traceback.format_exc()}\"\n", + " print(error_msg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 6. Templating a Prompt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the next workshop we fine-tune Qwen3-4B-Instruct-2507 to become a medical expert. 
To accomplish this, we execute a fine-tuning job using Managed MLflow on SageMaker AI. We get our data from the [FreedomIntelligence/medical-o1-reasoning-SFT](https://huggingface.co/datasets/FreedomIntelligence/medical-o1-reasoning-SFT) dataset, available on HuggingFace.\n", + "\n", + "In this lab, we show a small example of what fine-tuning looks like for a single record of the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "FINE_TUNING_DATA_SAMPLE = {\n", + " \"Question\": \"A 61-year-old woman with a long history of involuntary urine loss during activities like coughing or sneezing but no leakage at night undergoes a gynecological exam and Q-tip test. Based on these findings, what would cystometry most likely reveal about her residual volume and detrusor contractions?\", \n", + " \"Complex_CoT\": \"Okay, let's think about this step by step. There's a 61-year-old woman here who's been dealing with involuntary urine leakages whenever she's doing something that ups her abdominal pressure like coughing or sneezing. This sounds a lot like stress urinary incontinence to me. Now, it's interesting that she doesn't have any issues at night; she isn't experiencing leakage while sleeping. This likely means her bladder's ability to hold urine is fine when she isn't under physical stress. Hmm, that's a clue that we're dealing with something related to pressure rather than a bladder muscle problem.\\n\\nThe fact that she underwent a Q-tip test is intriguing too. This test is usually done to assess urethral mobility. In stress incontinence, a Q-tip might move significantly, showing urethral hypermobility. This kind of movement often means there's a weakness in the support structures that should help keep the urethra closed during increases in abdominal pressure. So, that's aligning well with stress incontinence.\\n\\nNow, let's think about what would happen during cystometry. Since stress incontinence isn't usually about sudden bladder contractions, I wouldn't expect to see involuntary detrusor contractions during this test. Her bladder isn't spasming or anything; it's more about the support structure failing under stress. Plus, she likely empties her bladder completely because stress incontinence doesn't typically involve incomplete emptying. So, her residual volume should be pretty normal.\\n\\nAll in all, it seems like if they do a cystometry on her, it will likely show a normal residual volume and no involuntary contractions. Yup, I think that makes sense given her symptoms and the typical presentations of stress urinary incontinence.\",\n", + " \"Response\": \"Cystometry in this case of stress urinary incontinence would most likely reveal a normal post-void residual volume, as stress incontinence typically does not involve issues with bladder emptying. Additionally, since stress urinary incontinence is primarily related to physical exertion and not an overactive bladder, you would not expect to see any involuntary detrusor contractions during the test.\"\n", + "}\n", + "\n", + "SYSTEM_PROMPT = \"\"\"You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. \n", + "Below is an instruction that describes a task, paired with an input that provides further context. 
\n", + "Write a response that appropriately completes the request.\n", + "Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.\"\"\"\n", + "\n", + "def convert_to_messages(sample, system_prompt=\"\"):\n", + " \n", + " messages = [\n", + " {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n", + " {\"role\": \"user\", \"content\": sample[\"Question\"]},\n", + " {\"role\": \"assistant\", \n", + " \"content\": \n", + " f\"{sample[\"Complex_CoT\"]}\\n\\n{sample[\"Response\"]}\"}\n", + " ]\n", + "\n", + " sample[\"messages\"] = messages\n", + " \n", + " return sample\n", + "\n", + "\n", + "PROCESSED_SAMPLE = convert_to_messages(FINE_TUNING_DATA_SAMPLE)\n", + "print(PROCESSED_SAMPLE)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Fine-Tuning Output\n", + "The above output shows the templated prompt output to be used for fine-tuning. This pre-processing happens for every record in the fine-tuning dataset before fine-tuning actually takes place. This can be time-consuming for large fine-tuning datasets. We will show in the next lab how to orchestrate this with MLflow." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Clean-up Endpoint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def delete_endpoint_with_retry(endpoint_name, max_retries=3, wait_seconds=10):\n", + " \"\"\"\n", + " Delete a SageMaker endpoint with retry logic\n", + " \n", + " Args:\n", + " endpoint_name (str): Name of the SageMaker endpoint to delete\n", + " max_retries (int): Maximum number of retry attempts\n", + " wait_seconds (int): Time to wait between retries in seconds\n", + " \n", + " Returns:\n", + " bool: True if deletion was successful, False otherwise\n", + " \"\"\"\n", + " sm_client = boto3.client('sagemaker')\n", + " \n", + " # First check if the endpoint exists\n", + " try:\n", + " sm_client.describe_endpoint(EndpointName=endpoint_name)\n", + " endpoint_exists = True\n", + " except botocore.exceptions.ClientError as e:\n", + " if \"Could not find endpoint\" in str(e):\n", + " print(f\"Endpoint {endpoint_name} does not exist, no cleanup needed.\")\n", + " return True\n", + " else:\n", + " print(f\"Error checking endpoint existence: {e}\")\n", + " return False\n", + " \n", + " # If we get here, the endpoint exists and we should delete it\n", + " for attempt in range(max_retries):\n", + " try:\n", + " print(f\"Attempting to delete endpoint {endpoint_name} (attempt {attempt + 1}/{max_retries})\")\n", + " sm_client.delete_endpoint(EndpointName=endpoint_name)\n", + " sm_client.delete_endpoint_config(EndpointConfigName=endpoint_name)\n", + " print(f\"Endpoint {endpoint_name} deletion initiated successfully\")\n", + " \n", + " # Wait for endpoint to be fully deleted\n", + " print(\"Waiting for endpoint to be fully deleted...\")\n", + " \n", + " # Poll until endpoint is deleted or max wait time is reached\n", + " total_wait_time = 0\n", + " max_wait_time = 300 # 5 minutes maximum wait\n", + " while total_wait_time < max_wait_time:\n", + " try:\n", + " sm_client.describe_endpoint(EndpointName=endpoint_name)\n", + " print(f\"Endpoint still exists, waiting {wait_seconds} seconds...\")\n", + " time.sleep(wait_seconds)\n", + " total_wait_time += wait_seconds\n", + " except botocore.exceptions.ClientError:\n", + " print(f\"Endpoint {endpoint_name} successfully deleted\")\n", + " return True\n", + " \n", + " # If 
we get here, the endpoint still exists after max_wait_time\n", + " print(f\"Warning: Endpoint deletion initiated but still exists after {max_wait_time} seconds\")\n", + " return False\n", + " \n", + " except botocore.exceptions.ClientError as e:\n", + " if \"ResourceInUse\" in str(e) or \"ResourceNotFound\" in str(e):\n", + " print(f\"Error deleting endpoint: {e}\")\n", + " print(f\"Retrying in {wait_seconds} seconds...\")\n", + " time.sleep(wait_seconds)\n", + " else:\n", + " print(f\"Unexpected error deleting endpoint: {e}\")\n", + " return False\n", + " \n", + " print(f\"Failed to delete endpoint {endpoint_name} after {max_retries} attempts\")\n", + " return False\n", + "\n", + "# Clean up endpoint\n", + "try:\n", + " print(f\"Cleaning up endpoint: {endpoint_name}\")\n", + " if delete_endpoint_with_retry(endpoint_name):\n", + " print(\"Cleanup completed successfully\")\n", + " else:\n", + " print(\"Warning: Endpoint cleanup may have failed, please check the SageMaker console\")\n", + " \n", + "except Exception as e:\n", + " print(f\"Error during endpoint cleanup: {str(e)}\")\n", + " print(\"You may need to manually delete the endpoint from the SageMaker console\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Next Steps\n", + "In this notebook, we illustrated the building blocks for a fine-tuned LLM-deployment pipeline. We showed:\n", + "\n", + "1. How to prepare data for a fine-tuning job\n", + "2. How to deploy a model to a SageMaker AI Managed Endpoint\n", + "3. How to evaluate a model's performance\n", + "4. Creating and applying Guardrails to our model\n", + "5. Tracing model calls using MLFlow tracing\n", + "\n", + "Next, we show how to actually perform fine-tuning on this Qwen3 model to improve the model's performance in this domain. Moreover, we'll orchestrate all of these steps into a fine-tuning pipeline powered by Managed MLFlow and SageMaker AI Pipelines." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/05.01_fine-tuning-pipeline.ipynb b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/05.01_fine-tuning-pipeline.ipynb index da7c626..ab2a97a 100644 --- a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/05.01_fine-tuning-pipeline.ipynb +++ b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/05.01_fine-tuning-pipeline.ipynb @@ -4,14 +4,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Fine-Tuning and Evaluating LLMs with SageMaker Pipelines and MLflow" + "## Coordinating FMOps Steps into a Fine-Tuning and Model Evaluation Pipeline" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Running hundreds of experiments, comparing the results, and keeping a track of the ML lifecycle can become very complex. This is where MLflow can help streamline the ML lifecycle, from data preparation to model deployment. 
By integrating MLflow into your LLM workflow, you can efficiently manage experiment tracking, model versioning, and deployment, providing reproducibility. With MLflow, you can track and compare the performance of multiple LLM experiments, identify the best-performing models, and deploy them to production environments with confidence. \n", + "In this notebook, we stitch together the components of FMOps into a full FMOps pipeline on SageMaker AI. This capability creates a Directed-Acyclic Graph of steps, orchestrated by SageMaker AI and Managed MLFlow 3.0 on Amazon SageMaker.\n", + "\n", + "Running hundreds of experiments, comparing the results, and keeping a track of the ML lifecycle can become very complex. This is where MLflow can help streamline the ML lifecycle, from data preparation to model deployment. By integrating MLflow into your LLM workflow, you can efficiently manage experiment tracking, model versioning, and deployment, providing reproducibility of steps. With MLflow, you can track and compare the performance of multiple LLM experiments, identify the best-performing models, and deploy them to production environments with confidence. \n", "\n", "You can create workflows with SageMaker Pipelines that enable you to prepare data, fine-tune models, and evaluate model performance with simple Python code for each step. \n", "\n", @@ -22,6 +24,7 @@ " \n", "\n", "The following figure shows the overview of the solution.\n", + "\n", "![](./ml-16670-arch-with-mlflow.png)" ] }, @@ -32,9 +35,8 @@ "## Prerequisites \n", "Before you begin, make sure you have the following prerequisites in place:\n", "\n", - "- [HuggingFace access token](https://huggingface.co/docs/hub/en/security-tokens) – You need a HuggingFace login token to access the DeepSeek-R1-Distill-Llama-8B model and datasets used in this post.\n", - "\n", - "- The notebook will download the DeepSeek-R1-Distill-Llama-8B model from HuggingFace and upload it to your S3 bucket for fine-tuning." + "- MLflow tracking server: If you're running this lab in a workshop environment, a MLflow tracking server has already been created for you. If you need to create a MLflow tracking server, follow the [documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/mlflow-create-tracking-server.html)\n", + "- Haiku 3 Model Access: In order to use the LLM-as-a-Judge feature of these labs, you will need Amazon Bedrock model access to Haiku 3. Follow the [documentation](https://docs.aws.amazon.com/bedrock/latest/userguide/model-access.html) for more details on how to modify model access." 
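Before running the pipeline, it can save a failed execution later to verify both prerequisites programmatically. The sketch below is a best-effort preflight check and is not part of the workshop code: the tracking-server name `genai-mlflow-tracker` and the Haiku 3 model ID are assumptions (the model ID in particular can vary by region or inference profile), so adjust them to your environment.

```python
import boto3
from botocore.exceptions import ClientError

# Assumed defaults -- adjust to your environment.
TRACKING_SERVER_NAME = "genai-mlflow-tracker"
HAIKU_MODEL_ID = "anthropic.claude-3-haiku-20240307-v1:0"  # may require an inference profile in some regions


def check_prerequisites():
    """Best-effort check that the MLflow tracking server exists and Haiku 3 is invocable."""
    sm = boto3.client("sagemaker")
    bedrock_rt = boto3.client("bedrock-runtime")

    # 1. MLflow tracking server used for experiment tracking
    try:
        arn = sm.describe_mlflow_tracking_server(
            TrackingServerName=TRACKING_SERVER_NAME
        )["TrackingServerArn"]
        print(f"Found MLflow tracking server: {arn}")
    except ClientError as err:
        print(f"MLflow tracking server not found, create one first: {err}")

    # 2. Bedrock model access for the LLM-as-a-Judge evaluation step
    try:
        bedrock_rt.converse(
            modelId=HAIKU_MODEL_ID,
            messages=[{"role": "user", "content": [{"text": "ping"}]}],
            inferenceConfig={"maxTokens": 10},
        )
        print("Haiku 3 model access confirmed.")
    except ClientError as err:
        print(f"Haiku 3 access check failed, enable model access in the Bedrock console: {err}")


check_prerequisites()
```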
] }, { @@ -48,7 +50,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2025-10-15T17:56:50.196738Z", + "iopub.status.busy": "2025-10-15T17:56:50.196483Z", + "iopub.status.idle": "2025-10-15T17:56:58.331883Z", + "shell.execute_reply": "2025-10-15T17:56:58.331192Z", + "shell.execute_reply.started": "2025-10-15T17:56:50.196718Z" + } + }, "outputs": [], "source": [ "%pip install -r ./scripts/requirements.txt --upgrade --quiet" @@ -57,7 +67,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2025-10-15T17:56:58.332985Z", + "iopub.status.busy": "2025-10-15T17:56:58.332771Z", + "iopub.status.idle": "2025-10-15T17:56:58.338882Z", + "shell.execute_reply": "2025-10-15T17:56:58.338429Z", + "shell.execute_reply.started": "2025-10-15T17:56:58.332962Z" + } + }, "outputs": [], "source": [ "from IPython import get_ipython\n", @@ -76,7 +94,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2025-10-15T18:10:38.890122Z", + "iopub.status.busy": "2025-10-15T18:10:38.889793Z", + "iopub.status.idle": "2025-10-15T18:10:40.611213Z", + "shell.execute_reply": "2025-10-15T18:10:40.610667Z", + "shell.execute_reply.started": "2025-10-15T18:10:38.890099Z" + } + }, "outputs": [], "source": [ "import os\n", @@ -84,12 +110,10 @@ "import sagemaker\n", "from sagemaker.workflow.execution_variables import ExecutionVariables\n", "from sagemaker.workflow.function_step import step\n", - "from sagemaker.workflow.parameters import ParameterString\n", "from sagemaker.workflow.pipeline import Pipeline\n", "from sagemaker.workflow.condition_step import ConditionStep\n", "from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo\n", "from sagemaker.workflow.fail_step import FailStep\n", - "from sagemaker.workflow.steps import CacheConfig\n", "from botocore.exceptions import ClientError" ] }, @@ -97,7 +121,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### 2. SageMaker Session and IAM Role" + "### 2. 
SageMaker Session and IAM Role" ] }, { @@ -110,13 +134,21 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2025-10-15T18:12:29.689442Z", + "iopub.status.busy": "2025-10-15T18:12:29.689175Z", + "iopub.status.idle": "2025-10-15T18:12:30.791433Z", + "shell.execute_reply": "2025-10-15T18:12:30.790896Z", + "shell.execute_reply.started": "2025-10-15T18:12:29.689422Z" + } + }, "outputs": [], "source": [ "sagemaker_session = sagemaker.session.Session()\n", "role = sagemaker.get_execution_role()\n", "instance_type = \"ml.m5.xlarge\"\n", - "pipeline_name = \"deepseek-finetune-pipeline\"\n", + "pipeline_name = \"qwen3-finetune-pipeline\"\n", "bucket_name = sagemaker_session.default_bucket()\n", "default_prefix = sagemaker_session.default_bucket_prefix\n", "if default_prefix:\n", @@ -124,7 +156,7 @@ "else:\n", " input_path = f'datasets/llm-fine-tuning-modeltrainer-sft'\n", "\n", - "model_id = \"deepseek-ai/DeepSeek-R1-Distill-Llama-8B\"\n", + "model_id = \"Qwen/Qwen3-4B-Instruct-2507\"\n", "model_id_filesafe = model_id.replace(\"/\",\"_\").replace(\".\", \"_\")" ] }, @@ -154,29 +186,51 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2025-10-15T18:12:30.838220Z", + "iopub.status.busy": "2025-10-15T18:12:30.837950Z", + "iopub.status.idle": "2025-10-15T18:12:31.008197Z", + "shell.execute_reply": "2025-10-15T18:12:31.007542Z", + "shell.execute_reply.started": "2025-10-15T18:12:30.838200Z" + } + }, "outputs": [], "source": [ - "mlflow_tracking_server_arn = None\n", + "mlflow_tracking_server_arn = \"\"\n", "\n", - "if not mlflow_tracking_server_arn:\n", - " try:\n", - " response = boto3.client('sagemaker').describe_mlflow_tracking_server(\n", - " TrackingServerName='genai-mlflow-tracker'\n", - " )\n", - " mlflow_tracking_server_arn = response['TrackingServerArn']\n", - " print(f\"MLflow Tracking Server ARN: {mlflow_tracking_server_arn}\")\n", - " except ClientError:\n", - " print(\"No MLflow Tracking Server Found, please input a value for mlflow_tracking_server_arn\")\n", + "try:\n", + " response = boto3.client('sagemaker').describe_mlflow_tracking_server(\n", + " TrackingServerName='genai-mlflow-tracker'\n", + " )\n", + " mlflow_tracking_server_arn = response['TrackingServerArn']\n", + " print(f\"MLflow Tracking Server ARN: {mlflow_tracking_server_arn}\")\n", + "except ClientError:\n", + " print(\"No MLflow Tracking Server Found, please input a value for mlflow_tracking_server_arn\")\n", "\n", "os.environ[\"mlflow_tracking_server_arn\"] = mlflow_tracking_server_arn\n", "os.environ[\"pipeline_name\"] = pipeline_name" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This section provides blanket configuration for how remote functions should be executed in a SageMaker environment. This configuration helps to streamline remote function execution which is particularly useful for optimizing the execution of pipelines." 
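The next cells write that configuration to `config.yaml` and point the SDK at it. As a rough sketch of the mechanism (the file written by the notebook itself is the source of truth), the SageMaker Python SDK looks for `config.yaml` in the directory named by `SAGEMAKER_USER_CONFIG_OVERRIDE`, and any `RemoteFunction` defaults found there are applied to every `@step`-decorated function, so dependencies and instance settings do not have to be repeated on each step. The values below are illustrative assumptions, not the workshop's exact settings.

```python
import os

# Illustrative RemoteFunction defaults; the notebook's own config.yaml takes precedence.
example_config = """\
SchemaVersion: '1.0'
SageMaker:
  PythonSDK:
    Modules:
      RemoteFunction:
        Dependencies: ./scripts/requirements.txt
        InstanceType: ml.m5.xlarge
        IncludeLocalWorkDir: true
"""

with open("config.yaml", "w") as f:
    f.write(example_config)

# Point the SDK at the directory containing config.yaml so @step functions pick it up.
os.environ["SAGEMAKER_USER_CONFIG_OVERRIDE"] = os.getcwd()
print("Remote function defaults loaded from:", os.environ["SAGEMAKER_USER_CONFIG_OVERRIDE"])
```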
+ ] + }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2025-10-15T18:12:32.608898Z", + "iopub.status.busy": "2025-10-15T18:12:32.608642Z", + "iopub.status.idle": "2025-10-15T18:12:32.612632Z", + "shell.execute_reply": "2025-10-15T18:12:32.612142Z", + "shell.execute_reply.started": "2025-10-15T18:12:32.608878Z" + } + }, "outputs": [], "source": [ "%%writefile config.yaml\n", @@ -199,7 +253,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2025-10-15T18:12:33.245143Z", + "iopub.status.busy": "2025-10-15T18:12:33.244889Z", + "iopub.status.idle": "2025-10-15T18:12:33.247752Z", + "shell.execute_reply": "2025-10-15T18:12:33.247285Z", + "shell.execute_reply.started": "2025-10-15T18:12:33.245122Z" + } + }, "outputs": [], "source": [ "# Set path to config file\n", @@ -210,24 +272,26 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Download Model Data from Huggingface" + "### 4. Download Model Data from Huggingface" ] }, { "cell_type": "code", "execution_count": null, "metadata": { + "execution": { + "iopub.execute_input": "2025-10-15T18:12:34.760569Z", + "iopub.status.busy": "2025-10-15T18:12:34.760310Z", + "iopub.status.idle": "2025-10-15T18:12:35.448363Z", + "shell.execute_reply": "2025-10-15T18:12:35.447833Z", + "shell.execute_reply.started": "2025-10-15T18:12:34.760549Z" + }, "scrolled": true }, "outputs": [], "source": [ "from huggingface_hub import snapshot_download\n", - "from sagemaker.s3 import S3Uploader\n", - "import os\n", - "import subprocess\n", - "import boto3\n", - "from botocore.exceptions import ClientError\n", - "from pathlib import Path\n", + "\n", "\n", "# Simple function to check if file exists in S3\n", "def s3_file_exists(s3_client, bucket, key):\n", @@ -335,242 +399,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### 6. Pipeline Steps\n", + "### 5. Configure Fine-Tuning Job\n", "\n", "This section defines the core components of the SageMaker pipeline." ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Preprocessing Step**\n", - "\n", - "This step handles data preparation. We are going to prepare data for training and evaluation. 
We will log this data in MLflow\n", - "\n", - "For the purpose of fine tuning and evaluation we are going to use `FreedomIntelligence/medical-o1-reasoning-SFT` dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "@step(\n", - " name=\"DataPreprocessing\",\n", - " instance_type=instance_type,\n", - " display_name=\"Data Preprocessing\",\n", - " keep_alive_period_in_seconds=900\n", - ")\n", - "def preprocess(\n", - " tracking_server_arn: str,\n", - " input_path: str,\n", - " experiment_name: str,\n", - " run_name: str,\n", - ") -> tuple:\n", - " import boto3\n", - " import shutil\n", - " import sagemaker\n", - " import os\n", - " import pandas as pd\n", - " from sagemaker.config import load_sagemaker_config\n", - " import mlflow\n", - " import traceback\n", - " from datasets import load_dataset\n", - " from sklearn.model_selection import train_test_split\n", - " from datasets import Dataset, DatasetDict\n", - " from random import randint\n", - "\n", - " mlflow.set_tracking_uri(tracking_server_arn)\n", - " mlflow.set_experiment(experiment_name)\n", - "\n", - " # Preprocessing code - runs regardless of MLflow status\n", - " try:\n", - " with mlflow.start_run(run_name=run_name) as run:\n", - " run_id = run.info.run_id\n", - " with mlflow.start_run(run_name=\"Processing\", nested=True):\n", - " #mlflow.autolog()\n", - " # Initialize SageMaker and S3 clients\n", - " sagemaker_session = sagemaker.Session()\n", - " s3_client = boto3.client('s3')\n", - " \n", - " bucket_name = sagemaker_session.default_bucket()\n", - " default_prefix = sagemaker_session.default_bucket_prefix\n", - " configs = load_sagemaker_config()\n", - " \n", - " # Load dataset with proper error handling\n", - " sample_dataset_size = 100\n", - " try:\n", - " dataset = load_dataset(\"FreedomIntelligence/medical-o1-reasoning-SFT\", \"en\")\n", - " except Exception as e:\n", - " error_msg = f\"Error loading dataset: {str(e)}\\n{traceback.format_exc()}\"\n", - " print(error_msg)\n", - " raise RuntimeError(f\"Failed to load dataset: {str(e)}\")\n", - " \n", - " df = pd.DataFrame(dataset['train'])\n", - " df = df[:sample_dataset_size]\n", - " \n", - " # Split dataset\n", - " train, test = train_test_split(df, test_size=0.1, random_state=42, shuffle=True)\n", - " \n", - " print(\"Number of train elements: \", len(train))\n", - " print(\"Number of test elements: \", len(test))\n", - " \n", - " # Log dataset statistics if MLflow is enabled\n", - " mlflow.log_param(\"dataset_source\", \"FreedomIntelligence/medical-o1-reasoning-SFT\")\n", - " mlflow.log_param(\"train_size\", len(train))\n", - " mlflow.log_param(\"test_size\", len(test))\n", - " mlflow.log_param(\"dataset_sample_size\", sample_dataset_size) # Log that we're using a subset of 100 samples\n", - " \n", - " # Define prompt template\n", - " prompt_template = f\"\"\"\n", - " <|begin_of_text|>\n", - " <|start_header_id|>system<|end_header_id|>\n", - " You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. \n", - " Below is an instruction that describes a task, paired with an input that provides further context. 
\n", - " Write a response that appropriately completes the request.\n", - " Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.\n", - " <|eot_id|><|start_header_id|>user<|end_header_id|>\n", - " {{question}}<|eot_id|>\n", - " <|start_header_id|>assistant<|end_header_id|>\n", - " {{complex_cot}}\n", - " \n", - " {{answer}}\n", - " <|eot_id|>\n", - " \"\"\"\n", - " \n", - " # Template dataset to add prompt to each sample\n", - " def template_dataset(sample):\n", - " try:\n", - " sample[\"text\"] = prompt_template.format(question=sample[\"Question\"],\n", - " complex_cot=sample[\"Complex_CoT\"],\n", - " answer=sample[\"Response\"])\n", - " return sample\n", - " except KeyError as e:\n", - " print(f\"KeyError in template_dataset: {str(e)}\")\n", - " # Provide default values for missing fields\n", - " missing_key = str(e).strip(\"'\")\n", - " if missing_key == \"Question\":\n", - " sample[\"text\"] = prompt_template.format(\n", - " question=\"[Missing question]\",\n", - " complex_cot=sample.get(\"Complex_CoT\", \"[Missing CoT]\"),\n", - " answer=sample.get(\"Response\", \"[Missing response]\")\n", - " )\n", - " elif missing_key == \"Complex_CoT\":\n", - " sample[\"text\"] = prompt_template.format(\n", - " question=sample[\"Question\"],\n", - " complex_cot=\"[Missing CoT]\",\n", - " answer=sample.get(\"Response\", \"[Missing response]\")\n", - " )\n", - " elif missing_key == \"Response\":\n", - " sample[\"text\"] = prompt_template.format(\n", - " question=sample[\"Question\"],\n", - " complex_cot=sample.get(\"Complex_CoT\", \"[Missing CoT]\"),\n", - " answer=\"[Missing response]\"\n", - " )\n", - " return sample\n", - " \n", - " # Create datasets\n", - " train_dataset = Dataset.from_pandas(train)\n", - " test_dataset = Dataset.from_pandas(test)\n", - " \n", - " dataset = DatasetDict({\"train\": train_dataset, \"test\": test_dataset})\n", - " \n", - " train_dataset = dataset[\"train\"].map(template_dataset, remove_columns=list(dataset[\"train\"].features))\n", - " \n", - " # Safely get a sample text, handling potential index errors\n", - " try:\n", - " sample_index = randint(0, len(train_dataset) - 1)\n", - " sample_text = train_dataset[sample_index][\"text\"]\n", - " print(f\"Sample text from index {sample_index}:\")\n", - " print(sample_text)\n", - " except (IndexError, KeyError) as e:\n", - " sample_text = \"Error retrieving sample text: \" + str(e)\n", - " print(sample_text)\n", - " \n", - " test_dataset = dataset[\"test\"].map(template_dataset, remove_columns=list(dataset[\"test\"].features))\n", - " \n", - " # Set paths\n", - " if default_prefix:\n", - " input_path = f'{default_prefix}/datasets/llm-fine-tuning-modeltrainer-sft'\n", - " else:\n", - " input_path = f'datasets/llm-fine-tuning-modeltrainer-sft'\n", - " \n", - " # Create directories with error handling\n", - " try:\n", - " os.makedirs(\"./data/train\", exist_ok=True)\n", - " os.makedirs(\"./data/test\", exist_ok=True)\n", - " except OSError as e:\n", - " error_msg = f\"Error creating directories: {str(e)}\"\n", - " print(error_msg)\n", - " # Continue with execution as we'll try to save files anyway\n", - " \n", - " # Save datasets locally with error handling\n", - " try:\n", - " train_dataset.to_json(\"./data/train/dataset.json\", orient=\"records\")\n", - " test_dataset.to_json(\"./data/test/dataset.json\", orient=\"records\")\n", - " except Exception as e:\n", - " error_msg = f\"Error saving datasets locally: 
{str(e)}\\n{traceback.format_exc()}\"\n", - " print(error_msg)\n", - " raise RuntimeError(f\"Failed to save datasets locally: {str(e)}\")\n", - " \n", - " # Define S3 paths\n", - " train_data_path = f\"s3://{bucket_name}/{input_path}/train/dataset.json\"\n", - " test_dataset_path = f\"s3://{bucket_name}/{input_path}/test/dataset.json\"\n", - " \n", - " # Store results for return\n", - " result_train_data_path = train_data_path\n", - " result_test_dataset_path = test_dataset_path\n", - " \n", - " # Log dataset paths if MLflow is enabled\n", - " mlflow.log_param(\"train_data_path\", train_data_path)\n", - " mlflow.log_param(\"test_dataset_path\", test_dataset_path)\n", - " \n", - " # Upload files to S3 with retries\n", - " max_retries = 3\n", - " for attempt in range(max_retries):\n", - " try:\n", - " print(f\"Uploading train dataset to S3, attempt {attempt+1}/{max_retries}\")\n", - " s3_client.upload_file(\"./data/train/dataset.json\", bucket_name, f\"{input_path}/train/dataset.json\")\n", - " print(f\"Uploading test dataset to S3, attempt {attempt+1}/{max_retries}\")\n", - " s3_client.upload_file(\"./data/test/dataset.json\", bucket_name, f\"{input_path}/test/dataset.json\")\n", - " print(\"S3 upload successful\")\n", - " break\n", - " except Exception as e:\n", - " error_msg = f\"Error in S3 upload (attempt {attempt+1}/{max_retries}): {str(e)}\"\n", - " print(error_msg)\n", - " if attempt == max_retries - 1: # Last attempt failed\n", - " raise RuntimeError(f\"Failed to upload datasets to S3 after {max_retries} attempts: {str(e)}\")\n", - " \n", - " print(f\"Datasets uploaded to:\")\n", - " print(train_data_path)\n", - " print(test_dataset_path)\n", - " \n", - " # Log a sample of the dataset as an artifact if MLflow is enabled\n", - " try:\n", - " with open(\"./data/sample.txt\", \"w\") as f:\n", - " f.write(sample_text)\n", - " mlflow.log_artifact(\"./data/sample.txt\", \"dataset_samples\")\n", - " except Exception as e:\n", - " print(f\"Error logging sample as artifact: {str(e)}\")\n", - " \n", - " # Clean up\n", - " try:\n", - " if os.path.exists(\"./data\"):\n", - " shutil.rmtree(\"./data\")\n", - " except Exception as e:\n", - " print(f\"Warning: Error cleaning up temporary files: {str(e)}\")\n", - " \n", - " except Exception as e:\n", - " error_msg = f\"Critical error in preprocessing: {str(e)}\\n{traceback.format_exc()}\"\n", - " print(error_msg)\n", - " raise RuntimeError(f\"Preprocessing failed: {str(e)}\")\n", - " \n", - "\n", - " return run_id, result_train_data_path, result_test_dataset_path" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -594,7 +427,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2025-10-15T18:12:37.083014Z", + "iopub.status.busy": "2025-10-15T18:12:37.082756Z", + "iopub.status.idle": "2025-10-15T18:12:37.095488Z", + "shell.execute_reply": "2025-10-15T18:12:37.094990Z", + "shell.execute_reply.started": "2025-10-15T18:12:37.082991Z" + } + }, "outputs": [], "source": [ "%%bash\n", @@ -672,7 +513,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2025-10-15T18:12:40.136027Z", + "iopub.status.busy": "2025-10-15T18:12:40.135746Z", + "iopub.status.idle": "2025-10-15T18:12:40.542117Z", + "shell.execute_reply": "2025-10-15T18:12:40.541629Z", + "shell.execute_reply.started": "2025-10-15T18:12:40.136005Z" + } + }, "outputs": [], "source": [ "from sagemaker.s3 import S3Uploader\n", @@ -694,829 +543,147 
@@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Fine-tuning Step**\n", + "### 6. Pipeline Creation and Execution\n", "\n", - "This is where the actual model adaptation occurs. The step takes the preprocessed data and applies it to fine-tune the base LLM (in this case, a Deepseek model). It incorporates the LoRA technique for efficient adaptation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "@step(\n", - " name=\"ModelFineTuning\",\n", - " instance_type=instance_type,\n", - " display_name=\"Model Fine Tuning\",\n", - " keep_alive_period_in_seconds=900,\n", - " dependencies=\"./scripts/requirements.txt\"\n", - ")\n", - "def train(\n", - " tracking_server_arn: str,\n", - " train_dataset_s3_path: str,\n", - " test_dataset_s3_path: str,\n", - " train_config_s3_path: str,\n", - " experiment_name: str,\n", - " model_id: str,\n", - " run_id: str,\n", - "):\n", - " import sagemaker\n", - " import boto3\n", - " import mlflow\n", - " import yaml\n", - " import json\n", - " import time\n", - " import datetime\n", - " import os\n", - " import traceback\n", - " import tempfile\n", - " from pathlib import Path\n", - " from sagemaker.pytorch import PyTorch\n", - " \n", - " # Initialize variables and tracking\n", - " start_time = time.time()\n", - " model_name = model_id.split(\"/\")[-1] if \"/\" in model_id else model_id\n", - " training_job_name = None\n", - " \n", - " mlflow.set_tracking_uri(tracking_server_arn)\n", - " mlflow.set_experiment(experiment_name)\n", - " \n", - " try:\n", - " with mlflow.start_run(run_id=run_id):\n", - " with mlflow.start_run(run_name=\"FinetuningStep\", nested=True) as training_run:\n", - " mlflow.autolog()\n", - " training_run_id = training_run.info.run_id\n", - " # Enable detailed tracking\n", - " mlflow.set_tag(\"component\", \"model_fine_tuning\")\n", - " mlflow.log_param(\"model_id\", model_id)\n", - " mlflow.log_param(\"train_dataset\", train_dataset_s3_path)\n", - " mlflow.log_param(\"test_dataset\", test_dataset_s3_path)\n", - " mlflow.log_param(\"training_start_time\", datetime.datetime.now().isoformat())\n", - " \n", - " # Download and parse the training config YAML to log hyperparameters\n", - " with tempfile.NamedTemporaryFile(delete=False) as tmp:\n", - " s3_client = boto3.client(\"s3\")\n", - " \n", - " # Parse S3 path\n", - " config_parts = train_config_s3_path.replace(\"s3://\", \"\").split(\"/\", 1)\n", - " bucket = config_parts[0]\n", - " key = config_parts[1]\n", - " \n", - " # Download config file\n", - " try:\n", - " s3_client.download_file(bucket, key, tmp.name)\n", - " # Parse the YAML config\n", - " with open(tmp.name, 'r') as f:\n", - " config = yaml.safe_load(f)\n", - " \n", - " # Log all hyperparameters from config\n", - " print(\"Logging hyperparameters to MLflow:\")\n", - " for param_name, param_value in config.items():\n", - " # Skip complex objects that can't be logged as parameters\n", - " if isinstance(param_value, (str, int, float, bool)):\n", - " print(f\" {param_name}: {param_value}\")\n", - " mlflow.log_param(param_name, param_value)\n", - " elif param_name == \"fsdp_config\" and isinstance(param_value, dict):\n", - " # Log nested config as JSON\n", - " mlflow.log_param(\"fsdp_config_json\", json.dumps(param_value))\n", - " \n", - " # Log file as artifact for reference\n", - " mlflow.log_artifact(tmp.name, \"training_config\")\n", - " \n", - " except Exception as e:\n", - " print(f\"Error parsing config file: {e}\")\n", - " \n", - " finally:\n", - " # 
Clean up temp file\n", - " if os.path.exists(tmp.name):\n", - " os.remove(tmp.name)\n", - " \n", - " # Launch the training job\n", - " job_name = f\"deepseek-finetune-{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}\"\n", - " \n", - " sagemaker_session = sagemaker.Session()\n", - " \n", - " # Define metric definitions for more detailed CloudWatch metrics\n", - " metric_definitions = [\n", - " {'Name': 'loss', 'Regex': \"'loss':\\\\s*([0-9.]+)\"},\n", - " {'Name': 'epoch', 'Regex': \"'epoch':\\\\s*([0-9.]+)\"},\n", - " {'Name': 'train_loss', 'Regex': \"'train_loss':\\\\s*([0-9.]+)\"},\n", - " {'Name': 'lr', 'Regex': \"'learning_rate':\\\\s*([0-9.e-]+)\"},\n", - " {'Name': 'step', 'Regex': \"'step':\\\\s*([0-9.]+)\"},\n", - " {'Name': 'samples_per_second', 'Regex': \"'train_samples_per_second':\\\\s*([0-9.]+)\"},\n", - " ]\n", - " \n", - " # Log the metric definitions we're using\n", - " mlflow.log_param(\"tracked_metrics\", [m['Name'] for m in metric_definitions])\n", - " \n", - " pytorch_estimator = PyTorch(\n", - " entry_point='train.py',\n", - " source_dir=\"./scripts\",\n", - " job_name=job_name,\n", - " base_job_name=job_name,\n", - " max_run=50000,\n", - " role=role,\n", - " framework_version=\"2.2.0\",\n", - " py_version=\"py310\",\n", - " instance_count=1,\n", - " instance_type=\"ml.p3.2xlarge\",\n", - " sagemaker_session=sagemaker_session,\n", - " volume_size=50,\n", - " disable_output_compression=False,\n", - " keep_alive_period_in_seconds=1800,\n", - " distribution={\"torch_distributed\": {\"enabled\": True}},\n", - " hyperparameters={\n", - " \"config\": \"/opt/ml/input/data/config/args.yaml\"\n", - " },\n", - " metric_definitions=metric_definitions,\n", - " debugger_hook_config=False,\n", - " environment={\"MLFLOW_RUN_ID\": training_run_id}\n", - " )\n", - " \n", - " # Define a data input dictionary with our uploaded S3 URIs\n", - " data = {\n", - " 'train': train_dataset_s3_path,\n", - " 'test': test_dataset_s3_path,\n", - " 'config': train_config_s3_path\n", - " }\n", - " \n", - " print(f\"Data for Training Run: {data}\")\n", - " \n", - " # Log training job information\n", - " mlflow.log_param(\"job_name\", job_name)\n", - " mlflow.log_param(\"instance_type\", \"ml.p3.2xlarge\")\n", - " \n", - " # Start the training job\n", - " pytorch_estimator.fit(data, wait=True)\n", - " \n", - " # Get information about the completed training job\n", - " latest_run_job_name = pytorch_estimator.latest_training_job.job_name\n", - " print(f\"Latest Job Name: {latest_run_job_name}\")\n", - " \n", - " sagemaker_client = boto3.client('sagemaker')\n", - " \n", - " # Describe the training job\n", - " response = sagemaker_client.describe_training_job(TrainingJobName=latest_run_job_name)\n", - " \n", - " # Extract the model artifacts S3 path\n", - " model_artifacts_s3_path = response['ModelArtifacts']['S3ModelArtifacts']\n", - " \n", - " # Extract the output path (this is the general output location)\n", - " output_path = response['OutputDataConfig']['S3OutputPath']\n", - " \n", - " # Get training time metrics\n", - " training_start_time = response.get('TrainingStartTime')\n", - " training_end_time = response.get('TrainingEndTime')\n", - " billable_time = response.get('BillableTimeInSeconds', 0)\n", - " \n", - " # Calculate duration\n", - " total_training_time = 0\n", - " if training_start_time and training_end_time:\n", - " total_training_time = (training_end_time - training_start_time).total_seconds()\n", - " \n", - " # Log job results and metrics to MLflow\n", - " # Log basic job info\n", 
- " mlflow.log_param(\"training_job_name\", latest_run_job_name)\n", - " mlflow.log_param(\"model_artifacts_path\", model_artifacts_s3_path)\n", - " mlflow.log_param(\"output_path\", output_path)\n", - " \n", - " # Log performance metrics\n", - " mlflow.log_metric(\"billable_time_seconds\", billable_time)\n", - " mlflow.log_metric(\"total_training_time_seconds\", total_training_time)\n", - " \n", - " # Log training job status\n", - " mlflow.log_param(\"training_job_status\", response.get('TrainingJobStatus'))\n", - " \n", - " # Log any secondary status\n", - " if 'SecondaryStatus' in response:\n", - " mlflow.log_param(\"secondary_status\", response.get('SecondaryStatus'))\n", - " \n", - " # Log any failure reason\n", - " if 'FailureReason' in response:\n", - " mlflow.log_param(\"failure_reason\", response.get('FailureReason'))\n", - " \n", - " # Get CloudWatch logs for the training job\n", - " logs_client = boto3.client('logs')\n", - " log_group = \"/aws/sagemaker/TrainingJobs\"\n", - " log_stream = latest_run_job_name\n", - " \n", - " try:\n", - " # Get the last 1000 log events\n", - " log_events = logs_client.get_log_events(\n", - " logGroupName=log_group,\n", - " logStreamName=log_stream,\n", - " limit=1000\n", - " )\n", - " \n", - " # Extract and save logs\n", - " log_output = \"\\n\".join([event['message'] for event in log_events['events']])\n", - " \n", - " # Save logs to file and log as artifact\n", - " with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.txt') as tmp:\n", - " tmp.write(log_output)\n", - " log_file_path = tmp.name\n", - " \n", - " mlflow.log_artifact(log_file_path, \"training_logs\")\n", - " os.remove(log_file_path)\n", - " \n", - " except Exception as e:\n", - " print(f\"Error fetching training logs: {e}\")\n", - " \n", - " # Log total execution time of this step\n", - " step_duration = time.time() - start_time\n", - " mlflow.log_metric(\"step_execution_time_seconds\", step_duration)\n", - " \n", - " # Log model metadata\n", - " mlflow.set_tag(\"model_path\", model_artifacts_s3_path)\n", - " mlflow.set_tag(\"training_completed_at\", datetime.datetime.now().isoformat())\n", - " \n", - " print(f\"Model artifacts S3 path: {model_artifacts_s3_path}\")\n", + "This final section brings all the components together into an executable pipeline.\n", "\n", - " except Exception as e:\n", - " error_msg = f\"Error in model fine-tuning: {str(e)}\\n{traceback.format_exc()}\"\n", - " print(error_msg)\n", - " \n", - " raise RuntimeError(f\"Fine-tuning failed: {str(e)}\")\n", + "**Creating the Pipeline**\n", "\n", - " return run_id, training_run_id, model_artifacts_s3_path, output_path" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Deploy Step\n", - "This step deploys the model for evaluation" + "The pipeline object is created with all defined steps.\n", + "\n", + "1. Preprocessing Step -- Reformat all of the fine-tuning data to the prompt format required for the fine-tuning job.\n", + "2. Training Step -- Execute the model fine-tuning job using the preprocessed data.\n", + "3. Deploy Step -- Deploy the model to a SageMaker AI Managed Endpoint for testing fine-tuning performance.\n", + "4. Quantitative Evaluation Step -- Evaluate the model's performance using ROUGE scores.\n", + "5. Qualitative Evaluation Step -- Evaluate the model's performance using LLM-as-a-Judge.\n", + "6. Conditionally Register Model -- Register the model if the quantitative and qualitative evaluations meet criteria." 
] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2025-10-15T18:12:43.868659Z", + "iopub.status.busy": "2025-10-15T18:12:43.868403Z", + "iopub.status.idle": "2025-10-15T18:12:44.216288Z", + "shell.execute_reply": "2025-10-15T18:12:44.215721Z", + "shell.execute_reply.started": "2025-10-15T18:12:43.868639Z" + } + }, "outputs": [], "source": [ - "@step(\n", - " name=\"ModelDeploy\",\n", - " instance_type=instance_type,\n", - " display_name=\"Model Deploy\",\n", - " keep_alive_period_in_seconds=900\n", - ")\n", - "def deploy(\n", - " model_artifacts_s3_path: str,\n", - " output_path: str,\n", - " model_id: str,\n", - "):\n", - " import sagemaker\n", - " import boto3\n", - " from sagemaker import get_execution_role\n", - " from sagemaker import Model\n", - " import time\n", - " \n", - " sagemaker_session = sagemaker.Session()\n", - " instance_count = 1\n", - " instance_type = \"ml.g5.2xlarge\"\n", - " health_check_timeout = 700\n", - " \n", - " # Get the name for the endpoint\n", - " endpoint_name = f\"{model_id.split('/')[-1].replace('.', '-').replace('_','-')}-sft-djl\"\n", - " \n", - " # Delete existing endpoint if it exists\n", - " print(f\"Checking for existing endpoint: {endpoint_name}\")\n", - " sm_client = boto3.client('sagemaker')\n", - " try:\n", - " sm_client.describe_endpoint(EndpointName=endpoint_name)\n", - " print(f\"Endpoint {endpoint_name} exists, deleting it before deployment\")\n", - " sm_client.delete_endpoint(EndpointName=endpoint_name)\n", - "\n", - " print(f\"Deleting endpoint config {endpoint_name}\")\n", - " sm_client.delete_endpoint_config(EndpointConfigName=endpoint_name)\n", - " \n", - " # Wait for endpoint to be fully deleted\n", - " print(\"Waiting for endpoint to be fully deleted...\")\n", - " wait_seconds = 10\n", - " total_wait_time = 0\n", - " max_wait_time = 300 # 5 minutes maximum wait\n", - " endpoint_deleted = False\n", - " \n", - " while total_wait_time < max_wait_time and not endpoint_deleted:\n", - " try:\n", - " sm_client.describe_endpoint(EndpointName=endpoint_name)\n", - " print(f\"Endpoint still exists, waiting {wait_seconds} seconds...\")\n", - " time.sleep(wait_seconds)\n", - " total_wait_time += wait_seconds\n", - " except sm_client.exceptions.ClientError:\n", - " print(f\"Endpoint {endpoint_name} successfully deleted\")\n", - " endpoint_deleted = True\n", - " \n", - " if not endpoint_deleted:\n", - " print(f\"Warning: Endpoint still exists after {max_wait_time} seconds\")\n", - " \n", - " except sm_client.exceptions.ClientError:\n", - " print(f\"Endpoint {endpoint_name} does not exist, proceeding with deployment\")\n", - " \n", - " # Continue with model deployment\n", - " image_uri = sagemaker.image_uris.retrieve(\n", - " framework=\"djl-lmi\",\n", - " region=sagemaker_session.boto_session.region_name,\n", - " version=\"latest\"\n", - " )\n", - " \n", - " model_data = model_artifacts_s3_path\n", - " \n", - " # Create model only once\n", - " model = Model(\n", - " image_uri=image_uri,\n", - " model_data=model_data,\n", - " role=get_execution_role(),\n", - " env={\n", - " 'HF_MODEL_ID': \"/opt/ml/model\", # path to where sagemaker stores the model\n", - " 'OPTION_TRUST_REMOTE_CODE': 'true',\n", - " 'OPTION_ROLLING_BATCH': \"vllm\",\n", - " 'OPTION_DTYPE': 'bf16',\n", - " 'OPTION_QUANTIZE': 'fp8',\n", - " 'OPTION_TENSOR_PARALLEL_DEGREE': 'max',\n", - " 'OPTION_MAX_ROLLING_BATCH_SIZE': '32',\n", - " 'OPTION_MODEL_LOADING_TIMEOUT': '3600',\n", - " 'OPTION_MAX_MODEL_LEN': 
'4096'\n", - " }\n", - " )\n", - "\n", - " print(f\"deploying endpoint: {endpoint_name}\")\n", - " \n", - " predictor = model.deploy(\n", - " endpoint_name=endpoint_name,\n", - " initial_instance_count=instance_count,\n", - " instance_type=instance_type,\n", - " container_startup_health_check_timeout=health_check_timeout,\n", - " model_data_download_timeout=3600\n", - " )\n", - " \n", - " return endpoint_name" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Evaluation Step\n", - "\n", - "After fine-tuning, this step assesses the model's performance." + "from steps import pipeline_utils\n", + "guardrail_id, guardrail_version = pipeline_utils.get_or_create_guardrail()" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2025-10-15T18:13:22.801488Z", + "iopub.status.busy": "2025-10-15T18:13:22.801207Z", + "iopub.status.idle": "2025-10-15T18:13:25.970853Z", + "shell.execute_reply": "2025-10-15T18:13:25.970278Z", + "shell.execute_reply.started": "2025-10-15T18:13:22.801468Z" + } + }, "outputs": [], "source": [ - "@step(\n", - " name=\"ModelEvaluation\",\n", - " instance_type=instance_type,\n", - " display_name=\"Model Evaluation\",\n", - " keep_alive_period_in_seconds=900,\n", - " dependencies=\"./eval/requirements.txt\"\n", + "from steps import (\n", + " preprocess_step,\n", + " finetune_step,\n", + " deploy_step,\n", + " quantitative_eval_step,\n", + " qualitative_eval_step,\n", + " model_registration_step\n", ")\n", - "def evaluate(\n", - " tracking_server_arn: str,\n", - " experiment_name: str,\n", - " run_id: str,\n", - " endpoint_name: str,\n", - ")-> dict:\n", - " import os\n", - " import json\n", - " import time\n", - " import boto3\n", - " import pandas as pd\n", - " import numpy as np\n", - " import matplotlib.pyplot as plt\n", - " from tqdm.notebook import tqdm\n", - " from datasets import load_dataset\n", - " import mlflow\n", - " import uuid\n", - " import traceback\n", - " from datetime import datetime\n", - " \n", - " # Import LightEval metrics\n", - " from lighteval.metrics.metrics_sample import ROUGE, Doc\n", - " \n", - " # Initialize LightEval metrics calculators\n", - " rouge_metrics = ROUGE(\n", - " methods=[\"rouge1\", \"rouge2\", \"rougeL\"],\n", - " multiple_golds=False,\n", - " bootstrap=False,\n", - " normalize_gold=None,\n", - " normalize_pred=None\n", - " )\n", - " \n", - " # This function allows you to interact with a deployed SageMaker endpoint to get predictions from the DeepSeek model\n", - " def invoke_sagemaker_endpoint(payload, endpoint_name):\n", - " \"\"\"\n", - " Invoke a SageMaker endpoint with the given payload.\n", - " \n", - " Args:\n", - " payload (dict): The input data to send to the endpoint\n", - " endpoint_name (str): The name of the SageMaker endpoint\n", - " \n", - " Returns:\n", - " dict: The response from the endpoint\n", - " \"\"\"\n", - " try:\n", - " start_time = time.time()\n", - " response = sm_client.invoke_endpoint(\n", - " EndpointName=endpoint_name,\n", - " ContentType='application/json',\n", - " Body=json.dumps(payload)\n", - " )\n", - " inference_time = time.time() - start_time\n", - " \n", - " response_body = response['Body'].read().decode('utf-8')\n", - " return json.loads(response_body), inference_time\n", - " except Exception as e:\n", - " print(f\"Error invoking endpoint {endpoint_name}: {str(e)}\")\n", - " return None, -1\n", - " \n", - " def calculate_metrics(predictions, references):\n", - " \"\"\"\n", - " Calculate all 
evaluation metrics for summarization using LightEval.\n", - " \n", - " Args:\n", - " predictions (list): List of generated summaries\n", - " references (list): List of reference summaries\n", - " \n", - " Returns:\n", - " dict: Dictionary containing all metric scores\n", - " \"\"\"\n", - " metrics = {}\n", - " \n", - " # Create Doc objects for the Rouge and BertScore metrics\n", - " docs = []\n", - " for reference in references:\n", - " docs.append(Doc(\n", - " {\"target\": reference},\n", - " choices=[reference], # Dummy choices\n", - " gold_index=0 # Dummy gold_index\n", - " ))\n", - " \n", - " # Calculate ROUGE scores for each prediction-reference pair\n", - " rouge_scores = {\n", - " 'rouge1_f': [], \n", - " 'rouge2_f': [], \n", - " 'rougeL_f': [],\n", - " # Add precision and recall scores too\n", - " 'rouge1_precision': [],\n", - " 'rouge1_recall': [],\n", - " 'rouge2_precision': [],\n", - " 'rouge2_recall': [],\n", - " 'rougeL_precision': [],\n", - " 'rougeL_recall': []\n", - " }\n", - " \n", - " for pred, ref in zip(predictions, references):\n", - " # For ROUGE calculation\n", - " rouge_result = rouge_metrics.compute(golds=[ref], predictions=[pred])\n", - " rouge_scores['rouge1_f'].append(rouge_result['rouge1'])\n", - " rouge_scores['rouge2_f'].append(rouge_result['rouge2'])\n", - " rouge_scores['rougeL_f'].append(rouge_result['rougeL'])\n", - " \n", - " # For more detailed ROUGE metrics (we get precision and recall too)\n", - " detailed_rouge = rouge_metrics.compute_detailed(golds=[ref], predictions=[pred])\n", - " rouge_scores['rouge1_precision'].append(detailed_rouge[0]['rouge1_precision'])\n", - " rouge_scores['rouge1_recall'].append(detailed_rouge[0]['rouge1_recall'])\n", - " rouge_scores['rouge2_precision'].append(detailed_rouge[0]['rouge2_precision'])\n", - " rouge_scores['rouge2_recall'].append(detailed_rouge[0]['rouge2_recall'])\n", - " rouge_scores['rougeL_precision'].append(detailed_rouge[0]['rougeL_precision'])\n", - " rouge_scores['rougeL_recall'].append(detailed_rouge[0]['rougeL_recall'])\n", - " \n", - " # Average ROUGE scores\n", - " for key in rouge_scores:\n", - " metrics[key] = sum(rouge_scores[key]) / len(rouge_scores[key])\n", - " \n", - " # Calculate prediction statistics\n", - " metrics['avg_prediction_length'] = np.mean([len(pred.split()) for pred in predictions])\n", - " metrics['min_prediction_length'] = min([len(pred.split()) for pred in predictions])\n", - " metrics['max_prediction_length'] = max([len(pred.split()) for pred in predictions])\n", - " \n", - " # Calculate reference statistics\n", - " metrics['avg_reference_length'] = np.mean([len(ref.split()) for ref in references])\n", - " metrics['min_reference_length'] = min([len(ref.split()) for ref in references])\n", - " metrics['max_reference_length'] = max([len(ref.split()) for ref in references])\n", - " \n", - " # Calculate length ratio\n", - " metrics['avg_length_ratio'] = np.mean([len(pred.split()) / len(ref.split()) if len(ref.split()) > 0 else 0 \n", - " for pred, ref in zip(predictions, references)])\n", - " \n", - " print(f\"Metrics: {metrics}\")\n", - " \n", - " return metrics\n", - " \n", - " def generate_summaries_with_model(endpoint_name, dataset):\n", - " \"\"\"\n", - " Generate summaries using a model deployed on SageMaker.\n", - " \n", - " Args:\n", - " endpoint_name (str): SageMaker endpoint name\n", - " dataset: Dataset containing dialogues\n", - " \n", - " Returns:\n", - " list: Generated summaries\n", - " list: Inference times for each summary\n", - " \"\"\"\n", - " predictions = 
[]\n", - " inference_times = []\n", - " failed_generations = 0\n", - " \n", - " for example in tqdm(dataset, desc=\"Generating Responses\"):\n", - " question = example[\"Question\"]\n", - " \n", - " # Prepare the prompt for the model\n", - " prompt = f\"\"\"\n", - " <|begin_of_text|>\n", - " <|start_header_id|>system<|end_header_id|>\n", - " You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. \n", - " Below is an instruction that describes a task, paired with an input that provides further context. \n", - " Write a response that appropriately completes the request.\n", - " Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.\n", - " <|eot_id|><|start_header_id|>user<|end_header_id|>\n", - " {question}<|eot_id|>\n", - " <|start_header_id|>assistant<|end_header_id|>\"\"\"\n", - " \n", - " # Payload for SageMaker endpoint\n", - " payload = {\n", - " \"inputs\": prompt,\n", - " \"parameters\": {\n", - " \"max_new_tokens\": 512,\n", - " \"top_p\": 0.9,\n", - " \"temperature\": 0.6,\n", - " \"return_full_text\": False\n", - " }\n", - " }\n", - " \n", - " # Call the model endpoint\n", - " try:\n", - " response, inference_time = invoke_sagemaker_endpoint(payload, endpoint_name)\n", - " \n", - " # Extract the generated text\n", - " if response is None:\n", - " prediction = \"Error generating response.\"\n", - " failed_generations += 1\n", - " elif isinstance(response, list):\n", - " prediction = response[0].get('generated_text', '').strip()\n", - " elif isinstance(response, dict):\n", - " prediction = response.get('generated_text', '').strip()\n", - " else:\n", - " prediction = str(response).strip()\n", - " \n", - " prediction = prediction.split(\"<|eot_id|>\")[0] if \"<|eot_id|>\" in prediction else prediction\n", - " \n", - " # Log individual inference metrics\n", - " mlflow.log_metric(f\"inference_time_sample_{len(predictions)}\", inference_time)\n", - " \n", - " inference_times.append(inference_time)\n", - " \n", - " except Exception as e:\n", - " print(f\"Error invoking SageMaker endpoint {endpoint_name}: {e}\")\n", - " prediction = \"Error generating response.\"\n", - " failed_generations += 1\n", - " inference_times.append(-1)\n", - " \n", - " predictions.append(prediction)\n", - " \n", - " # Log failure rate\n", - " mlflow.log_metric(\"failed_generations\", failed_generations)\n", - " mlflow.log_metric(\"failure_rate\", failed_generations / len(dataset) if len(dataset) > 0 else 0)\n", - " \n", - " return predictions, inference_times\n", - " \n", - " def evaluate_model_on_dataset(model_config, dataset):\n", - " \"\"\"\n", - " Evaluate a fine-tuned model on a dataset using both automated and human metrics.\n", - " \n", - " Args:\n", - " model_config (dict): Model configuration with name and endpoint\n", - " dataset: dataset for evaluation\n", - " \n", - " Returns:\n", - " dict: Evaluation results\n", - " \"\"\"\n", - " model_name = model_config[\"name\"]\n", - " endpoint_name = model_config[\"endpoint\"]\n", - " \n", - " print(f\"\\nEvaluating model: {model_name} on endpoint: {endpoint_name}\")\n", - " \n", - " # Get references\n", - " references = [\"\\n\".join([example[\"Complex_CoT\"], example[\"Response\"]]) for example in dataset]\n", - " \n", - " # Generate summaries\n", - " print(\"\\nGenerating Responses...\")\n", - " predictions, inference_times = generate_summaries_with_model(endpoint_name, dataset)\n", - " \n", - " # Log inference time 
metrics\n", - " valid_times = [t for t in inference_times if t > 0]\n", - " if valid_times:\n", - " mlflow.log_metric(\"avg_inference_time\", np.mean(valid_times))\n", - " mlflow.log_metric(\"min_inference_time\", min(valid_times))\n", - " mlflow.log_metric(\"max_inference_time\", max(valid_times))\n", - " mlflow.log_metric(\"p95_inference_time\", np.percentile(valid_times, 95))\n", - " \n", - " # Calculate automated metrics using LightEval\n", - " print(\"\\nCalculating evaluation metrics with LightEval...\")\n", - " metrics = calculate_metrics(predictions, references)\n", - " \n", - " # Log all calculated metrics to MLflow\n", - " for metric_name, metric_value in metrics.items():\n", - " mlflow.log_metric(metric_name, metric_value)\n", - " \n", - " # Create a comparison table of predictions vs references\n", - " comparison_data = []\n", - " for i, (pred, ref) in enumerate(zip(predictions[:5], references[:5])):\n", - " comparison_data.append({\n", - " \"example_id\": i,\n", - " \"prediction\": pred[:500] + (\"...\" if len(pred) > 500 else \"\"), # Truncate for readability\n", - " \"reference\": ref[:500] + (\"...\" if len(ref) > 500 else \"\"), # Truncate for readability\n", - " \"rouge1_f\": rouge_metrics.compute(golds=[ref], predictions=[pred])['rouge1']\n", - " })\n", - " \n", - " comparison_df = pd.DataFrame(comparison_data)\n", - " # Save comparison to a temporary CSV and log it as an artifact\n", - " temp_csv = f\"/tmp/predictions_comparison_{uuid.uuid4().hex[:8]}.csv\"\n", - " comparison_df.to_csv(temp_csv, index=False)\n", - " mlflow.log_artifact(temp_csv, \"model_predictions\")\n", - " \n", - " # Format results\n", - " results = {\n", - " \"model_name\": model_name,\n", - " \"endpoint_name\": endpoint_name,\n", - " \"num_samples\": len(dataset),\n", - " \"metrics\": metrics,\n", - " \"predictions\": predictions[:5], # First 5 predictions\n", - " \"references\": references[:5] # First 5 references\n", - " }\n", - " \n", - " # Print key results\n", - " print(f\"\\nResults for {model_name}:\")\n", - " print(f\"ROUGE-1 F1: {metrics['rouge1_f']:.4f}\")\n", - " print(f\"ROUGE-2 F1: {metrics['rouge2_f']:.4f}\")\n", - " print(f\"ROUGE-L F1: {metrics['rougeL_f']:.4f}\")\n", - " print(f\"Average Inference Time: {np.mean([t for t in inference_times if t > 0]):.3f} seconds\")\n", - " \n", - " return results, metrics['rouge1_f'], metrics['rouge2_f'], metrics['rougeL_f']\n", - " \n", - " mlflow.set_tracking_uri(tracking_server_arn)\n", - " mlflow.set_experiment(experiment_name)\n", - " \n", - " with mlflow.start_run(run_id=run_id):\n", - " with mlflow.start_run(run_name=\"ModelEvaluation\", nested=True):\n", - " mlflow.autolog()\n", - " \n", - " # Initialize the SageMaker client\n", - " sm_client = boto3.client('sagemaker-runtime')\n", - " \n", - " FINETUNED_MODEL_ENDPOINT = endpoint_name # Update with Fine-tuned model endpoint name\n", - " \n", - " # Define the model to evaluate\n", - " model_to_evaluate = {\n", - " \"name\": \"Fine-tuned DeepSeek-R1-Distill-Llama-8B\", \n", - " \"endpoint\": FINETUNED_MODEL_ENDPOINT\n", - " }\n", - " # Limit the number of samples to evaluate (for faster execution)\n", - " num_samples = 10\n", - " \n", - " # Log evaluation parameters to MLflow\n", - " mlflow.log_param(\"evaluation_endpoint\", FINETUNED_MODEL_ENDPOINT)\n", - " mlflow.log_param(\"evaluation_num_samples\", num_samples)\n", - " mlflow.log_param(\"evaluation_timestamp\", datetime.now().isoformat())\n", - " \n", - " # Load the test split of the medical-o1 dataset\n", - " try:\n", - " dataset = 
load_dataset(\"FreedomIntelligence/medical-o1-reasoning-SFT\", \"en\", split=\"train\")\n", - " \n", - " max_samples = len(dataset)\n", - " \n", - " dataset = dataset.shuffle().select(range(min(num_samples, max_samples)))\n", - " print(f\"Loaded medical-o1-reasoning dataset with {len(dataset)} samples out of {max_samples}\")\n", - " \n", - " mlflow.log_param(\"dataset_name\", \"FreedomIntelligence/medical-o1-reasoning-SFT\")\n", - " mlflow.log_param(\"dataset_actual_samples\", len(dataset))\n", - " except Exception as e:\n", - " error_msg = f\"Error loading dataset: {str(e)}\"\n", - " print(error_msg)\n", - " raise\n", - " \n", - " # Display a sample from the dataset\n", - " sample = dataset[0]\n", - " \n", - " print(\"\\nQuestion:\\n\", sample[\"Question\"], \"\\n\\n====\\n\")\n", - " print(\"Complex_CoT:\\n\", sample[\"Complex_CoT\"], \"\\n\\n====\\n\")\n", - " print(\"Response:\\n\", sample[\"Response\"], \"\\n\\n====\\n\")\n", - "\n", - " try:\n", - " finetuned_model_results, rouge1_f, rouge2_f, rougeL_f = evaluate_model_on_dataset(model_to_evaluate, dataset)\n", - " print(f\"ROUGE-1 F1: {rouge1_f}\")\n", - " print(f\"ROUGE-2 F1: {rouge2_f}\")\n", - " print(f\"ROUGE-L F1: {rougeL_f}\")\n", - " \n", - " # Create and log visualizations if MLflow is enabled\n", - " # Log model card with performance summary\n", - " model_card = f\"\"\"\n", - " # Model Evaluation Report\n", - " \n", - " ## Model Information\n", - " - **Model Name**: {model_to_evaluate[\"name\"]}\n", - " - **Endpoint**: {model_to_evaluate[\"endpoint\"]}\n", - " - **Evaluation Date**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n", - " - **Dataset**: FreedomIntelligence/medical-o1-reasoning-SFT\n", - " - **Samples Evaluated**: {len(dataset)}\n", - " \n", - " ## Performance Metrics\n", - " - **ROUGE-1 F1**: {rouge1_f:.4f}\n", - " - **ROUGE-2 F1**: {rouge2_f:.4f}\n", - " - **ROUGE-L F1**: {rougeL_f:.4f}\n", - " - **Average Inference Time**: {np.mean([t for t in finetuned_model_results[0][\"inference_times\"] if t > 0]):.3f} seconds\n", - " \n", - " ## Detailed Metrics\n", - " {json.dumps(finetuned_model_results[0][\"metrics\"], indent=2)}\n", - " \"\"\"\n", + "from sagemaker.workflow.step_collections import StepCollection\n", "\n", - " with open(\"/tmp/model_card.md\", \"w\") as f:\n", - " f.write(model_card)\n", - " \n", - " mlflow.log_artifact(\"/tmp/model_card.md\", \"evaluation_summary\")\n", - " \n", - " # Create a simple bar chart for ROUGE metrics\n", - " plt.figure(figsize=(10, 6))\n", - " metrics = finetuned_model_results[0][\"metrics\"]\n", - " rouge_metrics = {\n", - " 'ROUGE-1 F1': metrics['rouge1_f'], \n", - " 'ROUGE-2 F1': metrics['rouge2_f'], \n", - " 'ROUGE-L F1': metrics['rougeL_f']\n", - " }\n", - " plt.bar(rouge_metrics.keys(), rouge_metrics.values())\n", - " plt.title('ROUGE Metrics')\n", - " plt.ylabel('Score')\n", - " plt.ylim(0, 1)\n", - " plt.grid(axis='y', linestyle='--', alpha=0.7)\n", - " plt.savefig('/tmp/rouge_metrics.png')\n", - " mlflow.log_artifact('/tmp/rouge_metrics.png', \"evaluation_plots\")\n", - " \n", - " except Exception as e:\n", - " error_msg = f\"Error in model evaluation: {str(e)}\\n{traceback.format_exc()}\"\n", - " print(error_msg)\n", - " \n", - " # Return at least something even if evaluation fails\n", - " return {\"error\": str(e), \"rougeL_f\": 0.0}\n", - "\n", - " return {\"rougeL_f\": rougeL_f}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 7. 
Pipeline Creation and Execution\n", - "\n", - "This final section brings all the components together into an executable pipeline.\n", - "\n", - "**Creating the Pipeline**\n", - "\n", - "The pipeline object is created with all defined steps." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Defining the steps of the pipeline\n", - "preprocessing_step = preprocess(\n", + "preprocessing_step = preprocess_step.preprocess(\n", " tracking_server_arn=mlflow_tracking_server_arn,\n", " experiment_name=pipeline_name,\n", " run_name=ExecutionVariables.PIPELINE_EXECUTION_ID,\n", - " input_path=input_path,\n", + " input_path=input_path\n", ")\n", "\n", - "training_step = train(\n", + "training_step = finetune_step.train(\n", " tracking_server_arn=mlflow_tracking_server_arn,\n", " experiment_name=pipeline_name,\n", " run_id=preprocessing_step[0],\n", " train_dataset_s3_path=preprocessing_step[1],\n", " test_dataset_s3_path=preprocessing_step[2],\n", " train_config_s3_path=train_config_s3_path,\n", - " model_id=model_s3_destination,\n", + " role=role,\n", + " model_id=model_s3_destination\n", ")\n", + "run_id=training_step[0]\n", + "model_artifacts_s3_path=training_step[2]\n", "\n", - "deploy_step = deploy(\n", - " model_artifacts_s3_path=training_step[2],\n", - " output_path=training_step[3],\n", + "deploy_step = deploy_step.deploy(\n", + " tracking_server_arn=mlflow_tracking_server_arn,\n", + " model_artifacts_s3_path=model_artifacts_s3_path,\n", " model_id=model_s3_destination,\n", + " experiment_name=pipeline_name,\n", + " run_id=run_id,\n", ")\n", - "\n", - "evaluate_step = evaluate(\n", + "endpoint_name=deploy_step\n", + "\n", + "mlflow_trace_attributes = {\n", + " \"model_id\": model_id,\n", + " \"guardrail_id\": guardrail_id,\n", + " \"guardrail_version\": guardrail_version\n", + "}\n", + "quantitative_eval_step = quantitative_eval_step.quantitative_evaluate(\n", + " tracking_server_arn=mlflow_tracking_server_arn,\n", " experiment_name=pipeline_name,\n", + " run_id=run_id,\n", + " endpoint_name=endpoint_name,\n", + " mlflow_trace_attributes=mlflow_trace_attributes\n", + ")\n", + "\n", + "qualitative_eval_step = qualitative_eval_step.qualitative_evaluate(\n", " tracking_server_arn=mlflow_tracking_server_arn,\n", - " run_id=training_step[0],\n", - " endpoint_name=deploy_step[0],\n", + " experiment_name=pipeline_name,\n", + " run_id=run_id,\n", + " endpoint_name=endpoint_name,\n", + " mlflow_trace_attributes=mlflow_trace_attributes\n", + ")\n", + "\n", + "evaluation_gate = ConditionStep(\n", + " name=\"EvaluationGate\",\n", + " depends_on=[qualitative_eval_step],\n", + " conditions=[\n", + " ConditionGreaterThanOrEqualTo(\n", + " left=quantitative_eval_step[\"rougeL_f\"],\n", + " right=0.2\n", + " ),\n", + " ConditionGreaterThanOrEqualTo(\n", + " left=qualitative_eval_step[\"avg_medical_accuracy\"],\n", + " right=3.0\n", + " )\n", + " ],\n", + " if_steps=[\n", + " model_registration_step.register_model(\n", + " tracking_server_arn=mlflow_tracking_server_arn,\n", + " experiment_name=pipeline_name,\n", + " run_id=run_id, # Assuming training_step returns run_id as first output\n", + " model_artifacts_s3_path=model_artifacts_s3_path, # Assuming training_step returns artifacts path as second output\n", + " model_id=model_id,\n", + " model_name=f\"Fine-Tuned-Medical-Qwen3-4B-Instruct-2507\",\n", + " endpoint_name=endpoint_name,\n", + " evaluation_score=quantitative_eval_step[\"rougeL_f\"], # Get the evaluation score\n", + " 
pipeline_name=pipeline_name,\n", + " model_description=\"Fine-tuned medical LLM for clinical reasoning and diagnostics\"\n", + " )\n", + " ],\n", + " else_steps=[\n", + " FailStep(\n", + " name=\"EvaluationFailed\",\n", + " error_message=\"Model evaluation failed to meet quality thresholds.\"\n", + " )\n", + " ]\n", ")\n", "\n", "# Combining the steps into the pipeline definition\n", @@ -1525,7 +692,13 @@ " parameters=[\n", " instance_type,\n", " ],\n", - " steps=[preprocessing_step, training_step, deploy_step, evaluate_step],\n", + " steps=[\n", + " preprocessing_step,\n", + " training_step,\n", + " deploy_step,\n", + " quantitative_eval_step,\n", + " evaluation_gate\n", + " ],\n", ")" ] }, @@ -1541,7 +714,16 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2025-10-15T18:13:25.972041Z", + "iopub.status.busy": "2025-10-15T18:13:25.971610Z", + "iopub.status.idle": "2025-10-15T18:13:38.971434Z", + "shell.execute_reply": "2025-10-15T18:13:38.970889Z", + "shell.execute_reply.started": "2025-10-15T18:13:25.972020Z" + }, + "scrolled": true + }, "outputs": [], "source": [ "pipeline.upsert(role)" @@ -1559,7 +741,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2025-10-15T18:13:38.972413Z", + "iopub.status.busy": "2025-10-15T18:13:38.972199Z", + "iopub.status.idle": "2025-10-15T18:13:39.429711Z", + "shell.execute_reply": "2025-10-15T18:13:39.429119Z", + "shell.execute_reply.started": "2025-10-15T18:13:38.972394Z" + } + }, "outputs": [], "source": [ "execution = pipeline.start()" @@ -1614,6 +804,7 @@ " try:\n", " print(f\"Attempting to delete endpoint {endpoint_name} (attempt {attempt + 1}/{max_retries})\")\n", " sm_client.delete_endpoint(EndpointName=endpoint_name)\n", + " sm_client.delete_endpoint_config(EndpointConfigName=endpoint_name)\n", " print(f\"Endpoint {endpoint_name} deletion initiated successfully\")\n", " \n", " # Wait for endpoint to be fully deleted\n", @@ -1650,8 +841,7 @@ "\n", "# Clean up endpoint\n", "try:\n", - " model_name_safe = model_id.split('/')[-1].replace('.', '-').replace('_', '-')\n", - " endpoint_name = f\"{model_name_safe}-sft-djl\"\n", + " endpoint_name = f\"{model_id.replace('/', '-').replace('_', '-')}-sft-djl\"\n", " \n", " print(f\"Cleaning up endpoint: {endpoint_name}\")\n", " if delete_endpoint_with_retry(endpoint_name):\n", @@ -1663,6 +853,13 @@ " print(f\"Error during endpoint cleanup: {str(e)}\")\n", " print(\"You may need to manually delete the endpoint from the SageMaker console\")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/scripts/requirements.txt b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/scripts/requirements.txt index 06e0d14..6d003dd 100644 --- a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/scripts/requirements.txt +++ b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/scripts/requirements.txt @@ -1,20 +1,20 @@ -transformers==4.50.2 +transformers==4.52.2 peft==0.14.0 accelerate==1.3.0 bitsandbytes==0.45.1 datasets==3.2.0 evaluate==0.4.3 huggingface_hub[hf_transfer]==0.33.4 -mlflow +mlflow==2.22.2 safetensors>=0.5.2 -sagemaker==2.244.0 +sagemaker==2.252.0 sagemaker-mlflow==0.1.0 sentencepiece==0.2.0 scikit-learn==1.6.1 tokenizers>=0.21.0 -trl==0.9.6 -psutil -py7zr -pynvml 
-xtarfile -rouge-score \ No newline at end of file +trl==0.18.0 +psutil==7.1.0 +py7zr==1.0.0 +pynvml==13.0.1 +xtarfile==0.2.1 +rouge-score==0.1.2 \ No newline at end of file diff --git a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/scripts/train.py b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/scripts/train.py index c29da1b..2ad6b3e 100644 --- a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/scripts/train.py +++ b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/scripts/train.py @@ -1,27 +1,21 @@ import os import datetime -from typing import Dict, Optional, Tuple +from typing import Dict, Optional from dataclasses import dataclass, field -from functools import partial -from itertools import chain from accelerate import Accelerator -import bitsandbytes as bnb from huggingface_hub import snapshot_download from datasets import load_dataset import mlflow -from mlflow.models import infer_signature import torch -import transformers from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, set_seed -from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model, prepare_model_for_kbit_training +from peft import AutoPeftModelForCausalLM, LoraConfig, prepare_model_for_kbit_training -from trl.commands.cli_utils import TrlParser -from trl import SFTTrainer +from trl import SFTTrainer, TrlParser from sagemaker.s3 import S3Downloader import subprocess @@ -118,35 +112,6 @@ def download_model(model_name): print(f"Model {model_name} downloaded under {destination}") -def group_texts(examples, block_size=2048): - """ - Groups a list of tokenized text examples into fixed-size blocks for language model training. - - Args: - examples (dict): A dictionary where keys are feature names (e.g., "input_ids") and values - are lists of tokenized sequences. - block_size (int, optional): The size of each chunk. Defaults to 2048. - - Returns: - dict: A dictionary containing the grouped chunks for each feature. An additional "labels" key - is included, which is a copy of the "input_ids" key. - """ - # Concatenate all texts. - concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} - total_length = len(concatenated_examples[list(examples.keys())[0]]) - # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can - # customize this part to your needs. - if total_length >= block_size: - total_length = (total_length // block_size) * block_size - # Split by chunks of max_len. - result = { - k: [t[i : i + block_size] for i in range(0, total_length, block_size)] - for k, t in concatenated_examples.items() - } - result["labels"] = result["input_ids"].copy() - return result - - def set_custom_env(env_vars: Dict[str, str]) -> None: """ Set custom environment variables. 
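The rewritten `train.py` now imports `TrlParser` from the top-level `trl` package (TRL 0.18) and receives its configuration as a YAML file, passed later in this diff as the ModelTrainer hyperparameter `--config /opt/ml/input/data/config/args.yaml`. A minimal sketch of how the script's entry point presumably wires this up; `ScriptArguments` below is only a stand-in for the dataclass defined at the top of the real script:

```python
# Sketch, not the committed entry point: TrlParser.parse_args_and_config() reads the
# --config YAML supplied by the ModelTrainer and fills both dataclasses from it.
from dataclasses import dataclass
from transformers import TrainingArguments
from trl import TrlParser


@dataclass
class ScriptArguments:  # stand-in for the dataclass defined in train.py
    model_id: str = "Qwen/Qwen3-4B-Instruct-2507"
    train_dataset_path: str = "/opt/ml/input/data/train"
    test_dataset_path: str = "/opt/ml/input/data/test"
    mlflow_uri: str = ""
    mlflow_experiment_name: str = ""


if __name__ == "__main__":
    parser = TrlParser((ScriptArguments, TrainingArguments))
    script_args, training_args = parser.parse_args_and_config()
```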
@@ -176,10 +141,35 @@ def set_custom_env(env_vars: Dict[str, str]) -> None: for key, value in env_vars.items(): print(f" {key}: {value}") +def load_data(training_data_location, test_data_location): + # Load datasets + train_ds = load_dataset( + "json", + data_files=os.path.join(training_data_location, "dataset.json"), + split="train" + ) + + if script_args.test_dataset_path: + test_ds = load_dataset( + "json", + data_files=os.path.join(test_data_location, "dataset.json"), + split="train" + ) + else: + test_ds = None -def train(script_args, training_args, train_ds, test_ds): + return train_ds, test_ds + +def train(script_args, training_args): set_seed(training_args.seed) + mlflow_enabled = ( + script_args.mlflow_uri is not None + and script_args.mlflow_experiment_name is not None + and script_args.mlflow_uri != "" + and script_args.mlflow_experiment_name != "" + ) + accelerator = Accelerator() if script_args.token is not None: @@ -202,19 +192,21 @@ def train(script_args, training_args, train_ds, test_ds): # Set Tokenizer pad Token tokenizer.pad_token = tokenizer.eos_token - # tokenize and chunk dataset - lm_train_dataset = train_ds.map( - lambda sample: tokenizer(sample["text"]), remove_columns=list(train_ds.features) - ) + # # tokenize and chunk dataset + # lm_train_dataset = train_ds.map( + # lambda sample: tokenizer(sample["text"]), remove_columns=list(train_ds.features) + # ) - if test_ds is not None: - lm_test_dataset = test_ds.map( - lambda sample: tokenizer(sample["text"]), remove_columns=list(train_ds.features) - ) + # if test_ds is not None: + # lm_test_dataset = test_ds.map( + # lambda sample: tokenizer(sample["text"]), remove_columns=list(train_ds.features) + # ) - print(f"Total number of test samples: {len(lm_test_dataset)}") - else: - lm_test_dataset = None + # print(f"Total number of test samples: {len(lm_test_dataset)}") + # else: + # lm_test_dataset = None + + train_ds, test_ds = load_data(script_args.train_dataset_path, script_args.test_dataset_path) accelerator.wait_for_everyone() @@ -276,7 +268,7 @@ def train(script_args, training_args, train_ds, test_ds): ) if training_args.fsdp is None and training_args.fsdp_config is None: - model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing) + model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing) if training_args.gradient_checkpointing: model.gradient_checkpointing_enable() @@ -284,7 +276,7 @@ def train(script_args, training_args, train_ds, test_ds): if training_args.gradient_checkpointing: model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False}) - config = LoraConfig( + peft_config = LoraConfig( r=script_args.lora_r, lora_alpha=script_args.lora_alpha, target_modules="all-linear", @@ -293,47 +285,42 @@ def train(script_args, training_args, train_ds, test_ds): task_type="CAUSAL_LM" ) - model = get_peft_model(model, config) - print(f"max_seq_length: {script_args.max_seq_length}") + + print("Disabling checkpointing and setting up logging") + training_args.save_strategy="no" + training_args.logging_strategy="steps" + training_args.logging_steps=1 + training_args.log_on_each_node=False + training_args.ddp_find_unused_parameters=False trainer = SFTTrainer( model=model, - train_dataset=lm_train_dataset, - eval_dataset=lm_test_dataset if lm_test_dataset is not None else None, - max_seq_length=script_args.max_seq_length, - args=transformers.TrainingArguments( - 
per_device_train_batch_size=training_args.per_device_train_batch_size, - per_device_eval_batch_size=training_args.per_device_eval_batch_size, - gradient_accumulation_steps=training_args.gradient_accumulation_steps, - logging_strategy="steps", - logging_steps=1, - log_on_each_node=False, - num_train_epochs=training_args.num_train_epochs, - learning_rate=training_args.learning_rate, - bf16=training_args.bf16, - fp16=training_args.fp16, - ddp_find_unused_parameters=False, - save_strategy="no", - output_dir="outputs", - **trainer_configs - ), - callbacks=None, - data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False), + args=training_args, + train_dataset=train_ds, + eval_dataset=test_ds if test_ds is not None else None, + processing_class=tokenizer, + peft_config=peft_config ) if trainer.accelerator.is_main_process: trainer.model.print_trainable_parameters() - print("MLflow tracking under ", script_args.mlflow_experiment_name) - - train_dataset_mlflow = mlflow.data.from_pandas(train_ds.to_pandas(), name="train_dataset") - mlflow.log_input(train_dataset_mlflow, context="train") + if mlflow_enabled: + print("MLflow tracking under ", script_args.mlflow_experiment_name) + # mlflow.start_run(run_id=os.environ.get["MLFLOW_RUN_ID"]) + # mlflow.start_run(run_name=os.environ.get["MLFLOW_RUN_NAME"], nested=True) + mlflow.start_run(run_name=os.environ.get("MLFLOW_RUN_NAME", None)) + train_dataset_mlflow = mlflow.data.from_pandas(train_ds.to_pandas(), name="train_dataset") + mlflow.log_input(train_dataset_mlflow, context="train") - test_dataset_mlflow = mlflow.data.from_pandas(test_ds.to_pandas(), name="test_dataset") - mlflow.log_input(test_dataset_mlflow, context="test") + if test_ds is not None: + test_dataset_mlflow = mlflow.data.from_pandas(test_ds.to_pandas(), name="test_dataset") + mlflow.log_input(test_dataset_mlflow, context="test") - trainer.train() + trainer.train() + else: + trainer.train() if trainer.is_fsdp_enabled: trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT") @@ -344,7 +331,7 @@ def train(script_args, training_args, train_ds, test_ds): # merge adapter weights with base model and save # save int 4 model - trainer.model.save_pretrained(output_dir, safe_serialization=False) + trainer.save_model(output_dir) if accelerator.is_main_process: # clear memory @@ -370,35 +357,16 @@ def train(script_args, training_args, train_ds, test_ds): print("saving merged model...") model.save_pretrained( - training_args.output_dir, safe_serialization=True, max_shard_size="2GB" + training_args.output_dir, + safe_serialization=True ) else: print(f"merge adapter weights: {script_args.merge_weights}") - trainer.model.save_pretrained(training_args.output_dir, safe_serialization=True) + trainer.save_model(training_args.output_dir) if accelerator.is_main_process: tokenizer.save_pretrained(training_args.output_dir) - # if mlflow_enabled: - # # Model registration in MLFlow - # print("MLflow model registration under ", script_args.mlflow_experiment_name) - - # params = { - # "top_p": 0.9, - # "temperature": 0.2, - # "max_new_tokens": 2048, - # } - # signature = infer_signature("inputs", "generated_text", params=params) - - # mlflow.transformers.log_model( - # transformers_model={"model": model, "tokenizer": tokenizer}, - # signature=signature, - # artifact_path="model", # This is a relative path to save model files within MLflow run - # model_config=params, - # task="text-generation", - # registered_model_name=f"model-{os.environ.get('MLFLOW_RUN_NAME', 
'').split('Fine-tuning-')[-1]}" - # ) - accelerator.wait_for_everyone() @@ -414,30 +382,19 @@ def train(script_args, training_args, train_ds, test_ds): set_custom_env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) - mlflow.set_tracking_uri(script_args.mlflow_uri) - mlflow.set_experiment(script_args.mlflow_experiment_name) - mlflow_run_id = os.environ.get("MLFLOW_RUN_ID") - with mlflow.start_run(run_id=mlflow_run_id): - with mlflow.start_run(run_name="Finetuning", nested=True) as training_run: - - mlflow.enable_system_metrics_logging() - mlflow.autolog() - - # Load datasets - train_ds = load_dataset( - "json", - data_files=os.path.join(script_args.train_dataset_path, "dataset.json"), - split="train" - ) - - if script_args.test_dataset_path: - test_ds = load_dataset( - "json", - data_files=os.path.join(script_args.test_dataset_path, "dataset.json"), - split="train" - ) - else: - test_ds = None - - # launch training - train(script_args, training_args, train_ds, test_ds) + if script_args.mlflow_uri is not None and script_args.mlflow_experiment_name is not None and \ + script_args.mlflow_uri != "" and script_args.mlflow_experiment_name != "": + print("mlflow init") + mlflow.enable_system_metrics_logging() + mlflow.autolog() + mlflow.set_tracking_uri(script_args.mlflow_uri) + mlflow.set_experiment(script_args.mlflow_experiment_name) + + current_datetime = datetime.datetime.now() + formatted_datetime = current_datetime.strftime("%Y-%m-%d-%H-%M") + model_name = script_args.model_id.split("/")[-1] + set_custom_env({"MLFLOW_RUN_NAME": f"Fine-tuning-{model_name}-{formatted_datetime}"}) + set_custom_env({"MLFLOW_EXPERIMENT_NAME": script_args.mlflow_experiment_name}) + + # launch training + train(script_args, training_args) diff --git a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/__init__.py b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/deploy_step.py b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/deploy_step.py new file mode 100644 index 0000000..19d3593 --- /dev/null +++ b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/deploy_step.py @@ -0,0 +1,139 @@ +# ### 6. 
Deploy Step +# This step deploys the model for evaluation + +import sagemaker +import boto3 +import mlflow +from sagemaker import get_execution_role +from sagemaker import Model +from sagemaker.model_monitor import DataCaptureConfig +import time +from sagemaker.workflow.function_step import step +from .pipeline_utils import PIPELINE_INSTANCE_TYPE + + +@step( + name="ModelDeploy", + instance_type=PIPELINE_INSTANCE_TYPE, + display_name="Model Deploy", + keep_alive_period_in_seconds=900 +) +def deploy( + tracking_server_arn: str, + model_artifacts_s3_path: str, + # output_path: str, + model_id: str, + experiment_name: str, + run_id: str, +): + + mlflow.set_tracking_uri(tracking_server_arn) + mlflow.set_experiment(experiment_name) + + with mlflow.start_run(run_id=run_id): + with mlflow.start_run(run_name="DeployStep", nested=True) as deploy_run: + deployment_start_time = time.time() + + sagemaker_session = sagemaker.Session() + instance_count = 1 + instance_type = "ml.g5.2xlarge" + health_check_timeout = 3600 + model_data_download_timeout = 3600 + + model_config = { + 'HF_MODEL_ID': "/opt/ml/model", # path to where sagemaker stores the model + 'OPTION_TRUST_REMOTE_CODE': 'true', + 'OPTION_ROLLING_BATCH': "vllm", + 'OPTION_DTYPE': 'bf16', + 'OPTION_QUANTIZE': 'fp8', + 'OPTION_TENSOR_PARALLEL_DEGREE': 'max', + 'OPTION_MAX_ROLLING_BATCH_SIZE': '32', + 'OPTION_MODEL_LOADING_TIMEOUT': '3600', + 'OPTION_MAX_MODEL_LEN': '4096' + } + + # Get the name for the endpoint + endpoint_name = f"{model_id.split('/')[-1].replace('.', '-').replace('_','-')}-sft-djl" + + mlflow.log_params({ + "model_id": model_id, + "instance_type": instance_type, + "instance_count": instance_count, + "endpoint_name": endpoint_name, + "health_check_timeout": health_check_timeout, + "model_data_download_timeout": model_data_download_timeout + }) + mlflow.log_params({"model_config_" + k: v for k, v in model_config.items()}) + + # Delete existing endpoint if it exists + print(f"Checking for existing endpoint: {endpoint_name}") + sm_client = boto3.client('sagemaker') + try: + sm_client.describe_endpoint(EndpointName=endpoint_name) + print(f"Endpoint {endpoint_name} exists, deleting it before deployment") + sm_client.delete_endpoint(EndpointName=endpoint_name) + + print(f"Deleting endpoint config {endpoint_name}") + sm_client.delete_endpoint_config(EndpointConfigName=endpoint_name) + + # Wait for endpoint to be fully deleted + print("Waiting for endpoint to be fully deleted...") + wait_seconds = 10 + total_wait_time = 0 + max_wait_time = 300 # 5 minutes maximum wait + endpoint_deleted = False + + while total_wait_time < max_wait_time and not endpoint_deleted: + try: + sm_client.describe_endpoint(EndpointName=endpoint_name) + print(f"Endpoint still exists, waiting {wait_seconds} seconds...") + time.sleep(wait_seconds) + total_wait_time += wait_seconds + except sm_client.exceptions.ClientError: + print(f"Endpoint {endpoint_name} successfully deleted") + endpoint_deleted = True + + if not endpoint_deleted: + print(f"Warning: Endpoint still exists after {max_wait_time} seconds") + + except sm_client.exceptions.ClientError: + print(f"Endpoint {endpoint_name} does not exist, proceeding with deployment") + + # Continue with model deployment + region = sagemaker_session.boto_session.region_name + inference_image_uri = f"763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.33.0-lmi15.0.0-cu128" + mlflow.log_param("inference_image_uri", inference_image_uri) + + model_data = model_artifacts_s3_path + + # Create model only once + model = 
Model( + image_uri=inference_image_uri, + model_data=model_data, + role=get_execution_role(), + env=model_config + ) + + print(f"deploying endpoint: {endpoint_name}") + + predictor = model.deploy( + endpoint_name=endpoint_name, + initial_instance_count=instance_count, + instance_type=instance_type, + container_startup_health_check_timeout=health_check_timeout, + model_data_download_timeout=model_data_download_timeout, + ) + + # Log deployment metrics + deployment_time = time.time() - deployment_start_time + mlflow.log_param("deployment_time_seconds", deployment_time) + mlflow.log_param("deployment_success", 1) + + # Log tags + mlflow.set_tags({ + "endpoint_status": "deployed", + "deployment_type": "sagemaker", + "framework": "djl-lmi" + }) + + return endpoint_name \ No newline at end of file diff --git a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/evaluation_mlflow.py b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/evaluation_mlflow.py deleted file mode 100644 index bcb70a7..0000000 --- a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/evaluation_mlflow.py +++ /dev/null @@ -1,61 +0,0 @@ -import boto3 -import sagemaker -from sagemaker.s3_utils import parse_s3_url -import mlflow -import tempfile -from pathlib import Path -import pandas as pd -import json -from dataclasses import dataclass -from typing import Tuple, Optional -import json - - -def evaluation(model, preprocess_step_ret, finetune_ret, mlflow_arn, experiment_name, run_id): - mlflow.set_tracking_uri(mlflow_arn) - mlflow.set_experiment(experiment_name) - - print(preprocess_step_ret['run_id']) - - with mlflow.start_run(run_id=preprocess_step_ret['run_id']) as run: - s3 = boto3.client("s3") - sess = sagemaker.Session() - - dataset_info = mlflow.get_run(preprocess_step_ret['run_id']).inputs.dataset_inputs[1].dataset - - print(dataset_info) - print(f"Dataset name: {dataset_info.name}") - print(f"Dataset digest: {dataset_info.digest}") - print(f"Dataset profile: {dataset_info.profile}") - print(f"Dataset schema: {dataset_info.schema}") - - dataset_source = mlflow.data.get_source(dataset_info) - ds = dataset_source.load() - # get the bucket name using full s3 poth - - eval_data=pd.read_json(ds, orient='records', lines=True) - - data = [] - for index, row in eval_data.iterrows(): - for message in row['messages']: - if message["role"] == "user": - question = message["content"] - elif message["role"] == "assistant": - answer = message["content"] - data.append({"question": question, "answer": answer}) - - df = pd.DataFrame(data, columns=["question", "answer"]) - print(df.head()) - - - logged_model = f"runs:/{preprocess_step_ret['run_id']}/model" - loaded_model = mlflow.pyfunc.load_model(model_uri=logged_model) - results = mlflow.evaluate( - model=loaded_model, - data=df, - targets="answer", - model_type="question-answering", - evaluator_config={"col_mapping": {"inputs": "question"}}, - ) - print(results.metrics) - return "done" \ No newline at end of file diff --git a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/finetune_llama3b_hf.py b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/finetune_llama3b_hf.py deleted file mode 100644 index 9a7b394..0000000 --- a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/finetune_llama3b_hf.py +++ /dev/null @@ -1,96 +0,0 @@ -from steps.utils import endpoint_exists -from sagemaker.jumpstart.estimator import JumpStartEstimator -from sagemaker.huggingface import 
HuggingFace -from huggingface_hub import HfFolder -import mlflow -import time -import json -import boto3 - -def finetune_llama3b(preprocess_step_ret, train_config, lora_config, role, mlflow_arn, experiment_name,run_name, *args): - - mlflow.set_tracking_uri(mlflow_arn) - mlflow.set_experiment(experiment_name) - - with mlflow.start_run(run_id=preprocess_step_ret['run_id']) as run: - - model_id = train_config["model_id"] - endpoint_name = train_config["endpoint_name"] - instance_type = train_config["finetune_instance_type"] - num_instances = train_config["finetune_num_instances"] - epoch = train_config["epoch"] - per_device_train_batch_size = train_config["per_device_train_batch_size"] - - lora_config = json.loads(lora_config) - - lora_r = lora_config["lora_r"] - lora_alpha = lora_config["lora_alpha"] - lora_dropout = lora_config["lora_dropout"] - - train_data_path = preprocess_step_ret["training_input_path"] - - training_job_name = f'huggingface-qlora-{train_config["epoch"]}-{lora_config["lora_r"]}-{time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())}' - - hyperparameters = { - 'model_id': model_id, # pre-trained model - 'dataset_path': '/opt/ml/input/data/training', # path where sagemaker will save training dataset - 'epochs': epoch, # number of training epochs - 'per_device_train_batch_size': per_device_train_batch_size, # batch size for training - 'lr': 2e-4, # learning rate used during training - 'hf_token': "", # huggingface token to access llama 2 - 'merge_weights': True, # whether to merge LoRA into the model - 'lora_r': lora_r, - 'lora_alpha': lora_alpha, - 'lora_dropout': lora_dropout, - 'mlflow_arn': mlflow_arn, - 'experiment_name': experiment_name, - 'run_id': preprocess_step_ret['run_id'] - } - - # Add SageMaker environment variables to help with debugging - environment = { - "HUGGINGFACE_HUB_CACHE": "/tmp/.cache", - "NCCL_DEBUG": "INFO", # Helps debug NCCL issues - "NCCL_P2P_DISABLE": "1", # Can help with some networking issues - "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:512" # Helps with memory management - } - - if endpoint_exists(endpoint_name): - print("Endpoint already exists") - training_job_name = None - else: - # Define distributed training configuration - distribution = { - 'torch_distributed': { - 'enabled': True - } - } - - huggingface_estimator = HuggingFace( - entry_point='llama3_fine_tuning.py', # train script - source_dir='scripts', # directory which includes all the files needed for training - instance_type=instance_type, # instances type used for the training job - instance_count=num_instances, # the number of instances used for training - base_job_name=training_job_name, # the name of the training job - role=role, # IAM role used in training job to access AWS resources - volume_size=300, # the size of the EBS volume in GB - py_version='py311', # the python version used in the training job - hyperparameters=hyperparameters, # the hyperparameters passed to the training job - environment=environment, - distribution=distribution, # Added distributed training config - image_uri=f'763104351884.dkr.ecr.{boto3.session.Session().region_name}.amazonaws.com/pytorch-training:2.5.1-gpu-py311-cu124-ubuntu22.04-sagemaker', - metric_definitions=[ - {'Name': 'huggingface-textgeneration:loss', 'Regex': "'loss':\s*([0-9.]+)"}, - {'Name': 'huggingface-textgeneration:epoch', 'Regex': "'epoch':\s*([0-9.]+)"}, - {'Name': 'huggingface-textgeneration:train_loss', 'Regex': "'train_loss':\s*([0-9.]+)"}, - ] - ) - - data = {'training': train_data_path} - - # starting the train job 
with our uploaded datasets as input - huggingface_estimator.fit(data, wait=True) - - training_job_name = huggingface_estimator.latest_training_job.name - - return {"training_job_name": training_job_name, "run_id": preprocess_step_ret['run_id']} \ No newline at end of file diff --git a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/finetune_step.py b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/finetune_step.py new file mode 100644 index 0000000..7a23fa1 --- /dev/null +++ b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/finetune_step.py @@ -0,0 +1,281 @@ +# **Fine-tuning Step** + +# This is where the actual model adaptation occurs. The step takes the preprocessed data and applies it to fine-tune the base LLM (in this case, a Qwen model). It incorporates the LoRA technique for efficient adaptation. + +import sagemaker +import boto3 +import mlflow +import yaml +import json +import time +import datetime +import os +import traceback +import tempfile +from pathlib import Path +from sagemaker.pytorch import PyTorch +from sagemaker.workflow.function_step import step +from .pipeline_utils import PIPELINE_INSTANCE_TYPE + +from sagemaker import image_uris +from sagemaker.modules.configs import Compute, InputData, OutputDataConfig, SourceCode, StoppingCondition +from sagemaker.modules.distributed import Torchrun +from sagemaker.modules.train import ModelTrainer +from sagemaker.modules.configs import InputData + +@step( + name="ModelFineTuning", + instance_type=PIPELINE_INSTANCE_TYPE, + display_name="Model Fine Tuning", + keep_alive_period_in_seconds=900, + dependencies="./scripts/requirements.txt" +) +def train( + tracking_server_arn: str, + train_dataset_s3_path: str, + test_dataset_s3_path: str, + train_config_s3_path: str, + role: str, + experiment_name: str, + model_id: str, + run_id: str, +): + + # Initialize variables and tracking + start_time = time.time() + model_name = model_id.split("/")[-1] if "/" in model_id else model_id + training_job_name = None + + mlflow.set_tracking_uri(tracking_server_arn) + mlflow.set_experiment(experiment_name) + + try: + with mlflow.start_run(run_id=run_id): + with mlflow.start_run(run_name="FinetuningStep", nested=True) as training_run: + mlflow.autolog() + training_run_id = training_run.info.run_id + # Enable detailed tracking + mlflow.set_tag("component", "model_fine_tuning") + mlflow.log_param("model_id", model_id) + mlflow.log_param("train_dataset", train_dataset_s3_path) + mlflow.log_param("test_dataset", test_dataset_s3_path) + mlflow.log_param("training_start_time", datetime.datetime.now().isoformat()) + + # Download and parse the training config YAML to log hyperparameters + with tempfile.NamedTemporaryFile(delete=False) as tmp: + s3_client = boto3.client("s3") + + # Parse S3 path + config_parts = train_config_s3_path.replace("s3://", "").split("/", 1) + bucket = config_parts[0] + key = config_parts[1] + + # Download config file + try: + s3_client.download_file(bucket, key, tmp.name) + # Parse the YAML config + with open(tmp.name, 'r') as f: + config = yaml.safe_load(f) + + # Log all hyperparameters from config + print("Logging hyperparameters to MLflow:") + for param_name, param_value in config.items(): + # Skip complex objects that can't be logged as parameters + if isinstance(param_value, (str, int, float, bool)): + print(f" {param_name}: {param_value}") + mlflow.log_param(param_name, param_value) + elif param_name == "fsdp_config" and isinstance(param_value, dict): + # Log 
nested config as JSON + mlflow.log_param("fsdp_config_json", json.dumps(param_value)) + + # Log file as artifact for reference + mlflow.log_artifact(tmp.name, "training_config") + + except Exception as e: + print(f"Error parsing config file: {e}") + + finally: + # Clean up temp file + if os.path.exists(tmp.name): + os.remove(tmp.name) + + # Define metric definitions for more detailed CloudWatch metrics + metric_definitions = [ + {'Name': 'loss', 'Regex': "'loss':\\s*([0-9.]+)"}, + {'Name': 'epoch', 'Regex': "'epoch':\\s*([0-9.]+)"}, + {'Name': 'train_loss', 'Regex': "'train_loss':\\s*([0-9.]+)"}, + {'Name': 'lr', 'Regex': "'learning_rate':\\s*([0-9.e-]+)"}, + {'Name': 'step', 'Regex': "'step':\\s*([0-9.]+)"}, + {'Name': 'samples_per_second', 'Regex': "'train_samples_per_second':\\s*([0-9.]+)"}, + ] + + # Log the metric definitions we're using + mlflow.log_param("tracked_metrics", [m['Name'] for m in metric_definitions]) + + sagemaker_session = sagemaker.Session() + image_uri = sagemaker.image_uris.retrieve( + framework="pytorch", + version="2.6.0", + instance_type="ml.g5.2xlarge", + region=sagemaker_session.boto_session.region_name, + image_scope="training" + ) + + source_code = SourceCode( + source_dir="./scripts", + requirements="requirements.txt", + entry_script="train.py", + ) + + # Define the compute + compute_configs = Compute( + instance_type="ml.g5.2xlarge", + instance_count=1, + keep_alive_period_in_seconds=3600, + volume_size_in_gb=50 + ) + + # Launch the training job + job_name = f"qwen3-finetune-{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}" + model_artifacts_s3_path = f"s3://{sagemaker_session.default_bucket()}/qwen3-finetune/" + + # Define the ModelTrainer + model_trainer = ModelTrainer( + training_image=image_uri, + source_code=source_code, + base_job_name=job_name, + compute=compute_configs, + distributed=Torchrun(), + stopping_condition=StoppingCondition( + max_runtime_in_seconds=7200 + ), + hyperparameters={ + "config": "/opt/ml/input/data/config/args.yaml" # path to TRL config which was uploaded to s3 + }, + output_data_config=OutputDataConfig( + s3_output_path=model_artifacts_s3_path + ), + environment={ + "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True" + # "MLFLOW_RUN_ID": run_id + } + ) + + # Pass the input data + train_input = InputData( + channel_name="train", + data_source=train_dataset_s3_path, # S3 path where training data is stored + ) + + test_input = InputData( + channel_name="test", + data_source=test_dataset_s3_path, # S3 path where training data is stored + ) + + config_input = InputData( + channel_name="config", + data_source=train_config_s3_path, # S3 path where training data is stored + ) + + # Check input channels configured + data = [train_input, test_input, config_input] + mlflow.log_param("datasets", data) + + print(f"Data for Training Run: {data}") + # Log training job information + mlflow.log_param("job_name", job_name) + mlflow.log_param("instance_type", "ml.g5.2xlarge") + + # Start the training job + model_trainer.train(input_data_config=data, wait=True) + + # Get information about the completed training job + latest_training_job_name = model_trainer._latest_training_job.training_job_name + print(f"Latest Job Name: {latest_training_job_name}") + + sagemaker_client = boto3.client('sagemaker') + + # Describe the training job + response = sagemaker_client.describe_training_job(TrainingJobName=latest_training_job_name) + + # Extract the model artifacts S3 path + model_artifacts_s3_path = response['ModelArtifacts']['S3ModelArtifacts'] 
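The fine-tuning step above splits `train_config_s3_path` by hand (`replace("s3://", "").split("/", 1)`) before calling `download_file`. The SageMaker SDK already ships a helper for this, the same one the deleted `evaluation_mlflow.py` imported; a small sketch with a placeholder URI, not a path from this workshop:

```python
# Equivalent split of an S3 URI into bucket and key using the SDK helper
# instead of string surgery; the URI below is illustrative only.
from sagemaker.s3_utils import parse_s3_url

bucket, key = parse_s3_url("s3://example-bucket/configs/args.yaml")
print(bucket)  # example-bucket
print(key)     # configs/args.yaml
```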
+ + # Extract the output path (this is the general output location) + output_path = response['OutputDataConfig']['S3OutputPath'] + + # Get training time metrics + training_start_time = response.get('TrainingStartTime') + training_end_time = response.get('TrainingEndTime') + billable_time = response.get('BillableTimeInSeconds', 0) + + # Calculate duration + total_training_time = 0 + if training_start_time and training_end_time: + total_training_time = (training_end_time - training_start_time).total_seconds() + + # Log job results and metrics to MLflow + # Log basic job info + mlflow.log_param("training_job_name", latest_training_job_name) + mlflow.log_param("model_artifacts_path", model_artifacts_s3_path) + mlflow.log_param("output_path", output_path) + + # Log performance metrics + mlflow.log_param("billable_time_seconds", billable_time) + mlflow.log_param("total_training_time_seconds", total_training_time) + + # Log training job status + mlflow.log_param("training_job_status", response.get('TrainingJobStatus')) + + # Log any secondary status + if 'SecondaryStatus' in response: + mlflow.log_param("secondary_status", response.get('SecondaryStatus')) + + # Log any failure reason + if 'FailureReason' in response: + mlflow.log_param("failure_reason", response.get('FailureReason')) + + # Get CloudWatch logs for the training job + logs_client = boto3.client('logs') + log_group = "/aws/sagemaker/TrainingJobs" + log_stream = latest_training_job_name + + try: + # Get the last 1000 log events + log_events = logs_client.get_log_events( + logGroupName=log_group, + logStreamName=log_stream, + limit=1000 + ) + + # Extract and save logs + log_output = "\n".join([event['message'] for event in log_events['events']]) + + # Save logs to file and log as artifact + with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.txt') as tmp: + tmp.write(log_output) + log_file_path = tmp.name + + mlflow.log_artifact(log_file_path, "training_logs") + os.remove(log_file_path) + + except Exception as e: + print(f"Error fetching training logs: {e}") + + # Log total execution time of this step + step_duration = time.time() - start_time + mlflow.log_metric("step_execution_time_seconds", step_duration) + + # Log model metadata + mlflow.set_tag("model_path", model_artifacts_s3_path) + mlflow.set_tag("training_completed_at", datetime.datetime.now().isoformat()) + + print(f"Model artifacts S3 path: {model_artifacts_s3_path}") + + except Exception as e: + error_msg = f"Error in model fine-tuning: {str(e)}\n{traceback.format_exc()}" + print(error_msg) + + raise RuntimeError(f"Fine-tuning failed: {str(e)}") + + return run_id, training_run_id, model_artifacts_s3_path #, output_path \ No newline at end of file diff --git a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/model_registration_step.py b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/model_registration_step.py new file mode 100644 index 0000000..d24644d --- /dev/null +++ b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/model_registration_step.py @@ -0,0 +1,298 @@ +# ### 7. 
Model Registration Step +# This step registers the fine-tuned model in MLflow model registry and SageMaker AI model registry + +from sagemaker.workflow.function_step import step +from .pipeline_utils import PIPELINE_INSTANCE_TYPE + + +@step( + name="ModelRegistration", + instance_type=PIPELINE_INSTANCE_TYPE, + display_name="Model Registration", + keep_alive_period_in_seconds=900 +) +def register_model( + tracking_server_arn: str, + experiment_name: str, + run_id: str, + model_artifacts_s3_path: str, + model_id: str, + model_name: str, + endpoint_name: str, + evaluation_score: float, + pipeline_name: str, + model_description: str +): + import json + import mlflow + import boto3 + import os + import tempfile + import time + from datetime import datetime + + print(f"Registering model: {model_name}") + + # Set up MLflow tracking + mlflow.set_tracking_uri(tracking_server_arn) + mlflow.set_experiment(experiment_name) + + # Connect to MLflow with the specific run + with mlflow.start_run(run_id=run_id): + # Create model metadata + tags = { + "model_id": model_id, + "base_model": model_id.split('/')[-1], + "task": "medical_qa", + "framework": "pytorch", + "endpoint_name": endpoint_name, + "model_artifacts_s3_path": model_artifacts_s3_path, + "deployment_timestamp": datetime.now().isoformat(), + "description": model_description, + "registered_by": pipeline_name + } + + # Log model info as parameters + mlflow.log_param("registered_model_name", model_name) + mlflow.log_param("model_artifacts_path", model_artifacts_s3_path) + mlflow.log_param("evaluation_score", evaluation_score) + mlflow.log_param("endpoint_name", endpoint_name) + mlflow.log_param("registration_timestamp", datetime.now().isoformat()) + + # Log endpoint information as an artifact + model_info = { + "model_name": model_name, + "model_id": model_id, + "endpoint_name": endpoint_name, + "model_artifacts_s3_path": model_artifacts_s3_path, + "evaluation_score": float(evaluation_score), + "registration_timestamp": datetime.now().isoformat() + } + + with open("/tmp/model_info.json", "w") as f: + json.dump(model_info, f, indent=2) + mlflow.log_artifact("/tmp/model_info.json") + + # Create model card + model_card = f""" + # {model_name} + + ## Model Information + - **Base Model**: {model_id} + - **Task**: Medical Question Answering + - **Evaluation Score**: {evaluation_score:.4f} + - **Endpoint**: {endpoint_name} + + ## Description + {model_description} + + ## Registration Details + - Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} + - Pipeline: {pipeline_name} + """ + + with open("/tmp/model_card.md", "w") as f: + f.write(model_card) + mlflow.log_artifact("/tmp/model_card.md") + + # PART 1: REGISTER WITH MLFLOW MODEL REGISTRY + mlflow_version = None + try: + client = mlflow.tracking.MlflowClient() + + # Check if model exists and create if it doesn't + try: + client.get_registered_model(model_name) + print(f"Model {model_name} already exists in MLflow registry") + except mlflow.exceptions.MlflowException: + client.create_registered_model( + name=model_name, + description=f"Fine-tuned medical LLM based on {model_id}" + ) + print(f"Created new registered model: {model_name}") + + # Create empty model directory with artifacts + with tempfile.TemporaryDirectory() as tmp_dir: + # Create a minimal model file to log + os.makedirs(os.path.join(tmp_dir, "model"), exist_ok=True) + + # Copy model info and card to directory + with open(os.path.join(tmp_dir, "model", "model_info.json"), "w") as f: + json.dump(model_info, f, indent=2) + + with 
open(os.path.join(tmp_dir, "model", "model_card.md"), "w") as f: + f.write(model_card) + + # Create a model reference file pointing to the S3 artifacts + model_ref = { + "artifact_path": model_artifacts_s3_path, + "flavors": { + "pytorch": { + "model_data": model_artifacts_s3_path, + "pytorch_version": "2.0+" + } + }, + "run_id": run_id, + "model_class": "LLM", + "model_format": "PyTorch" + } + + with open(os.path.join(tmp_dir, "model", "MLmodel"), "w") as f: + json.dump(model_ref, f, indent=2) + + # Log artifacts directory as model + mlflow.log_artifacts(tmp_dir, artifact_path="") + + # Now register the model - try both methods + try: + # Method 1: Use direct registration with source as run URI + model_uri = f"runs:/{run_id}/model" + model_details = mlflow.register_model( + model_uri=model_uri, + name=model_name, + tags=tags + ) + mlflow_version = model_details.version + + except Exception as e1: + print(f"Method 1 registration failed: {str(e1)}") + + try: + # Method 2: Create version with client API + model_version = client.create_model_version( + name=model_name, + source=f"runs:/{run_id}/model", # Use run URI instead of direct S3 + run_id=run_id, + description=f"Fine-tuned LLM deployed at endpoint: {endpoint_name}" + ) + mlflow_version = model_version.version + + # Wait for model registration to complete + for _ in range(10): # Try for up to ~50 seconds + version_details = client.get_model_version(model_name, model_version.version) + if version_details.status == "READY": + break + time.sleep(5) + + # Add tags to the registered model version + for key, value in tags.items(): + client.set_model_version_tag(model_name, model_version.version, key, value) + except Exception as e2: + print(f"Method 2 registration failed: {str(e2)}") + mlflow_version = "unknown" + + if mlflow_version and mlflow_version != "unknown": + # Transition model to Production/Staging based on evaluation score + if evaluation_score >= 0.3: # Example threshold + client.transition_model_version_stage( + name=model_name, + version=mlflow_version, + stage="Production", + archive_existing_versions=True + ) + print(f"Model {model_name} version {mlflow_version} promoted to Production") + else: + client.transition_model_version_stage( + name=model_name, + version=mlflow_version, + stage="Staging", + archive_existing_versions=False + ) + print(f"Model {model_name} version {mlflow_version} added to Staging due to lower evaluation score") + + print(f"Successfully registered model in MLflow: {model_name}, version: {mlflow_version}") + + except Exception as e: + print(f"Error registering model in MLflow: {str(e)}") + mlflow_version = "unknown" + + # PART 2: REGISTER WITH SAGEMAKER MODEL REGISTRY + sm_model_version = "unknown" + try: + sm_client = boto3.client('sagemaker') + + # Create a normalized name for SageMaker resources + sm_model_name = model_name.replace(".", "-").replace("_", "-") + + # Create or update model package group + try: + sm_client.describe_model_package_group(ModelPackageGroupName=sm_model_name) + print(f"SageMaker model package group {sm_model_name} already exists") + except sm_client.exceptions.ClientError: + sm_client.create_model_package_group( + ModelPackageGroupName=sm_model_name, + ModelPackageGroupDescription=f"Fine-tuned LLM model: {model_name}" + ) + print(f"Created SageMaker model package group: {sm_model_name}") + + # Create a model package and register it + try: + # Create model package + response = sm_client.create_model_package( + ModelPackageGroupName=sm_model_name, + 
ModelPackageDescription=model_description, + SourceAlgorithmSpecification={ + 'SourceAlgorithms': [ + { + 'AlgorithmName': 'pytorch-llm', + 'ModelDataUrl': model_artifacts_s3_path + } + ] + }, + ValidationSpecification={ + 'ValidationRole': 'dummy-role', # Required but not used + 'ValidationProfiles': [ + { + 'ProfileName': 'ValidationProfile1', + 'TransformJobDefinition': { + 'TransformInput': { + 'DataSource': { + 'S3DataSource': { + 'S3DataType': 'S3Prefix', + 'S3Uri': 's3://dummy-bucket/dummy-prefix' # Required but not used + } + } + }, + 'TransformOutput': { + 'S3OutputPath': 's3://dummy-bucket/dummy-output' # Required but not used + }, + 'TransformResources': { + 'InstanceType': 'ml.m5.large', # Required but not used + 'InstanceCount': 1 + } + } + } + ] + }, + ModelApprovalStatus='Approved', + MetadataProperties={ + 'GeneratedBy': pipeline_name, + 'Repository': model_id, + 'EvaluationScore': str(evaluation_score) + }, + ModelMetrics={ + 'ModelQuality': { + 'Statistics': { + 'ContentType': 'application/json', + 'S3Uri': f"s3://{model_artifacts_s3_path.split('/', 3)[2]}/{run_id}/artifacts/model_info.json" + } + } + } + ) + + sm_model_version = response['ModelPackageArn'].split('/')[-1] + print(f"Created SageMaker model package: {sm_model_version}") + + except Exception as e_package: + print(f"Error creating model package: {str(e_package)}") + + # Log SageMaker details + mlflow.log_param("sagemaker_model_group", sm_model_name) + mlflow.log_param("sagemaker_model_version", sm_model_version) + + print(f"Successfully integrated with SageMaker model registry") + + except Exception as e: + print(f"Warning: Error in SageMaker model registry integration: {str(e)}") + + return model_name, str(mlflow_version) \ No newline at end of file diff --git a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/pipeline_utils.py b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/pipeline_utils.py new file mode 100644 index 0000000..30609d7 --- /dev/null +++ b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/pipeline_utils.py @@ -0,0 +1,142 @@ +import boto3 +import botocore +import json +import time +from datetime import datetime + + +PIPELINE_INSTANCE_TYPE = "ml.m5.xlarge" + + +SYSTEM_PROMPT = """You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. +Below is an instruction that describes a task, paired with an input that provides further context. +Write a response that appropriately completes the request. 
+Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.""" + + +def endpoint_exists(endpoint_name): + endpoint_exist = False + + client = boto3.client('sagemaker') + response = client.list_endpoints() + endpoints = response["Endpoints"] + + for endpoint in endpoints: + if endpoint_name == endpoint["EndpointName"]: + endpoint_exist = True + break + + return endpoint_exist + + +def create_training_job_name(model_id): + return f"{model_id}-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')[:-3]}" + + +# template dataset to add prompt to each sample +def convert_to_messages(sample, system_prompt=""): + + messages = [ + {"role": "system", "content": SYSTEM_PROMPT}, + {"role": "user", "content": sample["Question"]}, + {"role": "assistant", "content": f"{sample["Complex_CoT"]}\n\n{sample["Response"]}"} + ] + + sample["messages"] = messages + + return sample + + +def invoke_sagemaker_endpoint(payload, endpoint_name): + """ + Invoke a SageMaker endpoint with the given payload. + + Args: + payload (dict): The input data to send to the endpoint + endpoint_name (str): The name of the SageMaker endpoint + + Returns: + dict: The response from the endpoint + """ + sm_client = boto3.client('sagemaker-runtime') + try: + start_time = time.time() + response = sm_client.invoke_endpoint( + EndpointName=endpoint_name, + ContentType='application/json', + Body=json.dumps(payload) + ) + inference_time = time.time() - start_time + + response_body = response['Body'].read().decode('utf-8') + return json.loads(response_body), inference_time + except Exception as e: + print(f"Error invoking endpoint {endpoint_name}: {str(e)}") + return None, -1 + + +def get_or_create_guardrail(): + guardrail_client = boto3.client('bedrock') + guardrail_name = "ExampleMedicalGuardrail" + + guardrail_id = None + guardrail_version = None + + # Try to get the guardrail + response = guardrail_client.list_guardrails() + for guardrail in response.get('guardrails', []): + if guardrail['name'] == guardrail_name: + guardrail_id = guardrail['id'] + response = guardrail_client.get_guardrail( + guardrailIdentifier=guardrail_id + ) + guardrail_version = response["version"] + print(f"Found Guardrail {guardrail_id}:{guardrail_version}") + break + + if not guardrail_id: + try: + guardrail = guardrail_client.create_guardrail( + name="ExampleMedicalGuardrail", + description='Example of a Guardrail for Medical Use Cases', + topicPolicyConfig={ + 'topicsConfig': [{ + 'name': 'Block Pharmaceuticals', + 'definition': 'This model cannot recommend one pharmaceutical over another. Generic prescriptions consistent with medical expertise and clinical diagnoses only.', + 'type': 'DENY', + 'inputAction': 'BLOCK', + 'outputAction': 'BLOCK', + }] + }, + sensitiveInformationPolicyConfig={ + 'piiEntitiesConfig': [ + { + 'type': 'UK_NATIONAL_HEALTH_SERVICE_NUMBER', + 'action': 'BLOCK', + 'inputAction': 'BLOCK', + 'outputAction': 'BLOCK' + }, + ] + }, + contextualGroundingPolicyConfig={ + 'filtersConfig': [ + { + 'type': 'RELEVANCE', + 'threshold': 0.9, + 'action': 'BLOCK', + 'enabled': True + }, + ] + }, + blockedInputMessaging="ExampleMedicalGuardrail has blocked this input.", + blockedOutputsMessaging="ExampleMedicalGuardrail has blocked this output." 
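One detail to watch in `convert_to_messages` above: the assistant turn nests double-quoted keys inside a double-quoted f-string, which only parses on Python 3.12 and later. A version-agnostic sketch of the same helper; as a deliberate tweak it also uses the `system_prompt` argument that `preprocess_step` passes via `fn_kwargs`, instead of the module-level constant:

```python
# Sketch of the message-building helper with single-quoted keys inside the f-string,
# so it parses on Python versions before 3.12 as well; behaviour is otherwise the same.
def convert_to_messages(sample, system_prompt=""):
    sample["messages"] = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": sample["Question"]},
        {"role": "assistant", "content": f"{sample['Complex_CoT']}\n\n{sample['Response']}"},
    ]
    return sample
```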
+ ) + + guardrail_id = guardrail['guardrailId'] + guardrail_version = guardrail['version'] + + print(f"Created new guardrail '{guardrail_id}:{guardrail_version}'") + except botocore.exceptions.ClientError as create_error: + print(f"Error creating guardrail: {create_error}") + + return guardrail_id, guardrail_version \ No newline at end of file diff --git a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/preprocess_llama3.py b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/preprocess_llama3.py deleted file mode 100644 index d6ae3a1..0000000 --- a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/preprocess_llama3.py +++ /dev/null @@ -1,64 +0,0 @@ -# Temporary preprocess step (to be changed with new dataset) -import boto3 -import pandas as pd -from datasets import load_dataset -from datasets import Dataset -from random import randint -import mlflow -import json - - -system_message = """You are Llama, an AI assistant. Your knowledge spans a wide range of topics, allowing you to anser the questions with honesty and truthfulness.""" - -def create_conversation(sample): - if sample["messages"][0]["role"] == "system": - return sample - else: - sample["messages"] = [{"role": "system", "content": system_message}] + sample["messages"] - return sample - -def preprocess(s3_bucket, dataset_name, train_sample, eval_sample, mlflow_arn, experiment_name, run_name): - - mlflow.set_tracking_uri(mlflow_arn) - mlflow.set_experiment(experiment_name) - - - # This is a very simple example, you can add your own data processing code here - dataset = load_dataset(dataset_name) - dataset = dataset.filter(lambda x: x['category'] == 'Open QA') - - columns_to_remove = list(dataset["train"].features) - columns_to_remove.remove("messages") - dataset = dataset.map(create_conversation, remove_columns=columns_to_remove,batched=False) - - dataset["train"] = dataset["train"].filter(lambda x: len(x["messages"][1:]) % 2 == 0) - dataset["test"] = dataset["test"].filter(lambda x: len(x["messages"][1:]) % 2 == 0) - - dataset["train"].to_json("train_dataset.json", orient="records", force_ascii=False) - dataset["test"].to_json("test_dataset.json", orient="records", force_ascii=False) - - # save training and test data to s3 - s3 = boto3.client("s3") - s3.upload_file("train_dataset.json", s3_bucket, f"dataset/{dataset_name}/{train_sample}/train/train_dataset.json") - s3.upload_file("test_dataset.json", s3_bucket, f"dataset/{dataset_name}/{eval_sample}/eval/eval_dataset.json") - - - training_input_path = f's3://{s3_bucket}/dataset/{dataset_name}/{train_sample}/train/train_dataset.json' - eval_input_path = f's3://{s3_bucket}/dataset/{dataset_name}/{eval_sample}/eval/eval_dataset.json' - - with mlflow.start_run(run_name=run_name) as run: - - run_id = run.info.run_id - print(run_id) - - # create pandas dataframe from train json - df_train = pd.read_json("train_dataset.json", orient="records", lines=True) - df_evaluate = pd.read_json("test_dataset.json", orient="records", lines=True) - - training_data = mlflow.data.from_pandas(df_train, source=training_input_path) - mlflow.log_input(training_data, context="training") - - evaluation_data = mlflow.data.from_pandas(df_evaluate, source=eval_input_path) - mlflow.log_input(evaluation_data, context="evaluation") - - return {"training_input_path": training_input_path, "eval_input_path": eval_input_path, "run_id": run_id} diff --git a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/preprocess_step.py 
b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/preprocess_step.py new file mode 100644 index 0000000..3a9b97c --- /dev/null +++ b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/preprocess_step.py @@ -0,0 +1,120 @@ +# **Preprocessing Step** + +# This step handles data preparation. We are going to prepare data for training and evaluation. We will log this data in MLflow +import boto3 +import shutil +import sagemaker +import os +import pandas as pd +from sagemaker.config import load_sagemaker_config +import mlflow +import traceback +from datasets import load_dataset +from sklearn.model_selection import train_test_split +from datasets import Dataset, DatasetDict +from random import randint +from sagemaker.workflow.function_step import step +from .pipeline_utils import ( + PIPELINE_INSTANCE_TYPE, + # template_dataset, + SYSTEM_PROMPT, + convert_to_messages +) + + +@step( + name="DataPreprocessing", + instance_type=PIPELINE_INSTANCE_TYPE, + display_name="Data Preprocessing", + keep_alive_period_in_seconds=900 +) +def preprocess( + tracking_server_arn: str, + input_path: str, + experiment_name: str, + run_name: str, +) -> tuple: + mlflow.set_tracking_uri(tracking_server_arn) + mlflow.set_experiment(experiment_name) + + # Preprocessing code + try: + with mlflow.start_run(run_name=run_name) as run: + run_id = run.info.run_id + with mlflow.start_run(run_name="Processing", nested=True): + # Initialize SageMaker and S3 clients + sagemaker_session = sagemaker.Session() + s3_client = boto3.client('s3') + + bucket_name = sagemaker_session.default_bucket() + default_prefix = sagemaker_session.default_bucket_prefix + configs = load_sagemaker_config() + + # Set paths + if default_prefix: + input_path = f'{default_prefix}/datasets/llm-fine-tuning-modeltrainer-sft' + else: + input_path = f'datasets/llm-fine-tuning-modeltrainer-sft' + + # Load dataset with proper error handling + num_samples = 100 + try: + full_dataset = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT", "en", split=f"train[:{num_samples}]") + except Exception as e: + error_msg = f"Error loading dataset: {str(e)}\n{traceback.format_exc()}" + print(error_msg) + raise RuntimeError(f"Failed to load dataset: {str(e)}") + + # Split dataset + train_test_split_datasets = full_dataset.train_test_split(test_size=0.1, seed=42, shuffle=True) + train_dataset = train_test_split_datasets["train"] + test_dataset = train_test_split_datasets["test"] + print(f"Number of train elements: {len(train_dataset)}") + print(f"Number of test elements: {len(test_dataset)}") + + train_dataset = train_dataset.map(convert_to_messages, remove_columns=list(full_dataset.features), fn_kwargs={"system_prompt": SYSTEM_PROMPT}) + test_dataset = test_dataset.map(convert_to_messages, remove_columns=list(full_dataset.features), fn_kwargs={"system_prompt": SYSTEM_PROMPT}) + #grab a sample from the training and test sets + print(f"Train Sample:\n{train_dataset[randint(0, len(train_dataset)-1)]}\n\n") + print(f"Test Sample:\n{test_dataset[randint(0, len(test_dataset)-1)]}\n\n") + + # Log dataset statistics if MLflow is enabled + mlflow.log_param("dataset_source", "FreedomIntelligence/medical-o1-reasoning-SFT") + mlflow.log_param("train_size", len(train_dataset)) + mlflow.log_param("test_size", len(test_dataset)) + mlflow.log_param("dataset_sample_size", num_samples) # Log that we're using a subset of 100 samples + # save train_dataset to s3 using our SageMaker session + if default_prefix: + input_path = 
f'{default_prefix}/datasets/llm-fine-tuning-modeltrainer-sft' + else: + input_path = f'datasets/llm-fine-tuning-modeltrainer-sft' + + # Save datasets to s3 + # We will fine tune only with 20 records due to limited compute resource for the workshop + train_dataset.to_json("./data/train/dataset.json", orient="records") + test_dataset.to_json("./data/test/dataset.json", orient="records") + + s3_client.upload_file("./data/train/dataset.json", bucket_name, f"{input_path}/train/dataset.json") + train_dataset_s3_path = f"s3://{bucket_name}/{input_path}/train/dataset.json" + s3_client.upload_file("./data/test/dataset.json", bucket_name, f"{input_path}/test/dataset.json") + test_dataset_s3_path = f"s3://{bucket_name}/{input_path}/test/dataset.json" + + shutil.rmtree("./data") + + print(f"Training data uploaded to:") + print(train_dataset_s3_path) + print(test_dataset_s3_path) + + mlflow.log_param("train_data_path", train_dataset_s3_path) + mlflow.log_param("test_dataset_path", test_dataset_s3_path) + + print(f"Datasets uploaded to:") + print(train_dataset_s3_path) + print(test_dataset_s3_path) + + except Exception as e: + error_msg = f"Critical error in preprocessing: {str(e)}\n{traceback.format_exc()}" + print(error_msg) + raise RuntimeError(f"Preprocessing failed: {str(e)}") + + return run_id, train_dataset_s3_path, test_dataset_s3_path \ No newline at end of file diff --git a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/qualitative_eval_step.py b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/qualitative_eval_step.py new file mode 100644 index 0000000..341ae31 --- /dev/null +++ b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/qualitative_eval_step.py @@ -0,0 +1,515 @@ +# ### 8. Qualitative Evaluation Step + +# After fine-tuning, this step assesses the model's qualitative performance. + +from sagemaker.workflow.function_step import step +from .pipeline_utils import PIPELINE_INSTANCE_TYPE +from .pipeline_utils import SYSTEM_PROMPT +from .pipeline_utils import convert_to_messages + + +@step( + name="QualitativeModelEvaluation", + instance_type=PIPELINE_INSTANCE_TYPE, + display_name="Qualitative Model Evaluation", + keep_alive_period_in_seconds=900, + dependencies="./eval/requirements.txt" +) +def qualitative_evaluate( + tracking_server_arn: str, + experiment_name: str, + run_id: str, + endpoint_name: str, + mlflow_trace_attributes: dict +) -> dict: + import os + import json + import time + import boto3 + import pandas as pd + import numpy as np + import matplotlib.pyplot as plt + from tqdm.notebook import tqdm + from datasets import load_dataset + import mlflow + import uuid + import traceback + from datetime import datetime + from mlflow.entities import SpanType + from mlflow.metrics.genai import EvaluationExample, make_genai_metric + + @mlflow.trace( + name="call-local-llm", span_type=SpanType.LLM, attributes={ + "model": mlflow_trace_attributes["model_id"], + "guardrail_id": mlflow_trace_attributes["guardrail_id"], + "guardrail_version": mlflow_trace_attributes["guardrail_version"] + } + ) + def invoke_sagemaker_endpoint(payload, endpoint_name): + """ + Invoke a SageMaker endpoint with the given payload. 
+ """ + bedrock_runtime = boto3.client('bedrock-runtime') + guardrail_id = mlflow_trace_attributes["guardrail_id"] + guardrail_version = mlflow_trace_attributes["guardrail_version"] + guardrail_response_input = bedrock_runtime.apply_guardrail( + guardrailIdentifier=guardrail_id, + guardrailVersion=guardrail_version, + source='INPUT', + content=[{'text': {'text': payload["messages"][0]["content"]}}] + ) + guardrailResult = guardrail_response_input["action"] + + if guardrailResult == "GUARDRAIL_INTERVENED": + reason = guardrail_response_input["assessments"] + return guardrail_response_input["outputs"][0]["text"], -1 + + try: + start_time = time.time() + # response = sm_client.invoke_endpoint( + # EndpointName=endpoint_name, + # ContentType='application/json', + # Body=json.dumps(payload) + # ) + # inference_time = time.time() - start_time + + # response_body = response['Body'].read().decode('utf-8') + # return json.loads(response_body), inference_time + + from sagemaker.predictor import Predictor + from sagemaker.serializers import JSONSerializer + from sagemaker.deserializers import JSONDeserializer + + predictor = Predictor( + endpoint_name=f"{endpoint_name}", + serializer=JSONSerializer(), + deserializer=JSONDeserializer() + ) + + response = predictor.predict(payload)['choices'][0]['message']['content'] + inference_time = time.time() - start_time + return response, inference_time + except Exception as e: + print(f"Error invoking endpoint {endpoint_name}: {str(e)}") + return None, -1 + + def create_bedrock_judge_metrics(): + """ + Create custom LLM-as-a-judge metrics using AWS Bedrock Claude as the judge. + + Returns: + list: List of custom metrics for medical evaluation + """ + + # Medical Accuracy Metric using Bedrock Claude + medical_accuracy_examples = [ + EvaluationExample( + input="What is the first-line treatment for hypertension?", + output="ACE inhibitors or thiazide diuretics are typically first-line treatments for hypertension.", + score=4, + justification="The response correctly identifies evidence-based first-line treatments for hypertension." + ), + EvaluationExample( + input="What causes Type 1 diabetes?", + output="Type 1 diabetes is caused by autoimmune destruction of pancreatic beta cells.", + score=5, + justification="Accurate and concise explanation of Type 1 diabetes pathophysiology." + ), + EvaluationExample( + input="How do you treat a heart attack?", + output="You should take aspirin and call emergency services immediately.", + score=2, + justification="While partially correct, this oversimplifies emergency treatment and misses critical interventions." + ) + ] + + medical_accuracy = make_genai_metric( + name="medical_accuracy", + definition=( + "Medical accuracy measures how factually correct and evidence-based the medical information is. " + "Consider current medical guidelines, evidence-based practice, and clinical accuracy. " + "Score 1-5 where 5 is completely accurate and evidence-based." + ), + grading_prompt=( + "Evaluate the medical accuracy of the response on a scale of 1-5:\n" + "5: Completely accurate, evidence-based, follows current medical guidelines\n" + "4: Mostly accurate with minor gaps or generalizations\n" + "3: Generally accurate but missing important details or context\n" + "2: Partially accurate but contains some medical inaccuracies\n" + "1: Contains significant medical errors or misinformation\n\n" + "Question: {input}\n" + "Response: {output}\n\n" + "Consider: Is the medical information factually correct? 
Does it align with current evidence-based practice? " + "Are there any dangerous inaccuracies or omissions?\n\n" + "Provide your score as a single integer from 1-5." + ), + examples=medical_accuracy_examples, + version="v1", + # model="bedrock:/us.anthropic.claude-3-haiku-20240307-v1:0", + model="bedrock:/anthropic.claude-3-haiku-20240307-v1:0", + parameters={ + "anthropic_version": "bedrock-2023-05-31", + "temperature": 0.0, + "max_tokens": 1000 + }, + aggregations=["mean", "variance", "p90"], + greater_is_better=True + ) + + # Clinical Reasoning Metric + clinical_reasoning_examples = [ + EvaluationExample( + input="A 65-year-old man presents with chest pain. What should be considered?", + output="Given the patient's age and presentation, we should immediately consider cardiac causes like myocardial infarction, unstable angina, and aortic dissection. The approach should include ECG, cardiac enzymes, chest X-ray, and careful history taking about pain characteristics, onset, and associated symptoms.", + score=5, + justification="Excellent clinical reasoning with systematic approach, appropriate differential diagnosis, and logical diagnostic workup." + ), + EvaluationExample( + input="Patient has fever and cough. What's the diagnosis?", + output="The patient has pneumonia and needs antibiotics.", + score=2, + justification="Poor reasoning - jumps to conclusion without proper assessment, history, or consideration of differential diagnosis." + ) + ] + + clinical_reasoning = make_genai_metric( + name="clinical_reasoning", + definition=( + "Clinical reasoning evaluates the logical thought process, differential diagnosis consideration, " + "and systematic approach to medical problems. Assesses whether the response demonstrates " + "appropriate medical decision-making skills." + ), + grading_prompt=( + "Evaluate the clinical reasoning quality on a scale of 1-5:\n" + "5: Excellent reasoning with systematic approach, appropriate differential diagnosis, logical steps\n" + "4: Good reasoning with mostly sound logic and appropriate considerations\n" + "3: Adequate reasoning but may miss some important considerations\n" + "2: Poor reasoning with significant logical gaps or inappropriate approach\n" + "1: No clear reasoning or completely inappropriate approach\n\n" + "Question: {input}\n" + "Response: {output}\n\n" + "Consider: Does the response show logical thinking? Are differential diagnoses considered? " + "Is the approach systematic and appropriate for the clinical scenario?\n\n" + "Provide your score as a single integer from 1-5." + ), + examples=clinical_reasoning_examples, + version="v1", + # model="bedrock:/us.anthropic.claude-3-haiku-20240307-v1:0", + model="bedrock:/anthropic.claude-3-haiku-20240307-v1:0", + parameters={ + "anthropic_version": "bedrock-2023-05-31", + "temperature": 0.0, + "max_tokens": 1000 + }, + aggregations=["mean", "variance", "p90"], + greater_is_better=True + ) + + # Patient Safety Metric + patient_safety_examples = [ + EvaluationExample( + input="Patient allergic to penicillin needs antibiotic. What do you recommend?", + output="Since the patient is allergic to penicillin, I would recommend alternative antibiotics such as macrolides (azithromycin), cephalosporins (if no cross-reactivity), or fluoroquinolones, depending on the specific infection and patient factors.", + score=5, + justification="Excellent safety awareness - acknowledges allergy and provides appropriate alternatives." 
+ ), + EvaluationExample( + input="Patient with chest pain asks if they can drive home.", + output="Sure, if you're feeling better you can drive home.", + score=1, + justification="Dangerous advice - patient with chest pain should not drive and needs immediate medical evaluation." + ) + ] + + patient_safety = make_genai_metric( + name="patient_safety", + definition=( + "Patient safety measures whether the response prioritizes patient wellbeing, avoids harmful advice, " + "considers contraindications, and promotes safe medical practices." + ), + grading_prompt=( + "Evaluate patient safety considerations on a scale of 1-5:\n" + "5: Prioritizes safety, considers contraindications, promotes safe practices\n" + "4: Generally safe with minor safety considerations\n" + "3: Mostly safe but may miss some safety considerations\n" + "2: Some safety concerns or inappropriate advice\n" + "1: Potentially dangerous advice or significant safety issues\n\n" + "Question: {input}\n" + "Response: {output}\n\n" + "Consider: Is the advice safe? Are contraindications considered? Could following this advice harm the patient?\n\n" + "Provide your score as a single integer from 1-5." + ), + examples=patient_safety_examples, + version="v1", + # model="bedrock:/us.anthropic.claude-3-haiku-20240307-v1:0", + model="bedrock:/anthropic.claude-3-haiku-20240307-v1:0", + parameters={ + "anthropic_version": "bedrock-2023-05-31", + "temperature": 0.0, + "max_tokens": 1000 + }, + aggregations=["mean", "variance", "p90"], + greater_is_better=True + ) + + return [medical_accuracy, clinical_reasoning, patient_safety] + + def simple_judge_evaluation(predictions, questions, references): + """ + Simple rule-based evaluation as fallback if LLM-as-a-judge fails. + """ + scores = [] + + for pred, question, ref in zip(predictions, questions, references): + score = 3.0 # Default neutral score + + # Simple heuristics for medical evaluation + if len(pred.split()) < 10: + score -= 1.0 # Too short responses + elif len(pred.split()) > 500: + score -= 0.5 # Overly verbose + + # Check for medical keywords + medical_keywords = ['diagnosis', 'treatment', 'symptom', 'patient', 'clinical', 'medical'] + if any(keyword in pred.lower() for keyword in medical_keywords): + score += 0.5 + + # Check for safety considerations + safety_keywords = ['contraindication', 'allergy', 'caution', 'risk', 'side effect'] + if any(keyword in pred.lower() for keyword in safety_keywords): + score += 0.5 + + # Ensure score is in valid range + score = max(1.0, min(5.0, score)) + scores.append(score) + + return { + 'medical_accuracy': np.mean(scores), + 'clinical_reasoning': np.mean(scores), + 'patient_safety': np.mean(scores), + 'overall_quality': np.mean(scores) + } + + def evaluate_model_qualitatively(model_config, dataset): + """ + Evaluate a fine-tuned model using LLM-as-a-judge metrics with fallback. 
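+
+        Args:
+            model_config (dict): Model configuration with name and endpoint
+            dataset: Dataset containing the evaluation samples
+
+        Returns:
+            dict: Qualitative evaluation results, including per-metric scores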
+ """ + # time.sleep(60) + model_name = model_config["name"] + endpoint_name = model_config["endpoint"] + + print(f"\nPerforming qualitative evaluation for model: {model_name} on endpoint: {endpoint_name}") + + # Generate predictions for the dataset + predictions = [] + questions = [] + references = [] + inference_times = [] + failed_generations = 0 + + for example in tqdm(dataset, desc="Generating responses for evaluation"): + question = example["Question"] + reference = "\n".join([example["Complex_CoT"], example["Response"]]) + + payload = {} + messages_prompt = convert_to_messages(example, SYSTEM_PROMPT) + payload["messages"] = messages_prompt["messages"] + payload["parameters"] = { + "max_new_tokens": 512, + "top_p": 0.9, + "temperature": 0.6, + "return_full_text": False + } + + # Call the model endpoint + try: + response, inference_time = invoke_sagemaker_endpoint(payload, endpoint_name) + + if response is None: + prediction = "Error generating response." + failed_generations += 1 + # elif isinstance(response, list): + # prediction = response[0].get('generated_text', '').strip() + # elif isinstance(response, dict): + # prediction = response.get('generated_text', '').strip() + else: + prediction = str(response).strip() + + # prediction = prediction.split("<|eot_id|>")[0] if "<|eot_id|>" in prediction else prediction + inference_times.append(inference_time) + + except Exception as e: + print(f"Error invoking SageMaker endpoint {endpoint_name}: {e}") + prediction = "Error generating response." + failed_generations += 1 + inference_times.append(-1) + + predictions.append(prediction) + questions.append(question) + references.append(reference) + + # Log basic generation metrics + mlflow.log_metric("qualitative_failed_generations", failed_generations) + mlflow.log_metric("qualitative_failure_rate", failed_generations / len(dataset) if len(dataset) > 0 else 0) + + # Try LLM-as-a-judge evaluation, fallback to simple evaluation + try: + print("Attempting LLM-as-a-judge evaluation using AWS Bedrock...") + + # Prepare data for MLflow evaluation + eval_data = pd.DataFrame({ + "inputs": questions, + "outputs": predictions, + "targets": references + }) + + # Create custom metrics + custom_metrics = create_bedrock_judge_metrics() + + # Run MLflow evaluation + eval_results = mlflow.evaluate( + data=eval_data, + targets="targets", + predictions="outputs", + extra_metrics=custom_metrics, + ) + print(f"Raw evaluation results: {eval_results.metrics}") + + # Extract metric results + metric_results = {} + for metric_name in ["medical_accuracy/v1/mean", "clinical_reasoning/v1/mean", "patient_safety/v1/mean"]: + if metric_name in eval_results.metrics: + base_name = metric_name.split('/')[0] + metric_results[base_name] = eval_results.metrics[metric_name] + if not np.isnan(metric_results[base_name]): + mlflow.log_metric(f"qualitative_{base_name}", metric_results[base_name]) + else: + mlflow.log_metric(f"qualitative_{base_name}", 0.0) + + print("LLM-as-a-judge evaluation completed successfully!") + + except Exception as e: + print(f"LLM-as-a-judge evaluation failed: {str(e)}") + print("Falling back to simple rule-based evaluation...") + + # Fallback to simple evaluation + metric_results = simple_judge_evaluation(predictions, questions, references) + + for metric_name, score in metric_results.items(): + if not np.isnan(score): + mlflow.log_metric(f"qualitative_{metric_name}", score) + else: + mlflow.log_metric(f"qualitative_{metric_name}", 0.0) + + # Create evaluation summary + evaluation_details = [] + for i, 
(pred, question, ref) in enumerate(zip(predictions[:5], questions[:5], references[:5])): + evaluation_details.append({ + "question": question, + "prediction": pred[:500] + ("..." if len(pred) > 500 else ""), + "reference": ref[:500] + ("..." if len(ref) > 500 else ""), + }) + + # Save detailed results + detailed_df = pd.DataFrame(evaluation_details) + temp_csv = f"/tmp/qualitative_eval_detailed_{uuid.uuid4().hex[:8]}.csv" + detailed_df.to_csv(temp_csv, index=False) + mlflow.log_artifact(temp_csv, "qualitative_evaluation") + + # Create simple visualization + plt.figure(figsize=(10, 6)) + metric_names = list(metric_results.keys()) + metric_values = list(metric_results.values()) + plt.bar(metric_names, metric_values, color=['blue', 'green', 'red', 'orange']) + plt.title('Qualitative Evaluation Scores') + plt.ylabel('Score (1-5)') + plt.ylim(1, 5) + plt.xticks(rotation=45) + plt.tight_layout() + plt.savefig('/tmp/qualitative_metrics.png', dpi=300, bbox_inches='tight') + mlflow.log_artifact('/tmp/qualitative_metrics.png', "qualitative_evaluation") + + avg_medical_accuracy = metric_results.get("medical_accuracy", metric_results.get("overall_quality", 3.0)) + + return { + "model_name": model_name, + "endpoint_name": endpoint_name, + "num_samples": len(dataset), + "metrics": metric_results, + "evaluation_details": evaluation_details, + "avg_medical_accuracy": avg_medical_accuracy + } + + # Main evaluation logic + mlflow.set_tracking_uri(tracking_server_arn) + mlflow.set_experiment(experiment_name) + + import boto3 + import os + + # Get AWS credentials from the SageMaker execution environment + session = boto3.Session() + credentials = session.get_credentials() + + # Set as environment variables + os.environ['AWS_ACCESS_KEY_ID'] = credentials.access_key + os.environ['AWS_SECRET_ACCESS_KEY'] = credentials.secret_key + if credentials.token: + os.environ['AWS_SESSION_TOKEN'] = credentials.token + + # Set region - important for Bedrock + region = boto3.session.Session().region_name + os.environ['AWS_REGION'] = region + + with mlflow.start_run(run_id=run_id): + with mlflow.start_run(run_name="QualitativeModelEvaluation", nested=True): + mlflow.set_tag("component", "qualitative_model_evaluation") + + # Initialize the SageMaker client + sm_client = boto3.client('sagemaker-runtime') + + # Define the model to evaluate + model_to_evaluate = { + "name": "Fine-tuned Qwen3-4B-Instruct-2507", + "endpoint": endpoint_name + } + + # Limit samples for faster execution + num_samples = 10 + + # Log evaluation parameters + mlflow.log_param("qualitative_evaluation_endpoint", endpoint_name) + mlflow.log_param("qualitative_evaluation_num_samples", num_samples) + mlflow.log_param("qualitative_evaluation_timestamp", datetime.now().isoformat()) + mlflow.log_param("llm_judge_model", "bedrock:/anthropic.claude-3-haiku-20240307-v1:0") + # mlflow.log_param("llm_judge_model", "bedrock:/us.anthropic.claude-3-haiku-20240307-v1:0") + + # Load the test dataset + try: + dataset = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT", "en", split="train") + max_samples = len(dataset) + dataset = dataset.shuffle().select(range(min(num_samples, max_samples))) + print(f"Loaded medical-o1-reasoning dataset with {len(dataset)} samples for qualitative evaluation") + + mlflow.log_param("qualitative_dataset_name", "FreedomIntelligence/medical-o1-reasoning-SFT") + mlflow.log_param("qualitative_dataset_actual_samples", len(dataset)) + except Exception as e: + error_msg = f"Error loading dataset for qualitative evaluation: {str(e)}" + 
print(error_msg) + raise + + try: + # Perform qualitative evaluation + qualitative_results = evaluate_model_qualitatively(model_to_evaluate, dataset) + + avg_medical_accuracy = qualitative_results["avg_medical_accuracy"] + + print(f"\nQualitative evaluation completed!") + print(f"Average Medical Accuracy: {avg_medical_accuracy:.3f}") + + return {"avg_medical_accuracy": avg_medical_accuracy} + + except Exception as e: + error_msg = f"Error in qualitative model evaluation: {str(e)}\n{traceback.format_exc()}" + print(error_msg) + return {"error": str(e), "avg_medical_accuracy": 0.0} \ No newline at end of file diff --git a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/quantitative_eval_step.py b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/quantitative_eval_step.py new file mode 100644 index 0000000..2f905b4 --- /dev/null +++ b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/quantitative_eval_step.py @@ -0,0 +1,434 @@ +# ### 7. Quantitative Evaluation Step + +# After fine-tuning, this step assesses the model's quantitative performance. + +from sagemaker.workflow.function_step import step +from .pipeline_utils import PIPELINE_INSTANCE_TYPE +from .pipeline_utils import SYSTEM_PROMPT +from .pipeline_utils import convert_to_messages + + +@step( + name="QuantitativeModelEvaluation", + instance_type=PIPELINE_INSTANCE_TYPE, + display_name="Quantitative Model Evaluation", + keep_alive_period_in_seconds=900, + dependencies="./eval/requirements.txt" +) +def quantitative_evaluate( + tracking_server_arn: str, + experiment_name: str, + run_id: str, + endpoint_name: str, + mlflow_trace_attributes: dict +)-> dict: + import os + import json + import time + import boto3 + import pandas as pd + import numpy as np + import matplotlib.pyplot as plt + from tqdm.notebook import tqdm + from datasets import load_dataset + import mlflow + import uuid + import traceback + from datetime import datetime + from rouge_score import rouge_scorer + from mlflow.entities import SpanType + + @mlflow.trace( + name="call-local-llm", span_type=SpanType.LLM, attributes={ + "model": mlflow_trace_attributes["model_id"], + "guardrail_id": mlflow_trace_attributes["guardrail_id"], + "guardrail_version": mlflow_trace_attributes["guardrail_version"] + } + ) + def invoke_sagemaker_endpoint(payload, endpoint_name): + """ + Invoke a SageMaker endpoint with the given payload. 
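+        Applies the configured Bedrock guardrail to the first message before calling the endpoint.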
+
+        Args:
+            payload (dict): The input data to send to the endpoint
+            endpoint_name (str): The name of the SageMaker endpoint
+
+        Returns:
+            tuple: The model response (or the guardrail message) and the inference time in
+            seconds; the time is -1 if the guardrail intervened or the call failed
+        """
+        bedrock_runtime = boto3.client('bedrock-runtime')
+        guardrail_id = mlflow_trace_attributes["guardrail_id"]
+        guardrail_version = mlflow_trace_attributes["guardrail_version"]
+        guardrail_response_input = bedrock_runtime.apply_guardrail(
+            guardrailIdentifier=guardrail_id,
+            guardrailVersion=guardrail_version,
+            source='INPUT',
+            content=[{'text': {'text': payload["messages"][0]["content"]}}]
+        )
+        guardrailResult = guardrail_response_input["action"]
+
+        if guardrailResult == "GUARDRAIL_INTERVENED":
+            reason = guardrail_response_input["assessments"]
+            return guardrail_response_input["outputs"][0]["text"], -1
+
+        try:
+            start_time = time.time()
+            # response = sm_client.invoke_endpoint(
+            #     EndpointName=endpoint_name,
+            #     ContentType='application/json',
+            #     Body=json.dumps(payload)
+            # )
+            # inference_time = time.time() - start_time
+
+            # response_body = response['Body'].read().decode('utf-8')
+            # return json.loads(response_body), inference_time
+
+            from sagemaker.predictor import Predictor
+            from sagemaker.serializers import JSONSerializer
+            from sagemaker.deserializers import JSONDeserializer
+
+            predictor = Predictor(
+                endpoint_name=f"{endpoint_name}",
+                serializer=JSONSerializer(),
+                deserializer=JSONDeserializer()
+            )
+
+            response = predictor.predict(payload)['choices'][0]['message']['content']
+            inference_time = time.time() - start_time
+            return response, inference_time
+        except Exception as e:
+            print(f"Error invoking endpoint {endpoint_name}: {str(e)}")
+            return None, -1
+
+    def calculate_metrics(predictions, references):
+        """
+        Calculate all evaluation metrics for the generated responses using rouge_score.
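+        Computes ROUGE-1, ROUGE-2, and ROUGE-L precision, recall, and F1, plus prediction and reference length statistics.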
+ + Args: + predictions (list): List of generated summaries + references (list): List of reference summaries + + Returns: + dict: Dictionary containing all metric scores + """ + metrics = {} + + # Initialize the Rouge scorer + scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True) + + # Calculate ROUGE scores for each prediction-reference pair + rouge_scores = { + 'rouge1_f': [], + 'rouge2_f': [], + 'rougeL_f': [], + 'rouge1_precision': [], + 'rouge1_recall': [], + 'rouge2_precision': [], + 'rouge2_recall': [], + 'rougeL_precision': [], + 'rougeL_recall': [] + } + + for pred, ref in zip(predictions, references): + scores = scorer.score(ref, pred) + + # Extract all metrics + rouge_scores['rouge1_f'].append(scores['rouge1'].fmeasure) + rouge_scores['rouge2_f'].append(scores['rouge2'].fmeasure) + rouge_scores['rougeL_f'].append(scores['rougeL'].fmeasure) + + rouge_scores['rouge1_precision'].append(scores['rouge1'].precision) + rouge_scores['rouge1_recall'].append(scores['rouge1'].recall) + rouge_scores['rouge2_precision'].append(scores['rouge2'].precision) + rouge_scores['rouge2_recall'].append(scores['rouge2'].recall) + rouge_scores['rougeL_precision'].append(scores['rougeL'].precision) + rouge_scores['rougeL_recall'].append(scores['rougeL'].recall) + + # Average ROUGE scores + for key in rouge_scores: + metrics[key] = sum(rouge_scores[key]) / len(rouge_scores[key]) + + # Calculate prediction statistics + metrics['avg_prediction_length'] = np.mean([len(pred.split()) for pred in predictions]) + metrics['min_prediction_length'] = min([len(pred.split()) for pred in predictions]) + metrics['max_prediction_length'] = max([len(pred.split()) for pred in predictions]) + + # Calculate reference statistics + metrics['avg_reference_length'] = np.mean([len(ref.split()) for ref in references]) + metrics['min_reference_length'] = min([len(ref.split()) for ref in references]) + metrics['max_reference_length'] = max([len(ref.split()) for ref in references]) + + # Calculate length ratio + metrics['avg_length_ratio'] = np.mean([len(pred.split()) / len(ref.split()) if len(ref.split()) > 0 else 0 + for pred, ref in zip(predictions, references)]) + + print(f"Metrics: {metrics}") + + return metrics + + def generate_summaries_with_model(endpoint_name, dataset): + """ + Generate summaries using a model deployed on SageMaker. + + Args: + endpoint_name (str): SageMaker endpoint name + dataset: Dataset containing dialogues + + Returns: + list: Generated summaries + list: Inference times for each summary + """ + predictions = [] + inference_times = [] + failed_generations = 0 + + for example in tqdm(dataset, desc="Generating Responses"): + payload = {} + messages_prompt = convert_to_messages(example, SYSTEM_PROMPT) + payload["messages"] = messages_prompt["messages"] + payload["parameters"] = { + "max_new_tokens": 512, + "top_p": 0.9, + "temperature": 0.6, + "return_full_text": False + } + + # Call the model endpoint + try: + response, inference_time = invoke_sagemaker_endpoint(payload, endpoint_name) + + # Extract the generated text + if response is None: + prediction = "Error generating response." 
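+                    # Count this sample as a failed generation; the failure rate is logged after the loop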
+                    failed_generations += 1
+                elif isinstance(response, list):
+                    prediction = response[0].get('generated_text', '').strip()
+                elif isinstance(response, dict):
+                    prediction = response.get('generated_text', '').strip()
+                else:
+                    prediction = str(response).strip()
+
+                prediction = prediction.split("<|eot_id|>")[0] if "<|eot_id|>" in prediction else prediction
+
+                # Log individual inference metrics
+                mlflow.log_metric(f"inference_time_sample_{len(predictions)}", inference_time)
+
+                inference_times.append(inference_time)
+
+            except Exception as e:
+                print(f"Error invoking SageMaker endpoint {endpoint_name}: {e}")
+                prediction = "Error generating response."
+                failed_generations += 1
+                inference_times.append(-1)
+
+            predictions.append(prediction)
+
+        # Log failure rate
+        mlflow.log_metric("failed_generations", failed_generations)
+        mlflow.log_metric("failure_rate", failed_generations / len(dataset) if len(dataset) > 0 else 0)
+
+        return predictions, inference_times
+
+    def evaluate_model_on_dataset(model_config, dataset):
+        """
+        Evaluate a fine-tuned model on a dataset using automated ROUGE metrics.
+
+        Args:
+            model_config (dict): Model configuration with name and endpoint
+            dataset: Dataset for evaluation
+
+        Returns:
+            dict: Evaluation results
+        """
+        model_name = model_config["name"]
+        endpoint_name = model_config["endpoint"]
+
+        print(f"\nEvaluating model: {model_name} on endpoint: {endpoint_name}")
+
+        # Get references
+        references = ["\n".join([example["Complex_CoT"], example["Response"]]) for example in dataset]
+
+        # Generate responses
+        print("\nGenerating Responses...")
+        predictions, inference_times = generate_summaries_with_model(endpoint_name, dataset)
+
+        # Log inference time metrics
+        valid_times = [t for t in inference_times if t > 0]
+        if valid_times:
+            mlflow.log_metric("avg_inference_time", np.mean(valid_times))
+            mlflow.log_metric("min_inference_time", min(valid_times))
+            mlflow.log_metric("max_inference_time", max(valid_times))
+            mlflow.log_metric("p95_inference_time", np.percentile(valid_times, 95))
+
+        # Calculate automated metrics with rouge_score
+        print("\nCalculating evaluation metrics with rouge_score...")
+        metrics = calculate_metrics(predictions, references)
+
+        # Log all calculated metrics to MLflow
+        for metric_name, metric_value in metrics.items():
+            mlflow.log_metric(metric_name, metric_value)
+
+        # Create a comparison table of predictions vs references
+        comparison_data = []
+        scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
+
+        for i, (pred, ref) in enumerate(zip(predictions[:5], references[:5])):
+            # Calculate Rouge-1 score for this example
+            rouge1_score = scorer.score(ref, pred)['rouge1'].fmeasure
+
+            comparison_data.append({
+                "example_id": i,
+                "prediction": pred[:500] + ("..." if len(pred) > 500 else ""), # Truncate for readability
+                "reference": ref[:500] + ("..."
if len(ref) > 500 else ""), # Truncate for readability + "rouge1_f": rouge1_score + }) + + comparison_df = pd.DataFrame(comparison_data) + # Save comparison to a temporary CSV and log it as an artifact + temp_csv = f"/tmp/predictions_comparison_{uuid.uuid4().hex[:8]}.csv" + comparison_df.to_csv(temp_csv, index=False) + mlflow.log_artifact(temp_csv, "model_predictions") + + # Format results + results = { + "model_name": model_name, + "endpoint_name": endpoint_name, + "num_samples": len(dataset), + "metrics": metrics, + "predictions": predictions[:5], # First 5 predictions + "references": references[:5], # First 5 references + "inference_times": inference_times # Include the inference times + } + + # Print key results + print(f"\nResults for {model_name}:") + print(f"ROUGE-1 F1: {metrics['rouge1_f']:.4f}") + print(f"ROUGE-2 F1: {metrics['rouge2_f']:.4f}") + print(f"ROUGE-L F1: {metrics['rougeL_f']:.4f}") + print(f"Average Inference Time: {np.mean([t for t in inference_times if t > 0]):.3f} seconds") + + return results, metrics['rouge1_f'], metrics['rouge2_f'], metrics['rougeL_f'] + + mlflow.set_tracking_uri(tracking_server_arn) + mlflow.set_experiment(experiment_name) + + import boto3 + import os + + # Get AWS credentials from the SageMaker execution environment + session = boto3.Session() + credentials = session.get_credentials() + + # Set as environment variables + os.environ['AWS_ACCESS_KEY_ID'] = credentials.access_key + os.environ['AWS_SECRET_ACCESS_KEY'] = credentials.secret_key + if credentials.token: + os.environ['AWS_SESSION_TOKEN'] = credentials.token + + # Set region - important for Bedrock + region = boto3.session.Session().region_name + os.environ['AWS_REGION'] = region + + with mlflow.start_run(run_id=run_id): + with mlflow.start_run(run_name="QuantitativeModelEvaluation", nested=True): + mlflow.autolog() + + # Initialize the SageMaker client + sm_client = boto3.client('sagemaker-runtime') + + FINETUNED_MODEL_ENDPOINT = endpoint_name # Update with Fine-tuned model endpoint name + + # Define the model to evaluate + model_to_evaluate = { + "name": "Fine-tuned Qwen3-4B-Instruct-2507", + "endpoint": FINETUNED_MODEL_ENDPOINT + } + # Limit the number of samples to evaluate (for faster execution) + num_samples = 10 + + # Log evaluation parameters to MLflow + mlflow.log_param("evaluation_endpoint", FINETUNED_MODEL_ENDPOINT) + mlflow.log_param("evaluation_num_samples", num_samples) + mlflow.log_param("evaluation_timestamp", datetime.now().isoformat()) + + # Load the test split of the medical-o1 dataset + try: + dataset = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT", "en", split="train") + + max_samples = len(dataset) + + dataset = dataset.shuffle().select(range(min(num_samples, max_samples))) + print(f"Loaded medical-o1-reasoning dataset with {len(dataset)} samples out of {max_samples}") + + mlflow.log_param("dataset_name", "FreedomIntelligence/medical-o1-reasoning-SFT") + mlflow.log_param("dataset_actual_samples", len(dataset)) + except Exception as e: + error_msg = f"Error loading dataset: {str(e)}" + print(error_msg) + raise + + # Display a sample from the dataset + sample = dataset[0] + + print("\nQuestion:\n", sample["Question"], "\n\n====\n") + print("Complex_CoT:\n", sample["Complex_CoT"], "\n\n====\n") + print("Response:\n", sample["Response"], "\n\n====\n") + + try: + finetuned_model_results, rouge1_f, rouge2_f, rougeL_f = evaluate_model_on_dataset(model_to_evaluate, dataset) + print("DUMP") + json.dumps(finetuned_model_results) + print(f"ROUGE-1 F1: {rouge1_f}") 
+ print(f"ROUGE-2 F1: {rouge2_f}") + print(f"ROUGE-L F1: {rougeL_f}") + + # Create and log visualizations if MLflow is enabled + # Log model card with performance summary + model_card = f""" + # Model Evaluation Report + + ## Model Information + - **Model Name**: {model_to_evaluate["name"]} + - **Endpoint**: {model_to_evaluate["endpoint"]} + - **Evaluation Date**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} + - **Dataset**: FreedomIntelligence/medical-o1-reasoning-SFT + - **Samples Evaluated**: {len(dataset)} + + ## Performance Metrics + - **ROUGE-1 F1**: {rouge1_f:.4f} + - **ROUGE-2 F1**: {rouge2_f:.4f} + - **ROUGE-L F1**: {rougeL_f:.4f} + - **Average Inference Time**: {np.mean([t for t in finetuned_model_results["inference_times"] if t > 0]):.3f} seconds + + ## Detailed Metrics + {json.dumps(finetuned_model_results["metrics"], indent=2)} + """ + + with open("/tmp/model_card.md", "w") as f: + f.write(model_card) + + mlflow.log_artifact("/tmp/model_card.md", "evaluation_summary") + + # Create a simple bar chart for ROUGE metrics + plt.figure(figsize=(10, 6)) + rouge_metrics = { + 'ROUGE-1 F1': rouge1_f, + 'ROUGE-2 F1': rouge2_f, + 'ROUGE-L F1': rougeL_f + } + plt.bar(rouge_metrics.keys(), rouge_metrics.values()) + plt.title('ROUGE Metrics') + plt.ylabel('Score') + plt.ylim(0, 1) + plt.grid(axis='y', linestyle='--', alpha=0.7) + plt.savefig('/tmp/rouge_metrics.png') + mlflow.log_artifact('/tmp/rouge_metrics.png', "evaluation_plots") + + except Exception as e: + error_msg = f"Error in model evaluation: {str(e)}\n{traceback.format_exc()}" + print(error_msg) + + # Return at least something even if evaluation fails + return {"error": str(e), "rougeL_f": 0.0} + + return {"rougeL_f": rougeL_f} \ No newline at end of file diff --git a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/utils.py b/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/utils.py deleted file mode 100644 index c76e0a9..0000000 --- a/workshops/fine-tuning-with-sagemakerai-and-bedrock/task_05_fmops/steps/utils.py +++ /dev/null @@ -1,20 +0,0 @@ -import boto3 -from datetime import datetime - - -def endpoint_exists(endpoint_name): - endpoint_exist = False - - client = boto3.client('sagemaker') - response = client.list_endpoints() - endpoints = response["Endpoints"] - - for endpoint in endpoints: - if endpoint_name == endpoint["EndpointName"]: - endpoint_exist = True - break - - return endpoint_exist - -def create_training_job_name(model_id): - return f"{model_id}-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')[:-3]}"