##### Copyright 2025 Google LLC.

In [None]:
# @title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Gemma-3N Mental Health & Emotional First Aid Assistant for offline inference
This notebook fine-tunes Gemma-3N (4B) model using Unsloth on mental health counseling conversations to create an emotional first aid assistant, locally.

**Author**: Nguyen Khanh Linh  
**GitHub**: [github.com/linhkid](https://github.com/linhkid)  
**LinkedIn**: [@Khanh Linh Nguyen](https://twitter.com/https://www.linkedin.com/in/linhnguyenkhanh/)

### Installation

In [1]:
%%capture
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
!pip install transformers==4.55.4
!pip install --no-deps trl==0.22.2
import torch; torch._dynamo.config.recompile_limit = 64;

In [2]:
%%capture
!pip install --no-deps --upgrade timm # Only for Gemma 3N

### Load Gemma-3N Model

In [3]:
from unsloth import FastModel
import torch

model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gemma-3n-E4B-it",
    dtype = None, # None for auto detection
    max_seq_length = 2048, # Increased for longer conversations
    load_in_4bit = True,  # 4 bit quantization to reduce memory
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.10.1: Fast Gemma3N patching. Transformers: 4.55.4.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/1.15G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/3.72G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/98.0 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

preprocessor_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.70M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

### Load Mental Health Datasets
We'll combine two datasets:
1. Mental health counseling conversations
2. Psychology Q&A dataset

In [4]:
from datasets import load_dataset, concatenate_datasets

print("Loading datasets...")

# Load the first dataset
mental_health_dataset = load_dataset("Amod/mental_health_counseling_conversations", split="train")

# Load the second dataset and rename columns to match the first
psychology_dataset = load_dataset("jkhedri/psychology-dataset", split="train")
psychology_dataset = psychology_dataset.rename_columns({
    "question": "Context",
    "response_j": "Response"
})

# Combine the datasets
combined_dataset = concatenate_datasets([
    mental_health_dataset.select_columns(['Context', 'Response']),
    psychology_dataset.select_columns(['Context', 'Response'])
])

# Split the combined dataset into train and evaluation sets
dataset = combined_dataset.train_test_split(test_size=0.1, seed=42) # Using 10% for evaluation

print("Datasets loaded.")
print(f"Combined dataset has {len(combined_dataset)} samples.")
print(f"Training dataset has {len(dataset['train'])} samples.")
print(f"Evaluation dataset has {len(dataset['test'])} samples.")

print("\nSample of raw data:")
print(f"Context: {dataset['train'][0]['Context'][:200]}...")
print(f"Response: {dataset['train'][0]['Response'][:200]}...")

Loading datasets...


README.md: 0.00B [00:00, ?B/s]

combined_dataset.json: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/3512 [00:00<?, ? examples/s]

data/part-00000-694db9fd-774c-4205-b938-(…):   0%|          | 0.00/1.59M [00:00<?, ?B/s]

data/part-00001-694db9fd-774c-4205-b938-(…):   0%|          | 0.00/96.4k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9846 [00:00<?, ? examples/s]

Datasets loaded.
Combined dataset has 13358 samples.
Training dataset has 12022 samples.
Evaluation dataset has 1336 samples.

Sample of raw data:
Context: I'm having trouble with my eating habits. I feel like I'm always bingeing and then feeling guilty about it....
Response: Eating disorders can be serious, but there is help available. Let's work on developing healthy eating habits, exploring possible underlying causes for your behavior, and possibly seeking out therapy o...


### Setup Chat Template for Gemma-3

In [5]:
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)

### Format Dataset with Chat Template
Convert Context/Response pairs into Gemma-3 conversation format

In [6]:
def formatting_prompts_func(examples):
    convos = []
    for context, response in zip(examples["Context"], examples["Response"]):
        messages = [
            {"role": "user", "content": context},
            {"role": "assistant", "content": response},
        ]
        # add_generation_prompt=False to avoid a user turn at the end
        # Remove <bos> token as processor will add it
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False
        ).removeprefix('<bos>')
        convos.append(text)
    return {"text": convos}

# Apply the formatting function to both train and test splits
dataset["train"] = dataset["train"].map(formatting_prompts_func, batched=True)
dataset["test"] = dataset["test"].map(formatting_prompts_func, batched=True)

print("\nDatasets loaded and formatted.")
print(f"Combined dataset has {len(dataset)} samples.")
# Optional: Print a sample to verify the format
print("\nSample of formatted data:")
print(dataset["train"][0]["text"][:500])

Map:   0%|          | 0/12022 [00:00<?, ? examples/s]

Map:   0%|          | 0/1336 [00:00<?, ? examples/s]


Datasets loaded and formatted.
Combined dataset has 2 samples.

Sample of formatted data:
<start_of_turn>user
I'm having trouble with my eating habits. I feel like I'm always bingeing and then feeling guilty about it.<end_of_turn>
<start_of_turn>model
Eating disorders can be serious, but there is help available. Let's work on developing healthy eating habits, exploring possible underlying causes for your behavior, and possibly seeking out therapy or support groups.<end_of_turn>



### Add LoRA Adapters
We only update a small amount of parameters for efficient fine-tuning

In [7]:
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = False, # Turn off for just text!
    finetune_language_layers   = True,  # Should leave on!
    finetune_attention_modules = True,  # Attention good for GRPO
    finetune_mlp_modules       = True,  # Should leave on always!

    r = 16,          # Larger = higher accuracy, increased for mental health
    lora_alpha = 16, # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
)

Unsloth: Making `model.base_model.model.model.language_model` require gradients


### Setup Training Configuration

In [8]:
from trl import SFTTrainer, SFTConfig

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset["train"],
    eval_dataset = dataset["test"], # Can set up evaluation!
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 4,
        warmup_steps = 10,
        #num_train_epochs = 1, # Full training run
        max_steps = 100, # Or use max_steps for quick testing
        learning_rate = 2e-4,
        logging_steps = 50,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
        gradient_checkpointing = True, # Disable gradient checkpointing as a workaround
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/12022 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/1336 [00:00<?, ? examples/s]

### Train Only on Assistant Responses
This helps increase accuracy by masking out the user inputs during training

In [9]:
from unsloth.chat_templates import train_on_responses_only

trainer = train_on_responses_only(
    trainer,
    instruction_part = "<start_of_turn>user\n",
    response_part = "<start_of_turn>model\n",
)

Map (num_proc=12):   0%|          | 0/12022 [00:00<?, ? examples/s]

Map (num_proc=12):   0%|          | 0/1336 [00:00<?, ? examples/s]

### Verify Masking
Let's check that the instruction masking is working correctly

In [10]:
print("Full conversation:")
print(tokenizer.decode(trainer.train_dataset[5]["input_ids"]))
print("\n" + "="*50 + "\n")
print("Masked (only assistant response will be trained):")
print(tokenizer.decode([tokenizer.pad_token_id if x == -100 else x for x in trainer.train_dataset[5]["labels"]]).replace(tokenizer.pad_token, " "))

Full conversation:
<bos><start_of_turn>user
I'm struggling with grief and loss.<end_of_turn>
<start_of_turn>model
Grief and loss can be a difficult experience, but it's possible to work through the emotions and find healing. Let's work together to identify any underlying beliefs or experiences that may be contributing to your grief and develop strategies to manage the emotions. It may also be helpful to seek out support from loved ones or grief-specific therapy or support groups.<end_of_turn>



Masked (only assistant response will be trained):
                  Grief and loss can be a difficult experience, but it's possible to work through the emotions and find healing. Let's work together to identify any underlying beliefs or experiences that may be contributing to your grief and develop strategies to manage the emotions. It may also be helpful to seek out support from loved ones or grief-specific therapy or support groups.<end_of_turn>



### Check Memory Stats

In [11]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA L4. Max memory = 22.161 GB.
9.379 GB of memory reserved.


### Train the Model!
To resume a training run, set `trainer.train(resume_from_checkpoint = True)`

In [12]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 12,022 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 38,420,480 of 7,888,398,672 (0.49% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
50,2.6311
100,1.9844


### Training Stats

In [13]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

652.2686 seconds used for training.
10.87 minutes used for training.
Peak reserved memory = 14.281 GB.
Peak reserved memory for training = 4.902 GB.
Peak reserved memory % of max memory = 64.442 %.
Peak reserved memory for training % of max memory = 22.12 %.


### Test the Mental Health Assistant
Let's test the fine-tuned model on mental health queries

In [14]:
from transformers import TextStreamer

def chat_with_mental_health_assistant(user_message, max_new_tokens=256):
    messages = [{
        "role": "user",
        "content": [{"type": "text", "text": user_message}]
    }]

    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt = True,
        return_tensors = "pt",
        tokenize = True,
        return_dict = True,
    ).to("cuda")

    _ = model.generate(
        **inputs,
        max_new_tokens = max_new_tokens,
        temperature = 0.7,  # Lower temperature for more focused responses
        top_p = 0.9,
        top_k = 50,
        streamer = TextStreamer(tokenizer, skip_prompt = True),
    )

In [15]:
# Test 1: Anxiety
print("=" * 50)
print("User: I've been feeling very anxious lately and I don't know why.")
print("=" * 50)
chat_with_mental_health_assistant("I've been feeling very anxious lately and I don't know why.")

User: I've been feeling very anxious lately and I don't know why.
It's common to experience anxiety without a clear trigger. Let's explore your recent experiences and see if we can identify any patterns or potential stressors. We can also discuss relaxation techniques and coping strategies to help manage your anxiety.<end_of_turn>


In [16]:
# Test 2: Stress
print("\n" + "=" * 50)
print("User: I'm feeling overwhelmed with work and personal life. What should I do?")
print("=" * 50)
chat_with_mental_health_assistant("I'm feeling overwhelmed with work and personal life. What should I do?")


User: I'm feeling overwhelmed with work and personal life. What should I do?
It's common to feel overwhelmed with work and personal life at times. Let's work together to identify your priorities and develop a plan to manage your time and energy effectively. This may include delegating tasks, setting boundaries, and practicing self-care.<end_of_turn>


In [17]:
# Test 3: Depression symptoms
print("\n" + "=" * 50)
print("User: I've lost interest in things I used to enjoy. Is this normal?")
print("=" * 50)
chat_with_mental_health_assistant("I've lost interest in things I used to enjoy. Is this normal?")


User: I've lost interest in things I used to enjoy. Is this normal?
It's normal to experience a temporary loss of interest in things you used to enjoy, especially after a stressful period. However, if this persists for more than a few weeks or is accompanied by other symptoms like fatigue, sleep disturbances, or changes in appetite, it could be a sign of depression. It's important to consult with a healthcare professional for proper diagnosis and treatment.<end_of_turn>


In [18]:
# Test 4: Self-care
print("\n" + "=" * 50)
print("User: What are some healthy coping mechanisms for stress?")
print("=" * 50)
chat_with_mental_health_assistant("What are some healthy coping mechanisms for stress?")


User: What are some healthy coping mechanisms for stress?
Healthy coping mechanisms for stress include exercise, spending time in nature, practicing mindfulness and meditation, engaging in hobbies, and connecting with loved ones. It's also important to prioritize sleep and a balanced diet.<end_of_turn>


### Save the Model
Save the LoRA adapters locally

In [19]:
model.save_pretrained("gemma-3n-mental-health")  # Local saving
tokenizer.save_pretrained("gemma-3n-mental-health")
print("Model saved successfully!")

Model saved successfully!


### Optional: Push to Hugging Face Hub

In [20]:
# Uncomment and add your token to upload
# model.push_to_hub("YOUR_HF_USERNAME/gemma-3n-mental-health", token = "hf_...")
# tokenizer.push_to_hub("YOUR_HF_USERNAME/gemma-3n-mental-health", token = "hf_...")

### Load Model for Future Use

In [21]:
# To load the fine-tuned model later:
if False:  # Set to True to load
    from unsloth import FastModel
    model, tokenizer = FastModel.from_pretrained(
        model_name = "gemma-3n-mental-health",
        max_seq_length = 2048,
        load_in_4bit = True,
    )

### Export to GGUF (for llama.cpp)

In [22]:
# Save to GGUF format for local deployment
if False:  # Set to True to export
    model.save_pretrained_gguf(
        "gemma-3n-mental-health-gguf",
        quantization_type = "Q8_0",  # Q8_0, BF16, or F16
    )

### Export to float16 (for deployment)

In [23]:
# Save merged model in float16
if True:  # Set to True to export
    model.save_pretrained_merged("gemma-3n-mental-health-f16", tokenizer)

config.json: 0.00B [00:00, ?B/s]

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Checking cache directory for required files...
Cache check failed: model-00001-of-00004.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.08G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  25%|██▌       | 1/4 [00:10<00:31, 10.66s/it]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  50%|█████     | 2/4 [00:36<00:39, 19.57s/it]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  75%|███████▌  | 3/4 [02:12<00:54, 54.33s/it]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.66G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|██████████| 4/4 [02:21<00:00, 35.31s/it]
Unsloth: Merging weights into 16bit: 100%|██████████| 4/4 [00:53<00:00, 13.27s/it]


Unsloth: Merge process complete. Saved to `/content/gemma-3n-mental-health-f16`


In [None]:
import torch
torch.cuda.empty_cache()
print("CUDA cache cleared.")

## Local Inference for fully merged model

Only run if you have enough VRAM. Otherwise, you can choose to deploy to cloud

In [2]:
from unsloth import FastModel
from transformers import TextStreamer
import torch # Import torch for dtype

# Load the merged model with bfloat16 precision
model, tokenizer = FastModel.from_pretrained(
    model_name = "gemma-3n-mental-health-f16", # Path to the merged model
    max_seq_length = 2048,
    dtype = torch.bfloat16, # Use bfloat16 for the merged model
    load_in_4bit = False, # Not needed for merged model
)

def chat_with_merged_model(user_message, max_new_tokens=256):
    messages = [{
        "role": "user",
        "content": [{"type": "text", "text": user_message}]
    }]

    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt = True,
        return_tensors = "pt",
        tokenize = True,
        return_dict = True,
    ).to("cuda") # Move to CUDA for inference

    _ = model.generate(
        **inputs,
        max_new_tokens = max_new_tokens,
        temperature = 0.7,
        top_p = 0.9,
        top_k = 50,
        streamer = TextStreamer(tokenizer, skip_prompt = True),
    )

# Example usage
q = "I've been feeling very anxious lately and I don't know why."
print("User: {}".format(q))
#chat_with_merged_model(q) #uncomment if you can run locally

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.10.1: Fast Gemma3N patching. Transformers: 4.55.4.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

User: I've been feeling very anxious lately and I don't know why.


In [3]:
chat_with_merged_model(q) #uncomment if you can run locally

It's common to experience anxiety without a clear trigger. Let's explore your life and see if there are any potential stressors or patterns that might be contributing to your anxiety. We can also discuss relaxation techniques and coping strategies to help manage your symptoms.<end_of_turn>


#Custom (does not work rn): Import Model to Vertex AI


## Deploy to Vertex AI as Custom Model

In [27]:
from google.colab import auth
auth.authenticate_user()

## Define GCP Project and Bucket

### Subtask:
Specify your Google Cloud project ID and a Cloud Storage bucket name where the model will be stored.

In [None]:
PROJECT_ID = "project-id" # @param {type:"string"}
BUCKET_NAME = "bucket-name" # @param {type:"string"}
REGION = "region" # @param {type:"string"}

# Ensure the bucket name is valid
if not BUCKET_NAME or "your-gcp-bucket-name" in BUCKET_NAME:
    raise ValueError("Please replace 'your-gcp-bucket-name' with your actual bucket name.")

# Create the bucket if it doesn't exist, explicitly setting the project
!gsutil -p {PROJECT_ID} mb -l {REGION} gs://{BUCKET_NAME}

print(f"Project ID: {PROJECT_ID}")
print(f"Bucket Name: {BUCKET_NAME}")
print(f"Region: {REGION}")

## Upload Merged Model to Cloud Storage

Upload the saved merged model files (from the `gemma-3n-mental-health-f16` directory) to the specified Cloud Storage bucket.

In [None]:
# Upload the merged model directory to the bucket
!gsutil -m cp -r gemma-3n-mental-health-f16 gs://{BUCKET_NAME}/gemma-3n-mental-health-f16

print(f"Model uploaded to gs://{BUCKET_NAME}/gemma-3n-mental-health-f16")

In [None]:
from google.cloud import aiplatform

# Initialize Vertex AI
aiplatform.init(project=PROJECT_ID, location=REGION)

# Define model parameters
MODEL_DISPLAY_NAME = "gemma-3n-mental-health-assistant"
ARTIFACT_URI = f"gs://{BUCKET_NAME}/gemma-3n-mental-health-f16"
SERVING_CONTAINER_IMAGE = "us-docker.pkg.dev/vertex-ai/vertex-ai-huggingface-inference/transformers-tgi-nvidia-l4:latest" # Example image for L4 GPU

# Import the model
model = aiplatform.Model.upload(
    display_name=MODEL_DISPLAY_NAME,
    artifact_uri=ARTIFACT_URI,
    serving_container_image_uri=SERVING_CONTAINER_IMAGE,
    sync=True, # Wait for the import to complete
)

print(f"Model uploaded to Vertex AI: {model.resource_name}")

## Create Vertex AI Endpoint

Create a Vertex AI Endpoint resource where the model will be deployed for online predictions.

In [None]:
# Create an endpoint
endpoint = aiplatform.Endpoint.create(
    display_name=f"{MODEL_DISPLAY_NAME}_endpoint",
    project=PROJECT_ID,
    location=REGION,
)

print(f"Endpoint created: {endpoint.resource_name}")

## Deploy Model to Endpoint

Deploy the imported Vertex AI Model to the created Endpoint. This step involves configuring the deployment, such as specifying the machine type and the number of replicas.

In [None]:
# Define deployment parameters
MACHINE_TYPE = "g2-standard-8" # @param {type:"string"} # Machine type with L4 GPU
ACCELERATOR_TYPE = "NVIDIA_L4" # @param {type:"string"} # L4 GPU
ACCELERATOR_COUNT = 1 # @param {type:"integer"}

# Deploy the model to the endpoint
endpoint.deploy(
    model=model,
    deployed_model_display_name=MODEL_DISPLAY_NAME,
    machine_type=MACHINE_TYPE,
    accelerator_type=ACCELERATOR_TYPE,
    accelerator_count=ACCELERATOR_COUNT,
    sync=True, # Wait for the deployment to complete
)

print(f"Model deployed to endpoint: {endpoint.resource_name}")

## Important Disclaimer

⚠️ **This model is for educational and research purposes only.** It should NOT replace professional mental health care.

- This AI assistant provides general emotional support and information
- It is NOT a substitute for professional therapy or medical advice
- In crisis situations, please contact:
  - **National Suicide Prevention Lifeline**: 988 (US)
  - **Crisis Text Line**: Text HOME to 741741
  - **International Association for Suicide Prevention**: https://www.iasp.info/resources/Crisis_Centres/

Always consult with qualified mental health professionals for diagnosis and treatment.