In [2]:
def copy_file_to_docker(file_path, docker_path):
    """Copy a file from local to GCP VM and then into the Docker container."""
    filename = file_path.split("/")[-1] if "/" in file_path else file_path.split("\\")[-1]
    app_data_path = !dir "%APPDATA%"
    app_data_path = app_data_path[3].split()[-1]
    app_data_path = app_data_path.replace("\\", "/")
    username = app_data_path.split("/")[2]
    # Copy file to VM
    !gcloud compute scp --zone=us-central1-a {file_path} load-testing-instance:/home/{username}
    # Copy file from VM to Docker container
    copy_command = f"docker cp /home/{username}/{filename} load-testing-container:{docker_path}"
    !gcloud compute ssh load-testing-instance --zone=us-central1-a --command="{copy_command}"

def run_docker_command(inner_command):
    """
    Run a command inside load-testing-container on the GCP VM via SSH using docker exec.
    """
    docker_exec_template = "docker exec load-testing-container bash -c '{cmd}'"
    docker_command = docker_exec_template.format(cmd=inner_command)
    ssh_command = f"gcloud compute ssh load-testing-instance --zone=us-central1-a --command=\"{docker_command}\""
    !{ssh_command}

def copy_dir_to_docker(dir_path, docker_path):
    """Copy a directory from local to GCP VM and then into the Docker container."""
    dir_name = dir_path.split("/")[-1] if "/" in dir_path else dir_path.split("\\")[-1]
    username = dir_path.split("/")[2] if "/" in dir_path else dir_path.split("\\")[-2]
    # Clean up the VM directory 
    rm_command = f"sudo rm -rf /home/{username}/{dir_name}"  # Clean up the VM directory after copying
    docker_rm_command = f"docker exec load-testing-container rm -rf {docker_path}{dir_name}"  # Clean up the Docker directory
    !gcloud compute ssh load-testing-instance --zone=us-central1-a --command="{rm_command}"
    !gcloud compute ssh load-testing-instance --zone=us-central1-a --command="{docker_rm_command}"
    # Copy directory to VM
    print(f"gcloud compute scp --zone=us-central1-a --recurse {dir_path} load-testing-instance:/home/{username}/")
    !gcloud compute scp --zone=us-central1-a --recurse {dir_path} load-testing-instance:/home/{username}/
    # Copy directory from VM to Docker container
    copy_command = f"docker cp /home/{username}/{dir_name} load-testing-container:{docker_path}"
    !gcloud compute ssh load-testing-instance --zone=us-central1-a --command="{copy_command}"

def run_vm_command(command):
    """
    Run a command on the GCP VM via SSH.
    """
    ssh_command = f"gcloud compute ssh load-testing-instance --zone=us-central1-a --command=\"{command}\""
    !{ssh_command}

def copy_docker_file_to_local(docker_path, local_path):
    """Copy a file or directory from the Docker container to local."""
    # Get username from VM
    app_data_path = !dir "%APPDATA%"
    app_data_path = app_data_path[3].split()[-1]
    app_data_path = app_data_path.replace("\\", "/")
    username = app_data_path.split("/")[2]
    filename = docker_path.split("/")[-1] if "/" in docker_path else docker_path.split("\\")[-1]
    
    # Copy file from Docker container to VM home directory
    copy_command = f"docker cp load-testing-container:{docker_path} /home/{username}/{filename}"
    !gcloud compute ssh load-testing-instance --zone=us-central1-a --command="{copy_command}"
    # Copy file from VM to local
    copy_command = f"gcloud compute scp --zone=us-central1-a load-testing-instance:/home/{username}/{filename} {local_path}"
    !{copy_command}

In [None]:
# [SKIP] RUN LOAD TESTING VM
# Creates a disk from a snapshot and then an instance that boots from that disk.
# NOTE(review): marked [SKIP] — presumably only needed when the VM does not exist yet.
# `vm_name` and `zone` defined here are also read by later cells.
vm_name = "load-testing-instance"
snapshot_name = "load-testing-instance-snapshot"
zone = "us-central1-a"
# Disk is named "<vm_name>-disk" and restored from `snapshot_name`.
!gcloud compute disks create {vm_name}-disk --source-snapshot={snapshot_name} --zone={zone}
# Boot an e2-standard-8 instance from that disk (auto-delete=yes is set on the disk).
!gcloud compute instances create {vm_name} --zone={zone} --disk=name={vm_name}-disk,boot=yes,auto-delete=yes --machine-type=e2-standard-8
# Disabled: the rule below would allow all ingress traffic from 0.0.0.0/0.
# !gcloud compute firewall-rules create allow-all --direction=INGRESS --priority=1000 --network=default --action=ALLOW --rules=all --source-ranges=0.0.0.0/0

NAME                        ZONE           SIZE_GB  TYPE         STATUS
load-testing-instance-disk  us-central1-a  30       pd-standard  READY


Created [https://www.googleapis.com/compute/v1/projects/fast-learner-project/zones/us-central1-a/disks/load-testing-instance-disk].


NAME                   ZONE           MACHINE_TYPE   PREEMPTIBLE  INTERNAL_IP  EXTERNAL_IP   STATUS
load-testing-instance  us-central1-a  e2-standard-8               10.128.0.17  34.41.103.73  RUNNING


Created [https://www.googleapis.com/compute/v1/projects/fast-learner-project/zones/us-central1-a/instances/load-testing-instance].


In [5]:
# CHECK IF VM IS RUNNING
vm_name = "load-testing-instance"
vm_status = !gcloud compute instances describe {vm_name} --zone={zone} --format="get(status)"
if vm_status[0] == "RUNNING":
    print(f"VM {vm_name} is running.")
else:
    print(f"VM {vm_name} is not running. Current status: {vm_status[0]}")

VM load-testing-instance is not running. Current status: ERROR: (gcloud.compute.instances.describe) Could not fetch resource:


In [1]:
# [SKIP] Add the SSH user to the docker group so docker commands run without sudo
# run_vm_command("sudo usermod -aG docker $USER")

# Restart the VM (uncomment the stop command to stop it first)
# !gcloud compute instances stop load-testing-instance --zone=us-central1-a
!gcloud compute instances start load-testing-instance --zone=us-central1-a

Starting instance(s) load-testing-instance...
...........done.
Updated [https://compute.googleapis.com/compute/v1/projects/fast-learner-project/zones/us-central1-a/instances/load-testing-instance].
Instance internal IP is 10.128.0.17
Instance external IP is 34.69.115.89


In [7]:
# List the containers currently running on the VM.
run_vm_command("docker ps")

CONTAINER ID   IMAGE     COMMAND   CREATED   STATUS    PORTS     NAMES


In [8]:
# RUN LOAD TESTING CONTAINER
# Kept for reference (first-time creation from the image).
# NOTE(review): `-d`/`--name` are `docker run` options, not `docker start` options — confirm before uncommenting.
# docker_command = "docker start -d --name load-testing-container auliadil/load-testing-rodrigues:v1 tail -f /dev/null"
# Start the already-created container on the VM over SSH.
docker_command = "docker start load-testing-container"
gcloud_template = "gcloud compute ssh load-testing-instance --zone=us-central1-a"
!{gcloud_template} --command="{docker_command}"

load-testing-container


In [2]:
# CHECK IF CONTAINER IS RUNNING
container_status = !gcloud compute ssh load-testing-instance --zone={zone} --command="docker ps -q --filter 'name=load-testing-container'"
if container_status:
    print("Container is running.")
else:
    print("Container is not running")

Container is running.


In [12]:
# COPY FILE
# Upload the Locust script into the container's Serialization_Datasets directory.
copy_file_to_docker("locust-testing.py", "/app/large-scale-online-learning/MLOps-Architecture/Serialization_Datasets/")
# copy_file_to_docker("check-kafka-lag.py", "/app/large-scale-online-learning/MLOps-Architecture/Serialization_Datasets/")

# Disabled: copy the local gcloud configuration directory into the container.
# app_data_path = !dir "%APPDATA%"
# app_data_path = app_data_path[3].split()[-1]
# app_data_path = app_data_path.replace("\\", "/")
# copy_dir_to_docker(f'{app_data_path}/gcloud', "/root/.config/gcloud")


locust-testing.py         | 3 kB |   3.4 kB/s | ETA: 00:00:00 | 100%


In [None]:

# Verify the upload by printing the copied script from inside the container
path = "/app/large-scale-online-learning/MLOps-Architecture/Serialization_Datasets"
run_docker_command(f"cat {path}/locust-testing.py")
# run_docker_command("cat " + path + "/check-kafka-lag.py")

In [24]:
# Check GCloud
# Read the project configured in the local gcloud CLI (first line of the output).
projects = !gcloud config get-value project
# Commands to run inside the container: select that project, then fetch kubectl
# credentials for the "two-node-cluster" cluster.
set_project_command = f"gcloud config set project {projects[0]}"
get_kubectl_command = "gcloud container clusters get-credentials two-node-cluster --zone us-central1-a"
run_docker_command(set_project_command)
run_docker_command(get_kubectl_command)
# List pods from inside the container to check that kubectl works there.
run_docker_command("kubectl get pods")
# run_docker_command("gcloud config get-value project")

Updated property [core/project].
Fetching cluster endpoint and auth data.
kubeconfig entry generated for two-node-cluster.


NAME                              READY   STATUS    RESTARTS   AGE
api-inferencia-7dc65cbb55-4mxqf   1/1     Running   0          135m
api-inferencia-7dc65cbb55-4vvkv   1/1     Running   0          135m
api-inferencia-7dc65cbb55-kmxhz   1/1     Running   0          135m
api-inferencia-7dc65cbb55-qblvk   1/1     Running   0          135m
api-update-74d8cbcf86-sbt2j       1/1     Running   0          135m
mlflow-6f49c6df8c-fljfb           1/1     Running   0          112m


In [6]:
# RUN THE TEST
locust_path = "/app/large-scale-online-learning/MLOps-Architecture/Serialization_Datasets/locust-testing.py"
check_kafka_lag_dir_path = "/app/large-scale-online-learning/MLOps-Architecture/Serialization_Datasets"
# `loads` is used both as the number of simulated users and as the spawn rate.
loads = 10000
locust_command = f"locust -f {locust_path} --users {loads} --spawn-rate {loads} --headless --csv=result_testing"
# Start the Kafka lag script in the background; its stdout/stderr go to kafka_lag.log.
run_docker_command(f"cd {check_kafka_lag_dir_path} && nohup python3 check-kafka-lag.py > kafka_lag.log 2>&1 &")
# Activate the virtualenv and start Locust headless in the background; output goes to
# locust.log and the CSV reports use the "result_testing" prefix.
run_docker_command(f"cd /app/large-scale-online-learning/ && source ../.python-venv/bin/activate && nohup {locust_command} > locust.log 2>&1 &")

In [7]:
# check whether the script is running
# The listing also contains the `bash -c` and `grep` processes of the check itself,
# because their command lines include the searched script name.
run_docker_command("ps aux | grep check-kafka-lag.py")
run_docker_command("ps aux | grep locust-testing.py")

root        5564  0.0  0.0   4324   244 ?        S    17:08   0:00 bash -c cd /app/large-scale-online-learning/MLOps-Architecture/Serialization_Datasets && nohup python3 check-kafka-lag.py > kafka_lag.log 2>&1 &
root        5565  0.0  0.0  90664 11952 ?        Sl   17:08   0:00 python3 check-kafka-lag.py
root        5637 50.0  0.0   4324  3296 ?        Ss   17:09   0:00 bash -c ps aux | grep check-kafka-lag.py
root        5644  0.0  0.0   3528  1652 ?        S    17:09   0:00 grep check-kafka-lag.py
root        5617  0.0  0.0   4324  2056 ?        S    17:08   0:00 bash -c cd /app/large-scale-online-learning/ && source ../.python-venv/bin/activate && nohup locust -f /app/large-scale-online-learning/MLOps-Architecture/Serialization_Datasets/locust-testing.py --users 10000 --spawn-rate 10000 --headless --csv=result_testing > locust.log 2>&1 &
root        5618  103  2.6 2201812 867144 ?      Rl   17:08   0:42 /root/venv/bin/python3 /root/venv/bin/locust -f /app/large-scale-online-learning

In [8]:
# kill the process if it is running
# `pkill -f` matches against the full command line of each process.
# NOTE(review): pkill exits with status 1 when nothing matches; that status presumably
# propagates through docker exec/SSH and is reported by gcloud as an SSH failure — confirm.
run_docker_command("pkill -f check-kafka-lag.py")
run_docker_command("pkill -f locust-testing.py")

In [51]:
# CHECK IF KAFKA LAG LOG IS GENERATED
# Print the last 10 lines of the lag log stored inside the container.
run_docker_command("cat /app/large-scale-online-learning/MLOps-Architecture/Serialization_Datasets/kafka_lag_log.csv | tail -n 10")

# clean kafka lag log
# run_docker_command("rm -rf /app/large-scale-online-learning/MLOps-Architecture/Serialization_Datasets/kafka_lag_log.csv")

1755252960.2039907,2428
1755253023.9973176,1837
1755253087.9653482,934
1755253151.6806755,150
1755253215.071712,0
1755253278.1660268,0
1755253341.2906547,0
1755253404.3765507,0
1755253467.6990356,0
1755253530.9016178,0


In [49]:
# CHECK IF LOCUST LOGS ARE GENERATED
# Print the last 10 lines of the Locust log stored inside the container.
run_docker_command("cat /app/large-scale-online-learning/locust.log | tail -n 10")

# Clean up locust log
# run_docker_command("rm -rf /app/large-scale-online-learning/locust.log")

POST     /predict                                                                      368286  3901(1.06%) |  16087       4  147961   8800 |  568.80        0.00
--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
         Aggregated                                                                    368286  3901(1.06%) |  16087       4  147961   8800 |  568.80        0.00

Type     Name                                                                          # reqs      # fails |    Avg     Min     Max    Med |   req/s  failures/s
--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
POST     /predict                                                                      370188  3901(1.05%) |  16124       4  147961   8800 |  503.20        0.00
--------|------------------------

In [None]:
# Check if Kubernetes pod is working
# update_pods = !kubectl get pods | findstr /R "upd"
# !kubectl exec {update_pods[0].split()[0]} -- ls
# !kubectl exec {update_pods[0].split()[0]} -- cat message_log.csv

# inference_pods = !kubectl get pods | findstr /R "inf"
# if inference_pods:
#     print(f"Inference pod found: {inference_pods[0]}")
#     !kubectl exec {inference_pods[0].split()[0]} -- ls
#     !kubectl exec {inference_pods[0].split()[0]} -- cat message_log.csv

In [None]:
# CHECK Resource Usage of Load Testing VM
# One batch-mode iteration of `top`, trimmed to its first 20 lines.
!gcloud compute ssh load-testing-instance --zone=us-central1-a --command="top -b -n 1 | head -n 20"

In [52]:
# GET VARIABLES
import ast

# Every non-empty, non-comment line of etc/variables.txt is read as KEY=VALUE and
# published as a notebook-level variable (value kept as a string).
with open("etc/variables.txt", "r") as f:
    for line_number, line in enumerate(f, start=1):
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        if "=" not in line:
            # Fail with the offending line instead of an opaque unpacking error.
            raise ValueError(
                f"etc/variables.txt line {line_number}: expected KEY=VALUE, got {line!r}"
            )
        key, value = line.split("=", 1)
        key = key.strip()
        value = value.strip()
        globals()[key] = value

# `experiment` arrives as text from the file. ast.literal_eval parses Python literals
# (e.g. a list of strings) without executing code, unlike eval(), which would run
# whatever expression the file contains.
if isinstance(experiment, str):
    experiment = ast.literal_eval(experiment)
# First element is the experiment name, second the experiment file.
if isinstance(experiment, (list, tuple)):
    experiment_name = experiment[0]
    experiment_file = experiment[1]

In [53]:
# Stop all running Python Load Testing scripts
# `pkill -f` matches against the full command line of each process.
# NOTE(review): pkill exits with status 1 when nothing matches; that status presumably
# propagates through docker exec/SSH and is reported by gcloud as an SSH failure — confirm.
run_docker_command("pkill -f check-kafka-lag.py")
run_docker_command("pkill -f locust-testing.py")


Recommendation: To check for possible causes of SSH connectivity issues and get
recommendations, rerun the ssh command with the --troubleshoot option.

gcloud compute ssh load-testing-instance --project=fast-learner-project --zone=us-central1-a --troubleshoot

Or, to investigate an IAP tunneling issue:

gcloud compute ssh load-testing-instance --project=fast-learner-project --zone=us-central1-a --troubleshoot --tunnel-through-iap

ERROR: (gcloud.compute.ssh) [C:\Users\muhammad.aulia\AppData\Local\Google\Cloud SDK\google-cloud-sdk\bin\sdk\plink.exe] exited with return code [1].


In [None]:
# GET INFERENCE RESULTS
inference_pods = !kubectl get pods | findstr /R "inf"
if inference_pods:
    filenames = !kubectl exec {inference_pods[0].split()[0]} -- ls | findstr /R "load_model"


for filename in filenames:
    !kubectl exec {inference_pods[0].split()[0]} -- tar -czf /app/{filename}.tar.gz /app/{filename}
    experiment_folder = f"experiment-results\\{experiment_name + '-' + str(loads) + 'VU'}\\inference-results"
    !mkdir {experiment_folder}
    !kubectl cp default/{inference_pods[0].split()[0]}:/app/{filename}.tar.gz {experiment_folder}/{filename}.tar.gz

In [None]:
# GET UPDATE RESULTS
pods = !kubectl get pods | findstr /R "upd"
!kubectl exec {pods[0].split()[0]} -- tar -czf /app/for_auc.tar.gz /app/for_auc.csv
!kubectl exec {pods[0].split()[0]} -- tar -czf /app/message_log.tar.gz /app/message_log.csv
!kubectl exec {pods[0].split()[0]} -- tar -czf /app/model_upload_latency.tar.gz /app/model_upload_latency.csv
experiment_folder = f"experiment-results\\{experiment_name + '-' + str(loads) + 'VU'}\\update-results"
!mkdir {experiment_folder}
run_docker_command(f"kubectl cp default/{pods[0].split()[0]}:/app/for_auc.tar.gz app/for_auc.tar.gz")
copy_docker_file_to_local("/app/for_auc.tar.gz", f"{experiment_folder}/for_auc.tar.gz")
run_docker_command(f"kubectl cp default/{pods[0].split()[0]}:/app/message_log.tar.gz app/message_log.tar.gz")
copy_docker_file_to_local("/app/message_log.tar.gz", f"{experiment_folder}/message_log.tar.gz")
run_docker_command(f"kubectl cp default/{pods[0].split()[0]}:/app/model_upload_latency.tar.gz app/model_upload_latency.tar.gz")
copy_docker_file_to_local("/app/model_upload_latency.tar.gz", f"{experiment_folder}/model_upload_latency.tar.gz")

tar: Removing leading `/' from member names

for_auc.tar.gz            | 32 kB |  32.0 kB/s | ETA: 00:04:28 |   0%
for_auc.tar.gz            | 64 kB |  64.0 kB/s | ETA: 00:02:13 |   0%
for_auc.tar.gz            | 736 kB | 368.0 kB/s | ETA: 00:00:21 |   8%
for_auc.tar.gz            | 1504 kB | 501.3 kB/s | ETA: 00:00:14 |  17%
for_auc.tar.gz            | 2432 kB | 608.0 kB/s | ETA: 00:00:10 |  28%
for_auc.tar.gz            | 3136 kB | 627.2 kB/s | ETA: 00:00:08 |  36%
for_auc.tar.gz            | 4032 kB | 672.0 kB/s | ETA: 00:00:06 |  46%
for_auc.tar.gz            | 4768 kB | 681.1 kB/s | ETA: 00:00:05 |  55%
for_auc.tar.gz            | 5664 kB | 708.0 kB/s | ETA: 00:00:04 |  65%
for_auc.tar.gz            | 6432 kB | 714.7 kB/s | ETA: 00:00:03 |  74%
for_auc.tar.gz            | 7296 kB | 729.6 kB/s | ETA: 00:00:01 |  84%
for_auc.tar.gz            | 8032 kB | 730.2 kB/s | ETA: 00:00:00 |  93%
for_auc.tar.gz            | 8610 kB | 782.8 kB/s | ETA: 00:00:00 | 100%


In [38]:
# Apply this export KUBECTL_REMOTE_COMMAND_WEBSOCKETS=false on Windows
!set KUBECTL_REMOTE_COMMAND_WEBSOCKETS=false

In [None]:
# check file size in /app
# Uses the first entry of `pods`, i.e. the pod list assigned by whichever cell set it last.
!kubectl exec {pods[0].split()[0]} -- ls -lh /app

In [None]:
# CHECK MLFLOW REPO
# !kubectl exec {pods[0].split()[0]} -- ls /mlartifacts

In [58]:
# GET MLFLOW RESULT
pods = !kubectl get pods | findstr /R "mlflow"
!kubectl exec {pods[0].split()[0]} -- tar -czf /mlartifacts/mlflow-results.tar.gz /mlartifacts/1
!kubectl exec {pods[0].split()[0]} -- tar -czf /mlartifacts/mlflow-db.tar.gz /mlartifacts/mlflow.db
experiment_folder = f"experiment-results\\{experiment_name + '-' + str(loads) + 'VU'}\\mlflow-results"
!kubectl cp default/{pods[0].split()[0]}:/mlartifacts/mlflow-results.tar.gz {experiment_folder}/mlflow-results.tar.gz
!kubectl cp default/{pods[0].split()[0]}:/mlartifacts/mlflow-db.tar.gz {experiment_folder}/mlflow-db.tar.gz 

tar: Removing leading `/' from member names
tar: Removing leading `/' from member names


tar: Removing leading `/' from member names
tar: Removing leading `/' from member names


In [59]:
# GET LOAD TESTING RESULT
# Downloads the Locust CSV reports and the Kafka lag log from the container into
# the experiment's load-test-results folder.
docker_path = "/app/large-scale-online-learning/"
docker_path_kafka_log = "/app/large-scale-online-learning/MLOps-Architecture/Serialization_Datasets/"
experiment_folder = f"experiment-results\\{experiment_name + '-' + str(loads) + 'VU'}\\load-test-results"

import os
os.makedirs(experiment_folder, exist_ok=True)

# Locust reports first, Kafka lag log last.
container_files = [
    f"{docker_path}result_testing_stats_history.csv",
    f"{docker_path}result_testing_exceptions.csv",
    f"{docker_path}result_testing_failures.csv",
    f"{docker_path}result_testing_stats.csv",
    f"{docker_path_kafka_log}kafka_lag_log.csv",
]
for container_file in container_files:
    copy_docker_file_to_local(container_file, experiment_folder)



result_testing_stats_hist | 32 kB |  32.0 kB/s | ETA: 00:00:04 |  19%
result_testing_stats_hist | 128 kB |  64.0 kB/s | ETA: 00:00:00 |  78%
result_testing_stats_hist | 164 kB |  82.0 kB/s | ETA: 00:00:00 | 100%

result_testing_exceptions | 0 kB |   0.0 kB/s | ETA: 00:00:00 | 100%

result_testing_failures.c | 0 kB |   0.3 kB/s | ETA: 00:00:00 | 100%

result_testing_stats.csv  | 0 kB |   0.6 kB/s | ETA: 00:00:00 | 100%

kafka_lag_log.csv         | 0 kB |   0.5 kB/s | ETA: 00:00:00 | 100%


In [60]:
# Delete all copied files in kubernetes
# pods = !kubectl get pods | findstr /R "upd"
!kubectl exec {pods[0].split()[0]} -- rm /app/message_log.csv
!kubectl exec {pods[0].split()[0]} -- rm /app/model_upload_latency.csv
!kubectl exec {pods[0].split()[0]} -- rm /app/for_auc.csv
pods = !kubectl get pods | findstr /R "inf"
for pod in pods:
    # Delete all files matching load_model*.csv at the end of the file in /app, suppress error if not found
    !kubectl exec {pods[0].split()[0]} -- bash -c "rm -f /app/load_model*.gz"
    !kubectl exec {pod.split()[0]} -- bash -c "rm -f /app/load_model*.csv"
    !kubectl exec {pod.split()[0]} -- bash -c "rm -f /app/api_inference_*.log"
run_docker_command("rm -rf /app/large-scale-online-learning/MLOps-Architecture/Serialization_Datasets/kafka_lag_log.csv")

rm: cannot remove '/app/message_log.csv': No such file or directory
command terminated with exit code 1
rm: cannot remove '/app/model_upload_latency.csv': No such file or directory
command terminated with exit code 1
rm: cannot remove '/app/for_auc.csv': No such file or directory
command terminated with exit code 1


In [None]:
# List the contents of /app on every pod whose `kubectl get pods` row contains "inf"
pods = !kubectl get pods | findstr /R "inf"
for pod in pods:
    # The first whitespace-separated column of the row is the pod name.
    !kubectl exec {pod.split()[0]} -- ls /app