Bitswan Gitops Webhook Pipeline
-------------------------------

This pipeline allows you to easily deploy Bitswan pipelines to any server. You just need to POST to the webhook from your CI and your pipelines will be running in no time.

To install Bitswan, initialize a git repository on the server where you want to run your Bitswan installation, then run the `bitswan-pre` Docker image with the following environment variables set:

- `BS_WEBHOOK_PORT` - defaults to 8080
- `BS_WEBHOOK_SECRET` - A secret that will be added to your webhook URL. Defaults to `not-secure`, so always set it to your own value. To deploy, POST `{"action": "deploy-git"}` to the URL `https://localhost:<BS_WEBHOOK_PORT>/?secret=<BS_WEBHOOK_SECRET>`
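- `BS_BITSWAN_DIR` - The directory where your `bitswan.yaml` file resides. It should be inside a checked-out git repository. Defaults to `/mnt/repo/bitswan`.
- `BS_HOST_BASE_PATH` - The base path on the host whose `dev/` subdirectory is mounted at `/mnt` in the IDE containers. Defaults to `/opt/bitswan-pipelines`.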

Simply adding the following curl command to your CI/CD pipeline should be enough to automatically deploy your data machines:

```
curl -X POST "https://<WEBHOOK_URL>:<BS_WEBHOOK_PORT>/?secret=<BS_WEBHOOK_SECRET>" -d '{"action": "deploy-git"}'
```

In [None]:
from bspump.jupyter import *
import bspump.http.web.source
import bspump.common
import bspump.mqtt
import json
import os
import asyncio
import yaml
import datetime
from dockerfile_parse import DockerfileParser
from bspump.abc.generator import Generator
import docker
import docker.errors
import subprocess
import re
import bspump.kafka

In [None]:
@register_connection
def connection(app):
    """Create the MQTT connection registered under the id ``"MQTTConnection"``.

    The same id is used elsewhere in this notebook to look the connection up
    again (the MQTT source and the IDE log streamer).
    """
    mqtt_connection = bspump.mqtt.MQTTConnection(app, "MQTTConnection")
    return mqtt_connection

In [None]:
# Open the definition of the webhook-driven deployment pipeline; the matching
# end_pipeline() call appears further down in this notebook.
# NOTE(review): presumably every source/step/sink declared in between is
# attached to this pipeline -- confirm against bspump.jupyter.
new_pipeline("BitswanGitopsWebHookPipeline")

In [None]:
@register_source
def source(app, pipeline):
    """Build the webhook source for the deployment pipeline.

    The port comes from ``BS_WEBHOOK_PORT`` (default 8080) and the secret
    from ``BS_WEBHOOK_SECRET``.

    NOTE(review): when ``BS_WEBHOOK_SECRET`` is unset the secret falls back to
    the fixed value ``"not-secure"``.
    """
    webhook_config = {
        "port": int(os.environ.get("BS_WEBHOOK_PORT", 8080)),
        "path": "/",
        "secret_qparam": os.environ.get("BS_WEBHOOK_SECRET", "not-secure"),
    }
    return bspump.http.web.source.WebHookSource(app, pipeline, config=webhook_config)

In [None]:
# Example of the raw (bytes) webhook body, in the shape parse_json expects.
# NOTE(review): presumably used by bspump.jupyter to exercise the following
# steps when the notebook is run interactively -- confirm.
sample_events([
    b"""{"action": "deploy-git"}"""
])

In [None]:
@step
def parse_json(event):
    """Decode the raw JSON webhook body into a Python object."""
    payload = json.loads(event)
    return payload

In [None]:
@step
def get_bitswan_dir(event):
    """Add the directory settings taken from the environment to the event.

    Sets ``event["bitswan_dir"]`` from ``BS_BITSWAN_DIR`` and
    ``event["host_base_path"]`` from ``BS_HOST_BASE_PATH``, each with its
    built-in default, and returns the same event.
    """
    settings = (
        ("bitswan_dir", "BS_BITSWAN_DIR", "/mnt/repo/bitswan"),
        ("host_base_path", "BS_HOST_BASE_PATH", "/opt/bitswan-pipelines"),
    )
    for key, env_name, fallback in settings:
        event[key] = os.environ.get(env_name, fallback)
    return event

In [None]:
# Example of an already-parsed event carrying a "bitswan_dir".
# NOTE(review): the path points into one developer's home directory and will
# not exist on other machines -- presumably only meaningful for local
# interactive runs; confirm.
sample_events([{'action': 'deploy-git', "bitswan_dir":"/Users/lukasvecerka/Work/BitSwan/Development/repo/datamachines"}])

In [None]:
@async_step
async def git_pull(inject, event):
    """Pull the latest changes into the git checkout containing ``bitswan_dir``.

    The repository root is found by walking up from ``bitswan_dir`` until a
    directory containing ``.git`` is seen, and is stored in
    ``event["git_repo_root"]``. If no ``.git`` directory is found the walk
    ends at the filesystem root and that value is stored instead.

    The two ``git config`` commands are now waited for before ``git pull``
    starts; previously they were only spawned, so the pull could run before
    the configuration had been written. A failing ``git config`` is reported
    but does not stop the pull, as before.

    The event is injected downstream only when ``git pull`` exits with 0.
    """
    bitswan_dir = event["bitswan_dir"]
    print("Pulling in new changes.")

    # Walk up from bitswan_dir until a directory containing .git is found.
    git_repo_root = os.path.abspath(bitswan_dir)
    while git_repo_root != os.path.dirname(git_repo_root):
        if os.path.isdir(os.path.join(git_repo_root, '.git')):
            break
        git_repo_root = os.path.dirname(git_repo_root)
    event["git_repo_root"] = git_repo_root

    # (argv, cwd) pairs; cwd=None keeps the current working directory.
    setup_commands = (
        (("git", "config", "--global", "--add", "safe.directory", git_repo_root), None),
        (("git", "config", "pull.rebase", "false"), bitswan_dir),
    )
    for argv, cwd in setup_commands:
        config_process = await asyncio.create_subprocess_exec(*argv, cwd=cwd)
        # Wait for completion so the setting is in place before pulling.
        if await config_process.wait() != 0:
            print(f"Warning: {' '.join(argv)} exited with code {config_process.returncode}.")

    pull_process = await asyncio.create_subprocess_exec("git", "pull", cwd=bitswan_dir)
    await pull_process.wait()

    if pull_process.returncode != 0:
        print("Failed to pull in changes.")
        return

    print("Done pulling in changes.")
    await inject(event)

In [None]:
@step
def load_bitswan_yaml(event):
    """Load ``bitswan.yaml`` from ``event["bitswan_dir"]`` into ``event["yaml"]``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes (it was previously left for the garbage collector).
    """
    yaml_path = os.path.join(event["bitswan_dir"], "bitswan.yaml")
    with open(yaml_path) as yaml_file:
        # NOTE(review): full_load can build more than plain data types;
        # consider yaml.safe_load if the repository content is not fully
        # trusted.
        event["yaml"] = yaml.full_load(yaml_file)
    return event

In [None]:
def get_deployment_info(client, deployment_ids):
    """Collect the current state of every requested deployment.

    Args:
        client: Docker client; only ``client.containers.list()`` is used.
        deployment_ids: Iterable of deployment ids to report on.

    Returns:
        A dict keyed by deployment id. Each value has the keys
        ``deployment_id``, ``running``, ``container``, ``recreate`` and
        ``build_commit``. For ids matched by a listed container (through its
        ``DEPLOYMENT_ID`` environment variable) ``running`` is True,
        ``container`` is that container and ``build_commit`` is its
        ``built.from`` label, or None when the label is absent.

    Containers without the ``built.from`` label, or with ``Env``/``Labels``
    set to None, no longer raise; the missing values are treated as absent.
    """
    deployment_info = {
        deployment_id: {
            "deployment_id": deployment_id,
            "running": False,
            "container": None,
            "recreate": False,
            "build_commit": None,
        }
        for deployment_id in deployment_ids
    }

    for container in client.containers.list():
        config = container.attrs['Config']
        # Env and Labels can be None; partition() tolerates entries that
        # contain no '=' at all.
        container_envs = {}
        for var in config.get('Env') or []:
            name, _, value = var.partition('=')
            container_envs[name] = value

        info = deployment_info.get(container_envs.get('DEPLOYMENT_ID'))
        if info is None:
            continue

        labels = config.get('Labels') or {}
        info["running"] = True
        info["container"] = container
        info["build_commit"] = labels.get('built.from')
    return deployment_info

In [None]:
def get_current_commit(path):
    """Return the hash of ``HEAD`` for the git checkout at ``path``.

    Args:
        path: Directory in which ``git rev-parse HEAD`` is run.

    Returns:
        The commit hash as a string, or None when it cannot be determined.
        That covers git exiting with an error as well as git or ``path`` not
        existing, which previously raised ``OSError`` instead of following
        this function's return-None convention.
    """
    try:
        output = subprocess.check_output(['git', 'rev-parse', 'HEAD'], text=True, cwd=path)
    except (subprocess.CalledProcessError, OSError) as e:
        print(f"Error obtaining current Git commit: {e}")
        return None
    return output.strip()

In [None]:
def pump_folder_changed(last_commit, pump_path):
    """Tell whether anything under ``pump_path`` differs between ``last_commit`` and ``HEAD``.

    Args:
        last_commit: Commit the running container was built from; may be
            None or empty when unknown.
        pump_path: Directory of the pump; also used as git's working directory.

    Returns:
        True when ``git diff --name-only`` lists at least one file. Also True
        when the comparison cannot be made (no ``last_commit``, or git exits
        with an error such as an unknown commit), so the caller rebuilds
        rather than keeping a container of unknown provenance.
    """
    if not last_commit:
        return True

    # '--' keeps git from interpreting the path as a revision.
    result = subprocess.run(
        ['git', 'diff', '--name-only', last_commit, 'HEAD', '--', pump_path],
        stdout=subprocess.PIPE,
        text=True,
        cwd=pump_path,
    )
    if result.returncode != 0:
        return True

    return bool(result.stdout.strip())

In [None]:
def parse_dockerfile(parser, dockerfile_path, envs):
    """List the source paths referenced by ``COPY``/``ADD`` in a Dockerfile.

    Args:
        parser: Object with a writable ``content`` attribute and a
            ``structure`` attribute yielding dicts with ``instruction`` and
            ``value`` keys (a ``DockerfileParser`` in this notebook).
        dockerfile_path: Path of the Dockerfile to read.
        envs: Mapping of variable name to value, used to expand ``$NAME`` and
            ``${NAME}`` references in the paths.

    Returns:
        The source operands (every operand except the last, which is the
        destination) with variables expanded, in Dockerfile order.

    Changes from the previous implementation:
        * Leading ``--flag`` options (e.g. ``--chown=...``) are not returned
          as files, and ``COPY --from=...`` statements are ignored entirely
          because their sources come from another stage/image rather than
          the build context.
        * Variables are matched by their complete name, so ``$VARIABLE`` is
          no longer mangled by a variable called ``VAR``.
        * Paths that still contain ``$`` after expansion are dropped. The
          earlier ``continue`` meant for this only advanced the inner loop,
          so such paths were returned unresolved.
    """
    with open(dockerfile_path, 'r') as f:
        parser.content = f.read()

    sources = []
    for instr in parser.structure:
        if instr['instruction'] not in ('COPY', 'ADD'):
            continue
        operands = instr['value'].split()
        flags = []
        while operands and operands[0].startswith('--'):
            flags.append(operands.pop(0))
        if any(flag.startswith('--from=') for flag in flags):
            continue
        sources.extend(operands[:-1])

    variable_reference = re.compile(r'\$\{(\w+)\}|\$(\w+)')

    def expand(match):
        # Unknown variables are left as written so they can be detected below.
        name = match.group(1) or match.group(2)
        return envs.get(name, match.group(0))

    resolved_file_list = []
    for source in sources:
        resolved = variable_reference.sub(expand, source)
        if '$' in resolved:
            continue
        resolved_file_list.append(resolved)

    return resolved_file_list

In [None]:
def pump_dependencies_changed(commit: str, files: list, path: str) -> bool:
    """Tell whether any of ``files`` differs from its state at ``commit``.

    Args:
        commit: Commit the running container was built from; may be None or
            empty when unknown.
        files: Paths to check, joined onto ``path``.
        path: Base directory; also used as git's working directory.

    Returns:
        False when ``files`` is empty. True when there are files to check but
        ``commit`` is unknown (the comparison cannot be made, so a rebuild is
        requested). Otherwise True as soon as ``git diff --name-only`` reports
        a change for one of the files, else False. A git error for an
        individual file is treated as "no change", as before.
    """
    if not files:
        return False
    if not commit:
        return True

    for file in files:
        file_path = os.path.join(path, file)
        # '--' makes git treat the argument as a path even when it does not
        # exist in the working tree.
        result = subprocess.run(
            ['git', 'diff', '--name-only', commit, '--', file_path],
            stdout=subprocess.PIPE,
            text=True,
            cwd=path,
        )
        if result.stdout.strip():
            return True

    return False

In [None]:
@step
def container_management(event):
    """Decide which data machines must be rebuilt and recreated.

    Reads ``bitswan_dir``, ``git_repo_root`` and ``yaml`` from the event and
    stores the result in ``event["deployment_info"]`` (one entry per key of
    ``yaml["data-machines"]``).

    A data machine is flagged with ``recreate = True`` and gets the current
    commit as ``build_commit`` when it is not running, when its pump folder
    changed since the commit it was built from, or when a file referenced by
    its Dockerfile changed.

    Changes from the previous implementation:
        * The pump folder defaults to the deployment id when ``source`` is
          omitted or the entry is empty, matching how the compose file is
          generated; this used to raise.
        * Entries with ``enabled: false`` are never flagged. They are left
          out of the generated compose file, so asking docker-compose to
          build them would name a service that does not exist.
        * The current commit is looked up once instead of once per machine.
    """
    bitswan_dir = event["bitswan_dir"]
    git_repo_root = event["git_repo_root"]
    data_machines = event["yaml"]["data-machines"]

    dockerfile_parser = DockerfileParser()
    docker_client = docker.from_env()

    deployment_info = get_deployment_info(docker_client, data_machines.keys())
    current_commit = get_current_commit(bitswan_dir)

    for deployment_id, info in deployment_info.items():
        conf = data_machines[deployment_id] or {}
        if not conf.get("enabled", True):
            continue

        if info["running"]:
            pump_path = os.path.join(bitswan_dir, conf.get("source", deployment_id))
            dockerfile_path = os.path.join(pump_path, "Dockerfile")

            changed = pump_folder_changed(info["build_commit"], pump_path)
            if not changed:
                # Only inspect the Dockerfile's dependencies when the pump
                # folder itself is unchanged.
                container_envs = dict(
                    var.split('=', 1) for var in info["container"].attrs['Config']['Env']
                )
                pump_dependencies = parse_dockerfile(dockerfile_parser, dockerfile_path, container_envs)
                changed = pump_dependencies_changed(info["build_commit"], pump_dependencies, git_repo_root)
        else:
            changed = True

        if changed:
            info["recreate"] = True
            info["build_commit"] = current_commit

    print(deployment_info)
    event["deployment_info"] = deployment_info
    return event

In [None]:
@step
def generate_docker_compose(event):
    """Render the docker-compose YAML for all data machines and their IDE containers.

    Reads ``bitswan_dir``, ``yaml``, ``deployment_info``, ``git_repo_root``
    and ``host_base_path`` from the event and stores the YAML text in
    ``event["docker_compose"]``.

    For each entry of ``yaml["data-machines"]`` a service is generated (and
    added unless ``enabled`` is false), plus a ``<id>__ide__`` service that
    is always added.

    Changes from the previous implementation:
        * The IDE service gets its own copy of the volume list. Before, the
          ``dev/`` mount was appended in place to the list shared with the
          data machine service and with ``event["yaml"]``, so it leaked into
          both.
        * The IDE service only gets ``networks`` when the data machine has
          them; a ``bitswan.yaml`` without ``default-networks`` used to raise
          ``KeyError``.
    """
    bitswan_dir = event["bitswan_dir"]
    bitswan_yaml = event["yaml"]
    deployment_info = event["deployment_info"]
    git_repo_root = event["git_repo_root"]

    dc = {
        "version": "3",
        "services": {},
        "networks": {}
    }
    for network in bitswan_yaml.get("default-networks", []):
        dc["networks"][network] = {"external": True}

    passthroughs = ["volumes", "network_mode", "ports", "restart", "devices", "container_name"]

    for deployment_id, conf in bitswan_yaml["data-machines"].items():
        if conf is None:
            conf = {}
        deployment = deployment_info[deployment_id]

        entry = {
            "environment": {"DEPLOYMENT_ID": deployment_id},
            "container_name": deployment_id,
            "labels": {"built.from": deployment["build_commit"]},
        }
        if "env-dir" in bitswan_yaml:
            env_file = os.path.join(bitswan_yaml["env-dir"], deployment_id)
            if os.path.exists(env_file):
                entry["env_file"] = [env_file]
        if "default-networks" in bitswan_yaml:
            entry["networks"] = bitswan_yaml["default-networks"].copy()

        source = conf.get("source", deployment_id)
        data_machine_dir = os.path.join(bitswan_dir, source)
        dockerfile_path = "Dockerfile"
        entry["build"] = {
            "dockerfile": os.path.join(data_machine_dir, dockerfile_path),
            "context": git_repo_root,
            "args": {
                "DATA_MACHINE_SOURCE_PATH": data_machine_dir.replace(git_repo_root, "."),
            }
        }
        # Settings copied verbatim from bitswan.yaml; these may override the
        # defaults set above (e.g. container_name).
        for passthrough in passthroughs:
            if passthrough in conf:
                entry[passthrough] = conf[passthrough]
        if conf.get("enabled", True):
            dc["services"][deployment_id] = entry

        # IDE container definition. Lists are copied so the IDE-only mount
        # does not end up in the data machine's own service.
        ide_volumes = list(entry.get("volumes") or [])
        ide_volumes.append(os.path.join(event["host_base_path"], "dev/") + ":/mnt")
        ide_entry = {
            "entrypoint": "/start-ide.sh",
            "image": "dev_" + deployment_id,
            "volumes": ide_volumes,
            "container_name": deployment_id + "__ide__",
        }
        if "networks" in entry:
            ide_entry["networks"] = list(entry["networks"])
        dc["services"][deployment_id + "__ide__"] = ide_entry

    dc_yaml = yaml.dump(dc)
    print(dc_yaml)
    event["docker_compose"] = dc_yaml
    return event

In [None]:
async def _run_docker_compose(args, docker_compose_yaml, cwd):
    """Run ``docker-compose -f /dev/stdin <args>`` feeding the YAML on stdin.

    Returns a dict with the command, decoded stdout/stderr and return code.
    """
    cmd = ["docker-compose", "-f", "/dev/stdin", *args]
    proc = await asyncio.create_subprocess_exec(
        *cmd,
        stdin=asyncio.subprocess.PIPE,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
        cwd=cwd
    )
    # communicate() writes the compose file to stdin and waits for exit.
    stdout, stderr = await proc.communicate(input=docker_compose_yaml.encode())
    return {
        "cmd": cmd,
        # errors="replace": tool output is not guaranteed to be valid UTF-8
        # and must not abort the deployment report.
        "stdout": stdout.decode("utf-8", errors="replace"),
        "stderr": stderr.decode("utf-8", errors="replace"),
        "returncode": proc.returncode,
    }


@async_step
async def docker_compose_up_daemon(inject, event):
    """Build and start the changed data machines and create the IDE containers.

    Steps, each run through docker-compose with the generated YAML on stdin
    and ``bitswan_dir`` as working directory:
        1. ``build --pull`` for the services flagged ``recreate`` (skipped
           when none are flagged; ``build`` is then None in the result).
        2. ``up -d`` for the same services.
        3. ``up --no-start`` for every ``<id>__ide__`` service.

    The injected event replaces the incoming one and holds ``@timestamp``,
    ``local-time`` and the ``build``/``up``/``ide`` results. Both time fields
    now come from a single clock reading.

    NOTE(review): when no service is flagged, step 2 runs ``up -d`` without
    service names, which applies to every service in the file including the
    IDE ones -- confirm that this is intended. Step 2 also runs even when the
    build in step 1 failed.
    """
    bitswan_dir = event["bitswan_dir"]
    docker_compose_yaml = event["docker_compose"]
    deployment_info = event["deployment_info"]

    services_to_recreate = [service for service, info in deployment_info.items() if info["recreate"]]
    ide_services = [service + "__ide__" for service in deployment_info]

    build_result = None
    if services_to_recreate:
        build_result = await _run_docker_compose(
            ["build", "--pull", *services_to_recreate], docker_compose_yaml, bitswan_dir
        )

    up_result = await _run_docker_compose(
        ["up", "-d", *services_to_recreate], docker_compose_yaml, bitswan_dir
    )

    # IDE containers are only created here, not started.
    ide_result = await _run_docker_compose(
        ["up", "--no-start", *ide_services], docker_compose_yaml, bitswan_dir
    )

    now = datetime.datetime.now()
    event = {
        "@timestamp": now.timestamp(),
        "local-time": now.strftime("%Y-%m-%d %H:%M:%S"),
        "build": build_result,
        "up": up_result,
        "ide": ide_result
    }
    await inject(event)

In [None]:
@step
def serialize_yaml(event):
    """Render the event as YAML text for the sink."""
    rendered = yaml.dump(event)
    return rendered

In [None]:
@register_sink
def init_sink(app, pipeline):
    """Create the sink of the deployment pipeline: a ``PrintSink``."""
    print_sink = bspump.common.PrintSink(app, pipeline)
    return print_sink

In [None]:
# Close the "BitswanGitopsWebHookPipeline" definition opened with new_pipeline().
end_pipeline()

In [None]:
# Open the definition of the second pipeline in this notebook; its source
# reads from MQTT and its steps start an IDE container and stream its logs.
new_pipeline("IDELaunchPipeline")

In [None]:
@register_source
def mqtt_source(app, pipeline):
    """Create the MQTT source, bound to the connection id ``"MQTTConnection"``."""
    source_instance = bspump.mqtt.MQTTSource(app, pipeline, "MQTTConnection")
    return source_instance

In [None]:
@register_processor
def processor(app, pipeline):
    """Create the ``BytesToStringParser`` processor for this pipeline."""
    bytes_to_string = bspump.common.BytesToStringParser(app, pipeline)
    return bytes_to_string

In [None]:
@register_processor
def processor(app, pipeline):
    """Create the ``StdJsonToDictParser`` processor for this pipeline.

    NOTE(review): this reuses the function name ``processor`` from the
    previous cell; presumably harmless because registration happens in the
    decorator -- confirm.
    """
    json_to_dict = bspump.common.StdJsonToDictParser(app, pipeline)
    return json_to_dict

In [None]:
@async_step
async def start_ide(inject, event):
    """Start the IDE container belonging to ``event["deployment_id"]``.

    The container name (``<deployment_id>__ide__``) is stored in
    ``event["ide_container"]``. A missing container is only reported; the
    event is injected downstream in either case.
    """
    ide_name = event["deployment_id"] + "__ide__"
    event["ide_container"] = ide_name
    client = docker.from_env()

    try:
        client.containers.get(ide_name).start()
    except docker.errors.NotFound:
        print(f"Container {event['ide_container']} not found.")

    await inject(event)

In [None]:
async def stream_ide_logs(container_name, mqtt_connection, log_topic, url_topic):
    """Forward a container's log lines to MQTT until the Jupyter Lab URL appears.

    Args:
        container_name: Name of the container whose logs are streamed.
        mqtt_connection: Object providing ``publish_to_topic(topic, payload)``.
        log_topic: Topic receiving every log line as ``{"message": ...}``.
        url_topic: Topic receiving ``{"url": ..., "redirect": true}`` once a
            line matches the Jupyter Lab URL pattern.

    The coroutine returns after publishing the URL, or when the log stream
    ends. A missing container is only reported.

    Each log line is now fetched in the default executor. Iterating the
    blocking docker log stream directly inside the coroutine stalled the
    event loop while waiting for output. Publishing still happens on the
    event loop.
    """
    docker_client = docker.from_env()

    url_pattern = re.compile(r"http://localhost:\d+/lab\?token=[a-zA-Z0-9]+")
    loop = asyncio.get_running_loop()
    end_of_stream = object()

    try:
        container = docker_client.containers.get(container_name)
        log_stream = iter(container.logs(stream=True))
        while True:
            # next() blocks until a line arrives, so run it off the loop; the
            # sentinel is returned instead of raising StopIteration.
            line = await loop.run_in_executor(None, next, log_stream, end_of_stream)
            if line is end_of_stream:
                return

            log_line = line.decode('utf-8', errors='replace').rstrip()

            # Publish every log line to MQTT
            message = {
                "message": log_line,
            }
            mqtt_connection.publish_to_topic(log_topic, json.dumps(message))

            # Check for Jupyter Lab URL in the log line
            match = url_pattern.search(log_line)
            if match:
                jupyter_url = match.group(0)
                # Publish the found URL to a separate MQTT topic
                message = {
                    "url": jupyter_url,
                    "redirect": True
                }
                # Wait for the Jupyter Lab to start
                await asyncio.sleep(1)
                mqtt_connection.publish_to_topic(url_topic, json.dumps(message))
                return
    except docker.errors.NotFound:
        print(f"Container {container_name} not found.")

In [None]:
class IDELogsStreamer(Generator):
    """Generator that starts background log streaming for an IDE container.

    For every event it schedules ``stream_ide_logs`` for
    ``event["ide_container"]`` and passes the event on unchanged. Log lines
    go to ``<deployment_id>/editor/launch`` and the detected URL to
    ``<deployment_id>/editor/redirect``.
    """

    def __init__(self, app, pipeline, id=None, config=None):
        super().__init__(app, pipeline, id, config)
        self.MQTTConnection = pipeline.locate_connection(app, "MQTTConnection")
        # Strong references to running streaming tasks: the event loop only
        # keeps weak references, so an unreferenced task may be garbage
        # collected before it finishes.
        self._streaming_tasks = set()

    async def generate(self, context, event, depth):
        container_name = event["ide_container"]
        deployment_id = event["deployment_id"]
        logs_topic = f"{deployment_id}/editor/launch"
        url_topic = f"{deployment_id}/editor/redirect"

        task = asyncio.create_task(
            stream_ide_logs(container_name, self.MQTTConnection, logs_topic, url_topic)
        )
        self._streaming_tasks.add(task)
        # Drop the reference once the task is done.
        task.add_done_callback(self._streaming_tasks.discard)

        self.Pipeline.inject(context, event, depth)

In [None]:
@register_generator
def create_stream_logs(app, pipeline):
    """Create the ``IDELogsStreamer`` generator for this pipeline."""
    streamer = IDELogsStreamer(app, pipeline)
    return streamer

In [None]:
@step
def serialize_yaml(event):
    """Render the event as YAML text for the sink."""
    rendered = yaml.dump(event)
    return rendered

In [None]:
@register_sink
def init_sink(app, pipeline):
    """Create the sink of the IDE launch pipeline: a ``PrintSink``."""
    print_sink = bspump.common.PrintSink(app, pipeline)
    return print_sink

In [None]:
# Close the "IDELaunchPipeline" definition opened with new_pipeline().
end_pipeline()