# Data Platform Health Check

This notebook checks the health of every job container in the data platform and prints the container's logs whenever a container is not in the `running` state (for example, because it has failed, exited, or could not be found).

In [1]:
import docker
import pandas as pd

# Services that make up the platform. Every container name follows the
# pattern 'data-platform-<service>-1', so the names are derived from the
# service list rather than spelled out one by one.
PLATFORM_SERVICES = (
    'airflow-init',
    'airflow-webserver',
    'airflow-scheduler',
    'spark',
    'spark-worker',
    'spark-history-server',
    'sqlserver',
    'trino',
    'jupyter',
    'postgres',
)

# Container names to inspect, in the order they are reported
job_containers = [f'data-platform-{service}-1' for service in PLATFORM_SERVICES]

def get_container_health(container_name, client=None, tail='all'):
    """Look up a container and return its status together with its logs.

    Args:
        container_name: Name of the container to inspect.
        client: Optional Docker client to reuse. When omitted, a client is
            created with ``docker.from_env()`` for this call and closed again
            before returning, so repeated calls do not leak connections.
        tail: Forwarded to ``container.logs(tail=...)``; either ``'all'``
            (the default, matching the previous behaviour) or an integer
            number of trailing log lines to fetch.

    Returns:
        A ``(status, logs)`` tuple of strings:
        * ``(container.status, decoded_logs)`` on success,
        * ``('Not Found', 'Container not found')`` if the lookup raises
          ``docker.errors.NotFound``,
        * ``('Error', str(exc))`` for any other exception.

    The function never raises; every failure is reported through the
    returned tuple.
    """
    # Only close the client if this call created it; a caller-supplied client
    # stays open so it can be shared across several lookups.
    owns_client = client is None
    try:
        if owns_client:
            client = docker.from_env()
        container = client.containers.get(container_name)
        status = container.status
        # errors='replace' keeps a stray non-UTF-8 byte in the log stream from
        # raising UnicodeDecodeError and discarding the status we already have.
        logs = container.logs(tail=tail).decode('utf-8', errors='replace')
        return status, logs
    except docker.errors.NotFound:
        return 'Not Found', 'Container not found'
    except Exception as e:
        return 'Error', str(e)
    finally:
        # client is still None here if docker.from_env() itself failed.
        if owns_client and client is not None:
            try:
                client.close()
            except Exception:
                # Best-effort cleanup: a failing close must not replace the
                # result that is being returned.
                pass

# Probe each container once, in list order, and keep its (status, logs)
# result keyed by container name
health_data = dict(zip(job_containers, map(get_container_health, job_containers)))

# Build a two-column summary table: container name and reported status
status_rows = [(name, outcome[0]) for name, outcome in health_data.items()]
health_df = pd.DataFrame(status_rows, columns=['Container', 'Status'])
print(health_df)

# Print the captured logs of every container whose status is anything
# other than 'running' (this includes 'Not Found' and 'Error' results)
for container, (status, logs) in health_data.items():
    if status == 'running':
        continue
    print(f'\nLogs for {container}:')
    print(logs)

ModuleNotFoundError: No module named 'docker'