In [None]:
# Requires the prometheus_client package. If it is missing from the kernel, run: %pip install prometheus_client

In [None]:
from pyspark.sql.types import *
from pyspark.sql import functions as F
from prometheus_client import CollectorRegistry, Gauge, Counter, push_to_gateway
import time



Stopping previous SparkSession
Stopping previous SparkContext


In [None]:
# Pushgateway address in host:port form (no URL scheme); handed to push_to_gateway.
pushgateway_endpoint = "prometheus-prometheus-pushgateway.monitoring:9091"

# Data-lake destination. The write target is built as path_root + path_relative.
# NOTE(review): the two parts are concatenated without a separator, so path_root
# presumably has to end with '/' -- confirm against the real (redacted) value.
path_root = "xxxxxxxxxxx"
path_relative = "SANDBOX/xxxxxxxxx/PUSHGATEWAY"

In [27]:
# Small in-memory sample table used to exercise the write + metrics flow below.

# Column layout as (name, Spark type) pairs; every column is declared nullable.
field_specs = [
    ("id", IntegerType()),
    ("name", StringType()),
    ("age", IntegerType()),
]
schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in field_specs]
)

# One tuple per row, in the same column order as the schema.
data = [(1, "Alice", 30), (2, "Bob", 25), (3, "Charlie", 35)]

df = spark.createDataFrame(data, schema)

# Print the rows as a quick visual check.
df.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 30|
|  2|    Bob| 25|
|  3|Charlie| 35|
+---+-------+---+



In [34]:
# Label values identifying this run; attached to every sample recorded below.
project = 'power'
dag_id = 'abc'
task_id = 'teste1'

# DEFINE METRICS
# A dedicated registry, so only the gauges declared here are part of the push.
registry = CollectorRegistry()

# Both gauges carry the same set of label names.
metric_label_names = ('project', 'dag_id', 'task_id', 'path_lake')

write_datalake_seconds = Gauge(
    name='write_datalake_seconds',
    documentation='Writing time for a table in the datalake',
    labelnames=metric_label_names,
    registry=registry,
)

write_datalake_rows = Gauge(
    name='write_datalake_rows',
    documentation='Number of rows in a table saved in the data lake',
    labelnames=metric_label_names,
    registry=registry,
)

# WRITE DATALAKE SECONDS
# Measure the write with a monotonic clock. time.time() follows the system wall
# clock, which can be adjusted while the write is running and would then skew
# the reported duration (or even make it negative).
start_time = time.perf_counter()

# Overwrites whatever already exists at the target path.
df.write.mode("overwrite").parquet(path_root+path_relative)

execution_time = time.perf_counter() - start_time

(write_datalake_seconds.labels(project=project,
                               dag_id=dag_id,
                               task_id=task_id,
                               path_lake=path_relative)
                         .set(execution_time)
)

# WRITE DATALAKE ROWS
# NOTE(review): this counts `df` itself in a separate action after the write,
# not the files that were just written; presumably the two agree for a
# deterministic source -- confirm if the input can change between actions.
rows = df.count()
(write_datalake_rows.labels(project=project,
                            dag_id=dag_id,
                            task_id=task_id,
                            path_lake=path_relative)
                     .set(rows)
)

# SEND TO PUSHGATEWAY
# NOTE(review): push_to_gateway replaces every metric previously pushed under
# the same grouping key, which here is only job='my_job'. If several tasks push
# under this job name they will overwrite each other; consider a grouping_key
# (e.g. dag_id/task_id) or pushadd_to_gateway -- confirm the intended behaviour.
push_to_gateway(pushgateway_endpoint, job='my_job', registry=registry)