In [0]:
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType, LongType

In [0]:
# Run the companion config notebook and capture the string it returns.
# The second argument (60) is the run timeout — presumably in seconds; confirm
# against the dbutils.notebook.run documentation.
result = dbutils.notebook.run("./confluent_stream_test_configs", 60)
# NOTE(review): eval() executes whatever text the child notebook returns as
# Python code. If the payload is plain data (lists/strings), prefer having the
# child return JSON and parsing it with json.loads, or use ast.literal_eval.
# Also note: `table_list` is reassigned in a later cell of this notebook, and
# `schema_list` is not referenced again in the cells visible here.
table_list, schema_list = eval(result)

In [0]:
# Confluent Cloud cluster endpoint used as the Kafka bootstrap server.
confluentBootstrapServers = "pkc-wzjyjg.us-east-2.aws.confluent.cloud:9092"

# Credentials are read from the Databricks secret scope so that they never
# appear in notebook source, exports, or revision history.
# NOTE(review): an API key/secret pair was previously committed in plain text
# in this cell. Treat that pair as compromised: rotate it in Confluent Cloud
# and store the replacement values in the "confluentTest" secret scope.
confluentApiKey = dbutils.secrets.get(scope="confluentTest", key="api-key")
confluentSecret = dbutils.secrets.get(scope="confluentTest", key="secret")

# Topic to ingest; the checkpoint path and bronze table name are derived from it.
confluentTopicName = "parts"

# Checkpoint lives under /tmp/ so that reset_stream() is allowed to delete it.
bronze_stream_checkpoint = f"/tmp/ahahn/checkpoints/{confluentTopicName}_bronze"
# Backticks quote the table identifier, which embeds the topic name.
bronze_table_name = f"ahahn_demo.confluent_navy_test.`{confluentTopicName}_bronze`"

In [0]:
def reset_stream(checkpoint_location: str, table_name: str):
    """Wipe a stream's persisted state so the next run starts from scratch.

    Recursively removes the checkpoint directory, then drops the target table
    if it exists.

    Args:
        checkpoint_location: Checkpoint directory to delete; must start with
            "/tmp/".
        table_name: Name of the table to drop (interpolated into the DROP
            statement as-is).

    Raises:
        AssertionError: If ``checkpoint_location`` does not start with "/tmp/".
    """
    # Guard against recursively deleting anything outside the scratch area.
    is_scratch_path = checkpoint_location.startswith("/tmp/")
    assert is_scratch_path, "Checkpoint location has to be in /tmp/"

    print("Deleting checkpoint")
    recursive = True
    dbutils.fs.rm(checkpoint_location, recursive)

    print("Dropping table")
    drop_statement = f"DROP TABLE IF EXISTS {table_name}"
    spark.sql(drop_statement)

# Start from a clean slate: remove the bronze checkpoint and drop the bronze table.
reset_stream(
    checkpoint_location=bronze_stream_checkpoint,
    table_name=bronze_table_name,
)

In [0]:
# Streaming source: subscribe to the Confluent topic over SASL_SSL with the
# PLAIN mechanism, starting from the earliest offset, and cast the `value`
# column to a string.
df_streaming = (
    spark.readStream.format("kafka")
    .options(
        **{
            "kafka.bootstrap.servers": confluentBootstrapServers,
            "subscribe": confluentTopicName,
            "startingOffsets": "earliest",
            "kafka.security.protocol": "SASL_SSL",
            # The API key/secret are substituted into the JAAS login-module line.
            "kafka.sasl.jaas.config": "kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required username='{}' password='{}';".format(
                confluentApiKey, confluentSecret
            ),
            "kafka.ssl.endpoint.identification.algorithm": "https",
            "kafka.sasl.mechanism": "PLAIN",
        }
    )
    .load()
    .withColumn("value", col("value").cast("string"))
)

In [0]:
# Configure the bronze sink: a named query with its checkpoint location.
bronze_writer = (
    df_streaming.writeStream
    .queryName("confluent_stream_query")
    .option("checkpointLocation", bronze_stream_checkpoint)
)

# Write the stream into the bronze table.
bronze_writer.toTable(bronze_table_name)

In [0]:
%sql
-- Inspect the bronze table for the historical-turbine-status topic.
-- NOTE(review): with confluentTopicName = "parts" this notebook writes to
-- `parts_bronze`, not to the table queried here — confirm this is intended.
SELECT *
FROM ahahn_demo.confluent_navy_test.`historical-turbine-status_bronze`

In [0]:
%sql
-- GRANT ALL PRIVILEGES ON CATALOG ahahn_demo TO `alex.vanadio@databricks.com`;
-- NOTE: REVOKE takes FROM (not TO): REVOKE <privileges> ON <securable> FROM <principal>
-- REVOKE ALL PRIVILEGES ON CATALOG ahahn_demo FROM `alex.vanadio@databricks.com`;

In [0]:

# json_schema = StructType([
#     StructField("EQUIPMENTID", StringType(), True),
#     StructField("WORKCENTERID", IntegerType(), True),
#     StructField("WORKCENTERNAME", StringType(), True),
#     StructField("EQUIPMENTTITLE", StringType(), True),
#     StructField("EQUIPMENTLOCATION", StringType(), True),
#     StructField("EQUIPMENTSERIALNUMBER", StringType(), True),
#     StructField("EQUIPMENTNOTE", StringType(), True),
#     StructField("EQUIPMENTGROUP", StringType(), True),
#     StructField("PRIMARYMIP", StringType(), True),
#     StructField("PRIMARYMIPBDAY", StringType(), True),
#     StructField("CDMRIN", StringType(), True),
#     StructField("DAMAGECONTROL", BooleanType(), True),
#     StructField("UPDATE_TS", LongType(), True)
# ])

# NOTE: THIS is a less painful way of defining schemas
# DDL-formatted schema used to parse the JSON text held in the bronze `value` column.
json_schema = """
NSN STRING,
height LONG,
production_time LONG,
sensors ARRAY<STRING>,
stock_available LONG,
stock_location STRING,
type STRING,
weight LONG,
width LONG,
UPDATE_TS BIGINT
"""

# Parse `value` into a struct, then expand the struct's fields into top-level
# columns and render the result.
bronze_df = spark.table(bronze_table_name)
value_parsed = from_json(col("value"), json_schema).alias("value_parsed")
bronze_df.select(value_parsed).select("value_parsed.*").display()

In [0]:
# When validating, _rescued_data may not exist. Also, the nested array in the parts table schema definition might break.

# DDL-formatted schemas, one per source table.
# The last column of each schema must NOT be followed by a comma: Spark's DDL
# column-list grammar is `col (',' col)*`, so a trailing comma makes the whole
# string fail to parse when it is handed to from_json()/schema().

sensor_bronze_schema = """
timestamp LONG,
sensor_E DOUBLE,
sensor_C DOUBLE,
sensor_B DOUBLE,
sensor_A DOUBLE,
sensor_F DOUBLE,
sensor_D DOUBLE,
energy DOUBLE,
turbine_id STRING
"""

turbine_schema = """
country STRING,
lat STRING,
location STRING,
long STRING,
model STRING,
state STRING,
turbine_id STRING
"""

ship_meta_schema = """
homeport STRING,
lat STRING,
long STRING,
model STRING,
ship STRING,
turbine_id STRING
"""

historical_turbine_status_schema = """
abnormal_sensor STRING,
end_time LONG,
start_time LONG,
turbine_id STRING
"""

parts_schema = """
NSN STRING,
height LONG,
production_time LONG,
sensors ARRAY<STRING>,
stock_available LONG,
stock_location STRING,
type STRING,
weight LONG,
width LONG
"""

# NOTE(review): this reassigns `table_list`, replacing the value unpacked from
# the config notebook's result earlier in this notebook — confirm which one is
# meant to be authoritative.
table_list = ['sensor_bronze', 'parts', 'ship_meta', 'historical_turbine_status', 'turbine']

# Per-table settings keyed by table name: the DDL schema, the Confluent topic
# to read from, and a table comment (still a placeholder). Only `parts` is
# currently enabled; the other entries are commented out.
table_dict = {
    # 'sensor_bronze': {'schema': sensor_bronze_schema, 'confluentTopic': 'ADD TOPIC', 'comment': 'ADD COMMENT'},
    'parts': {'schema': parts_schema, 'confluentTopic': 'parts', 'comment': 'ADD COMMENT'},
    # 'ship_meta': {'schema': ship_meta_schema, 'confluentTopic': 'ship-meta', 'comment': 'ADD COMMENT'},
    # 'historical_turbine_status': {'schema': historical_turbine_status_schema, 'confluentTopic': 'historical-turbine-status', 'comment': 'ADD COMMENT'},
    # 'turbine': {'schema': turbine_schema, 'confluentTopic': 'turbine', 'comment': 'ADD COMMENT'}
}

In [0]:
# Debug aid: for every configured table, print its schema followed by
# parts_schema so the two can be compared by eye.
for table, info in table_dict.items():
    print(info['schema'], parts_schema, sep="\n")

In [0]:
# row_number is not part of the notebook's top-level imports, so import it
# here; without it this cell fails with a NameError.
from pyspark.sql.functions import row_number
from pyspark.sql.window import Window

# Window used to number rows within each (turbine_id, hourly_timestamp) group.
# NOTE(review): the window is partitioned by the same column it is ordered by
# (hourly_timestamp), so the ordering cannot distinguish rows inside a
# partition. If the goal is "latest row per turbine", partition by turbine_id
# only — confirm the intent before relying on row_number.
metrics_window = (
    Window
    .partitionBy("turbine_id", "hourly_timestamp")
    .orderBy(col("hourly_timestamp").desc())
)

# Hourly sensor readings joined to turbine metadata, with a per-window row number.
# Keep the DataFrame itself in `latest_metrics`; assigning the result of
# .display() would leave the variable bound to the display call's return value
# instead of the data.
latest_metrics = (
    spark.table("ahahn_demo.dbdemos_navy_pdm.sensor_hourly")
    .join(spark.table("ahahn_demo.dbdemos_navy_pdm.turbine"), on="turbine_id")
    .withColumn("row_number", row_number().over(metrics_window))
)

latest_metrics.display()

In [0]:
from pyspark.sql.functions import max as spark_max

# Current status per turbine, trimmed to the three columns used downstream.
turbine_current_status = spark.table(
    "ahahn_demo.dbdemos_navy_pdm.turbine_current_status"
).select("turbine_id", "hourly_timestamp", "prediction")

# Lookup tables that are joined onto the status rows in the following cell.
ship_meta = spark.table("ahahn_demo.dbdemos_navy_pdm.ship_meta")
sensor_maintenance = spark.table("ahahn_demo.dbdemos_navy_pdm.sensor_maintenance")

# Aggregate the maximum hourly_timestamp, collect it to the driver, and take
# the first column of the first row as a plain Python value.
max_timestamp_row = turbine_current_status.agg(spark_max("hourly_timestamp")).collect()[0]
max_hourly_timestamp = max_timestamp_row[0]


#     turbine_current_status
#     .join(ship_meta, "turbine_id")
#     .join(sensor_maintenance, turbine_current_status.prediction == sensor_maintenance.fault, "left")
#     .filter(col("hourly_timestamp") == max_hourly_timestamp)
#     .select(
#         turbine_current_status.turbine_id,
#         turbine_current_status.hourly_timestamp,
#         turbine_current_status.prediction,
#         *[col(f"s.{c}") for c in ship_meta.columns if c != "turbine_id"],
#         *[col(f"m.{c}") for c in sensor_maintenance.columns]
#     )
# )

In [0]:
# Status rows at the most recent hourly_timestamp, joined to ship metadata
# (inner join on turbine_id) and to maintenance info (left join where the
# predicted value equals the maintenance table's `fault`).
# The output keeps every ship_meta column except turbine_id, followed by every
# sensor_maintenance column. Columns are selected through their source
# DataFrame so that a column name shared by several of the joined tables
# cannot raise an ambiguous-reference error.
df = (
    turbine_current_status
    .join(ship_meta, "turbine_id")
    .join(sensor_maintenance, turbine_current_status.prediction == sensor_maintenance.fault, "left")
    .filter(col("hourly_timestamp") == max_hourly_timestamp)
    .select(
        *[ship_meta[c] for c in ship_meta.columns if c != "turbine_id"],
        *[sensor_maintenance[c] for c in sensor_maintenance.columns],
    )
)

# Render separately so `df` stays bound to the DataFrame rather than to the
# return value of .display().
df.display()

In [0]:
from pyspark.sql import functions as F

# Source tables: current ship status (gold) and the parts catalogue.
ship_status_df = spark.table("ahahn_demo.dbdemos_navy_pdm.ship_current_status_gold")
parts_df = spark.table("ahahn_demo.dbdemos_navy_pdm.parts")

# A part matches a status row when the part's `sensors` array contains the
# row's `prediction`. The left join keeps status rows that have no match.
part_matches_prediction = F.expr("array_contains(p.sensors, s.prediction)")
result_df = ship_status_df.alias("s").join(
    parts_df.alias("p"),
    on=part_matches_prediction,
    how="left",
)

result_df.display()