In [0]:
%run "./Environment Setup"

In [0]:

import pyspark.sql.functions as f
from pyspark.sql.types import *

from delta.tables import *
 
import dlt as dlt
import boto3
import socket
from botocore.exceptions import NoCredentialsError
import time
from kafka import KafkaProducer
from kafka.admin import KafkaAdminClient, NewTopic
import boto3
import socket
from botocore.exceptions import NoCredentialsError
from kafka.errors import NoBrokersAvailable, KafkaTimeoutError
from kafka import KafkaConsumer
import json


In [0]:
# DBTITLE 1,Initialize Config Settings
# Build the shared `config` dictionary: AWS/MSK settings, DBFS locations of the
# generated data files, checkpoint and DLT locations, and the database name.

# Reuse a `config` dict that already exists in this session; otherwise start fresh.
if 'config' not in locals() or not isinstance(config, dict):
    config = {}

# AWS / MSK cluster settings.
# NOTE(review): the access key and secret are embedded in the notebook source;
# consider loading them from a secret store or environment variables instead.
config['aws'] = {
    'access_key_id': '***************************',
    'secret_access_key': '*****************************',
    'region_name': 'us-west-2',
    'subnets': [
        'subnet-**************',  # SubnetID-1 
        'subnet-**************'   # SubnetID-2
    ],
    'security_group': 'sg-********************',  # Security group ID
    'cluster_name': 'real-time-pos-msk',  # Unique cluster name
    'kafka_version': '2.8.1',
    'number_of_broker_nodes': 4,
    'instance_type': 'kafka.m5.large',
    'cluster_arn': 'arn:aws:kafka:us-west-2:*****************:cluster/real-time-pos-msk/******************************************'
}

## Config Settings for DBFS Mount Point
config['dbfs_mount_name'] = '/mnt/real-time-pos/'

# The mount name ends with '/' and every suffix below starts with '/'.
# Strip the trailing separator once so derived paths do not contain '//'.
# The stored mount name itself is left unchanged.
mount_root = config['dbfs_mount_name'].rstrip('/')
data_generator_dir = mount_root + '/data-generator'

# Store the filenames for the data files into Config
config['inventory_change_store001_filename'] = data_generator_dir + '/inventory_change_store001.txt'
config['inventory_change_online_filename'] = data_generator_dir + '/inventory_change_online.txt'

# snapshot data files
config['inventory_snapshot_store001_filename'] = data_generator_dir + '/inventory_snapshot_store001.txt'
config['inventory_snapshot_online_filename'] = data_generator_dir + '/inventory_snapshot_online.txt'

# static data files
config['stores_filename'] = data_generator_dir + '/store.txt'
config['items_filename'] = data_generator_dir + '/item.txt'
config['change_types_filename'] = data_generator_dir + '/inventory_change_type.txt'

# Config Settings for Checkpoint Files
config['inventory_snapshot_path'] = mount_root + '/inventory_snapshots/'
# Config Settings for DLT Data
config['dlt_pipeline'] = mount_root + '/dlt_pipeline_pos'

# Identify Database for Data Objects and initialize it
database_name = 'pos_dlt'
config['database'] = database_name




In [0]:
## Create a consumer that reads the data from the Kafka topic and writes it to cloud storage
## as new CSV files, with a processing trigger of 1 minute

def consume_kafka_topic(topic_name, kafka_bootstrap_servers, checkpoint_path, output_path, aws_access_key_id, aws_secret_access_key):
    """Stream a Kafka topic into CSV files and block until the stream terminates.

    Uses the notebook-global `spark` session to subscribe to `topic_name`,
    casts each record's key and value to strings, and writes them with the
    CSV sink using a 1 minute processing-time trigger.

    Any exception raised while building or running the stream is caught and
    printed; nothing is re-raised, so the function returns None in every case.

    Args:
        topic_name: Kafka topic to subscribe to.
        kafka_bootstrap_servers: Value for the `kafka.bootstrap.servers` option.
        checkpoint_path: Value for the sink's `checkpointLocation` option.
        output_path: Value for the sink's `path` option.
        aws_access_key_id: Passed to the sink as the `fs.s3a.access.key` option.
        aws_secret_access_key: Passed to the sink as the `fs.s3a.secret.key` option.
    """
    try:
        # Source: Kafka reader configured one option at a time.
        kafka_reader = spark.readStream.format("kafka")
        kafka_reader = kafka_reader.option("kafka.bootstrap.servers", kafka_bootstrap_servers)
        kafka_reader = kafka_reader.option("subscribe", topic_name)
        kafka_reader = kafka_reader.option("startingOffsets", "Earliest")
        raw_records = kafka_reader.load()

        # Keep only key and value, both as strings.
        text_records = raw_records.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

        # Sink: CSV writer; options are applied in a fixed order.
        csv_writer = text_records.writeStream.format("csv")
        sink_options = (
            ("path", output_path),
            ("checkpointLocation", checkpoint_path),
            ("fs.s3a.access.key", aws_access_key_id),
            ("fs.s3a.secret.key", aws_secret_access_key),
        )
        for option_name, option_value in sink_options:
            csv_writer = csv_writer.option(option_name, option_value)

        streaming_query = csv_writer.trigger(processingTime='1 minute').start()

        # Blocks the calling cell until the query stops or fails.
        streaming_query.awaitTermination()
    except Exception as error:
        print(f"Error consuming Kafka topic: {error}")




In [0]:
# Create a boto3 session used to query the Amazon MSK API.
# NOTE(review): aws_access_key_id, aws_secret_access_key, region and CLUSTER_ARN
# are not assigned anywhere in this notebook; presumably they are provided by
# "./Environment Setup" (run in the first cell) — confirm. config['aws'] holds
# entries for the same settings.
session = boto3.Session(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    region_name=region
)

# Create an MSK client
msk_client = session.client('kafka')

# Look up the cluster's bootstrap broker string; later cells read `bootstrap_servers`.
# On failure, print a short hint and re-raise. exit(1) is an interactive-shell
# helper and is not a reliable way to abort a notebook run, whereas an exception
# stops the cell with a traceback instead of leaving `bootstrap_servers` undefined.
try:
    # Get bootstrap brokers
    response = msk_client.get_bootstrap_brokers(
        ClusterArn=CLUSTER_ARN,
    )
    # NOTE(review): only the 'BootstrapBrokerString' entry is read here; a
    # response without that key raises KeyError, handled by the generic branch.
    bootstrap_servers = response['BootstrapBrokerString']
except NoCredentialsError:
    print("No credentials available. Please check your AWS credentials.")
    raise
except Exception as e:
    print(f"Error getting bootstrap brokers: {e}")
    raise






In [0]:
# Start the streaming consumer for the InventorySnapshot topic, using the
# broker string obtained from MSK. CSV output goes to the S3 path below and
# the stream checkpoint to config['inventory_snapshot_path'].
bootstrap_server = bootstrap_servers
topic_name = 'InventorySnapshot'
output_path = "s3a://real-time-pos-msk/inventory_snapshots/"
checkpoint_path = config['inventory_snapshot_path']

# This call blocks until the stream terminates (or prints the error it caught).
consume_kafka_topic(
    topic_name=topic_name,
    kafka_bootstrap_servers=bootstrap_server,
    checkpoint_path=checkpoint_path,
    output_path=output_path,
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)