In [0]:
%run "./Environment Setup"

In [0]:
# Define AWS configuration details in the aws_config dictionary
# DBTITLE 1,Initialize Config Settings
# Builds the shared `config` dictionary used by the rest of the notebook:
# AWS/MSK settings, DBFS data-file paths, snapshot/DLT locations and the
# target database name.

# Reuse a `config` dict that already exists in this scope; otherwise start fresh.
if 'config' not in locals() or not isinstance(config, dict):
    config = {}

# NOTE(review): the access key and secret are hardcoded in the notebook source
# (masked here). Prefer loading them from a secret store or the environment
# rather than committing them with the notebook.
config['aws'] = {
    'access_key_id': '***************************',
    'secret_access_key': '*****************************',
    'region_name': 'us-west-2',
    'subnets': [
        'subnet-**************',  # SubnetID-1
        'subnet-**************'   # SubnetID-2
    ],
    'security_group': 'sg-********************',  # Security group ID
    'cluster_name': 'real-time-pos-msk',  # Unique cluster name
    'kafka_version': '2.8.1',
    'number_of_broker_nodes': 4,
    'instance_type': 'kafka.m5.large',
    'cluster_arn': 'arn:aws:kafka:us-west-2:*****************:cluster/real-time-pos-msk/******************************************'
}

## Config Settings for DBFS Mount Point
config['dbfs_mount_name'] = '/mnt/real-time-pos/'

# The mount name ends with '/', and every sub-path below starts with '/'.
# Strip the trailing slash once so the joined paths do not contain '//'.
_mount_root = config['dbfs_mount_name'].rstrip('/')

# Store the filenames for the data files into Config
config['inventory_change_store001_filename'] = _mount_root + '/data-generator/inventory_change_store001.txt'
config['inventory_change_online_filename'] = _mount_root + '/data-generator/inventory_change_online.txt'

# snapshot data files
config['inventory_snapshot_store001_filename'] = _mount_root + '/data-generator/inventory_snapshot_store001.txt'
config['inventory_snapshot_online_filename'] = _mount_root + '/data-generator/inventory_snapshot_online.txt'

# static data files
config['stores_filename'] = _mount_root + '/data-generator/store.txt'
config['items_filename'] = _mount_root + '/data-generator/item.txt'
config['change_types_filename'] = _mount_root + '/data-generator/inventory_change_type.txt'

# Config Settings for Checkpoint Files
config['inventory_snapshot_path'] = _mount_root + '/inventory_snapshots/'
# Config Settings for DLT Data
config['dlt_pipeline'] = _mount_root + '/dlt_pipeline_pos'

# Identify Database for Data Objects and initialize it
database_name = 'pos_dlt'
config['database'] = database_name




In [0]:
from pyspark.sql.types import *
import pyspark.sql.functions as f
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
import datetime, time
import json
import random


In [0]:
# Reset the DLT environment: remove the pipeline storage location from the
# previous run (second argument True = recursive delete).
dlt_storage_path = config['dlt_pipeline']
dbutils.fs.rm(dlt_storage_path, True)

## Step 1: Assemble Inventory Change Records
#### - Represent events impacting inventory in a store location.
#### - Events include sales transactions, loss, damage, theft, replenishment, and BOPIS.
#### - Consolidated into a single stream of inventory change event records.
#### - Each event type identified by a change type identifier.
#### - Events may involve multiple products (items).
#### - Group data by transaction ID for efficient transmission.

In [0]:
## Schema for the raw inventory change records (one row per item per transaction)
Schema_IC = (
    StructType()
    .add('trans_id', StringType())          # Transaction ID
    .add('item_id', IntegerType())          # Item ID
    .add('store_id', IntegerType())         # Store ID
    .add('date_time', TimestampType())      # Date and time of the transaction
    .add('quantity', IntegerType())         # Quantity of the item
    .add('change_type_id', IntegerType())   # Type of change (e.g., addition, removal)
)

# Inventory change data files, one per store (store 001 and the online store)
inventory_change_files = [
    config['inventory_change_store001_filename'],
    config['inventory_change_online_filename'],
]

## Load the raw files with the fixed schema (first row of each file is a header)
inventory_change_raw = spark.read.csv(
    inventory_change_files,
    header=True,
    schema=Schema_IC,
    timestampFormat='yyyy-MM-dd HH:mm:ss',
)

# Drop the first and last character of trans_id and pair each item with its quantity
inventory_change_items = (
    inventory_change_raw
    .withColumn('trans_id', f.expr("substring(trans_id, 2, length(trans_id)-2)"))
    .withColumn('item', f.struct('item_id', 'quantity'))
)

# One record per (date_time, trans_id): store and change type are taken from the
# first row of the group, and all items are gathered into a list
inventory_change_by_transaction = (
    inventory_change_items
    .groupBy('date_time', 'trans_id')
    .agg(
        f.first('store_id').alias('store_id'),
        f.first('change_type_id').alias('change_type_id'),
        f.collect_list('item').alias('items'),
    )
    .orderBy('date_time', 'trans_id')
)

# Materialize on the driver as a list of JSON strings, one per transaction
change_inventory = inventory_change_by_transaction.toJSON().collect()

display(spark.read.json(sc.parallelize(change_inventory)))

In [0]:
### Inspect the first inventory change record as a Python dict
# Parse with json.loads rather than eval: the records are JSON text, and eval
# fails on JSON literals such as null/true/false and would execute any
# expression embedded in the data.
json.loads(change_inventory[0])

## Step 2: Assemble Inventory Snapshots
##### Inventory snapshots represent the physical counts taken of products sitting in inventory.
##### Such counts are periodically taken to capture the true state of a store's inventory and are necessary given the challenges most retailers encounter in inventory management.

##### With each snapshot, we capture basic information about the store, item, and quantity on hand, along with the date/time and the employee associated with the count. So that the impact of inventory snapshots is reflected more rapidly in our streams, we simulate a complete recount of products in a store every 5 days. This is far more aggressive than would occur in the real world, but it helps to demonstrate the streaming logic.

In [0]:
# Schema of the inventory snapshot records (SCHEMA_IS); there is one snapshot
# data file per store.
Schema_IS = StructType([
    StructField(column_name, column_type())
    for column_name, column_type in (
        ('item_id', IntegerType),
        ('employee_id', IntegerType),
        ('store_id', IntegerType),
        ('date_time', TimestampType),
        ('quantity', IntegerType),
    )
])

# Snapshot files for store 001 and the online store
inventory_snapshot_files = [
    config['inventory_snapshot_store001_filename'],
    config['inventory_snapshot_online_filename'],
]

# Load the snapshot files with the fixed schema (first row of each file is a header)
snapshot_reader = spark.read
snapshots_inventory = snapshot_reader.csv(
    inventory_snapshot_files,
    schema=Schema_IS,
    header=True,
    timestampFormat='yyyy-MM-dd HH:mm:ss',
)

display(snapshots_inventory)


In [0]:
# Show the third snapshot record as a Python dict.
# json.loads replaces eval (the records are JSON text; eval breaks on
# null/true/false and executes arbitrary expressions), and take(3) avoids
# pulling the whole snapshot dataset to the driver just to read one row.
json.loads(snapshots_inventory.toJSON().take(3)[2])

In [0]:
# Distinct (date_time, store_id) pairs, i.e. when each store took a snapshot.
# The chronological ordering matters: the transmission logic further down
# relies on this list being sorted by date_time.
snapshot_time_columns = ['date_time', 'store_id']
distinct_snapshot_times = snapshots_inventory.select(*snapshot_time_columns).distinct()
inventory_snapshot_times = distinct_snapshot_times.orderBy('date_time')

# display snapshot times
display(inventory_snapshot_times)

In [0]:
# The previous cell already builds inventory_snapshot_times (distinct
# date_time/store_id pairs ordered by date_time). This cell used to repeat that
# query verbatim; the redundant recomputation is removed and only the display
# is kept.
display(inventory_snapshot_times)


##### Send the Inventory Snapshot and Change Inventory data to different topics on the MSK cluster
##### Create two topics and send the data to the producer

In [0]:
## Build the boto3 session from the AWS settings stored in config
aws_settings = config['aws']
session = boto3.Session(
    region_name=aws_settings['region_name'],
    aws_access_key_id=aws_settings['access_key_id'],
    aws_secret_access_key=aws_settings['secret_access_key'],
)

In [0]:
# Create an MSK client
msk_client = session.client('kafka')

# Look up the cluster's bootstrap broker string; later cells connect to Kafka
# through `bootstrap_servers`.
try:
    response = msk_client.get_bootstrap_brokers(
        ClusterArn=config['aws']['cluster_arn'],
    )
    bootstrap_servers = response['BootstrapBrokerString']
except NoCredentialsError:
    print("No credentials available. Please check your AWS credentials.")
    # Re-raise instead of calling exit(1): exit() is not a reliable way to stop
    # a notebook kernel, whereas a raised error fails this cell, halts
    # "Run All", and keeps the original traceback.
    raise
except Exception as e:
    print(f"Error getting bootstrap brokers: {e}")
    raise

# Kafka Admin Client operations: create the two topics if they do not exist yet.
# Errors are reported and the cell continues (best effort), as before.

# Reset the handle first so the cleanup below never reuses a stale (already
# closed) client left over from an earlier run of this cell.
admin_client = None
try:
    admin_client = KafkaAdminClient(
        bootstrap_servers=bootstrap_servers,
        security_protocol='PLAINTEXT',
        client_id=socket.gethostname(),
    )
    existing_topics = admin_client.list_topics()

    # Create only the topics that are missing from the cluster
    topics = ['InventorySnapshot', 'ChangeInventoryData']
    missing_topics = [topic for topic in topics if topic not in existing_topics]
    new_topics = [
        NewTopic(name=topic, num_partitions=1, replication_factor=1)
        for topic in missing_topics
    ]

    if new_topics:
        admin_client.create_topics(new_topics=new_topics, validate_only=False)
        # Report the topic names; the NewTopic objects themselves print as
        # opaque object reprs.
        print(f'Created New Topic: {missing_topics}')
    else:
        print('Topics already exist')

except NoBrokersAvailable:
    print("No brokers available. Please check the broker endpoint and network connectivity.")
except KafkaTimeoutError:
    print("Kafka timeout error. Failed to update metadata after 60.0 secs. Please check the broker endpoint and network connectivity.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")
finally:
    # List topics and close the admin client to release resources
    if admin_client is not None:  # only when the client was created in this run
        try:
            cluster_topics = admin_client.list_topics()
            print("Topics in the cluster:", cluster_topics)
        except Exception as e:
            print(f"Error listing topics: {e}")
        admin_client.close()


## Sending data to Kafka and the S3 bucket
1. **Random Sampling of Change Inventory Data**
   - Randomly sample 100 records from the `change_inventory` dataset.

2. **Event Speed Factor and Message Size Configuration**
   - Set the event speed factor to 10x real-time speed.
   - Define the maximum message size as 256KB.

3. **Iterate Through Sampled Events**
   - For each event in the sampled data:
     - Extract the datetime from the transaction document.
     - Compare the event datetime with snapshot datetimes.

4. **Inventory Snapshot Transmission**
   - For each snapshot datetime that the event datetime exceeds:
     - Extract snapshot data for the specific datetime and store ID.
     - Transmit the snapshot data to S3 as a CSV file.
     - Remove the snapshot datetime from the list after transmission.

5. **Calculate Delay Between Events**
   - Calculate the delay between the current event and the previous event.
   - Adjust the delay by the event speed factor.

6. **Transmit Inventory Change Event**
   - If the event size exceeds the maximum message size:
     - Split the items in the event and send them individually to Kafka.
     - Pause transmission every 25 items to avoid overwhelming MSK.
   - Otherwise, send the entire transaction document to Kafka.

7. **Update Last Event Datetime**
   - Set the last event datetime for the next iteration.

In [0]:
# Initialize Kafka producer
def _encode_json_value(value):
    """Serialize a message value to UTF-8 encoded JSON bytes."""
    return json.dumps(value).encode('utf-8')


producer = KafkaProducer(
    value_serializer=_encode_json_value,
    bootstrap_servers=bootstrap_servers,
)

# Function to send messages to Kafka
def send_to_kafka(producer, topic, message):
    """Send one message to a Kafka topic and flush the producer.

    Args:
        producer: Object exposing ``send(topic, value=...)`` and ``flush()``.
        topic: Name of the destination topic.
        message: Value passed to ``producer.send`` as ``value``.

    Returns:
        Whatever ``producer.send`` returned, so callers can inspect the
        delivery result. Earlier versions returned ``None``; callers that
        ignore the return value are unaffected.
    """
    send_result = producer.send(topic, value=message)
    # Flush right away so the message is pushed out before the caller reports it as sent.
    producer.flush()
    return send_result



In [0]:
# Transmit a random sample of inventory change events to Kafka and, as event
# time passes each snapshot time, upload the matching inventory snapshot to S3.

# Randomly sample change inventory data: at most 100 events, fewer when the
# dataset is smaller (random.sample raises ValueError if asked for more
# elements than the population holds).
sample_size = min(100, len(change_inventory))
change_inventory_sample = random.sample(change_inventory, sample_size)

max_msg_size = 256 * 1024  # event message to MSK cannot exceed 256KB

# One S3 client serves every upload in this cell.
s3_client = session.client('s3')

# Snapshot (date_time, store_id) pairs still waiting to be transmitted, in
# date_time order. They are collected once into a driver-side list so that the
# loop neither launches a Spark job per event nor reassigns the shared
# inventory_snapshot_times DataFrame (which made the cell non-repeatable).
pending_snapshot_times = inventory_snapshot_times.collect()

for event in change_inventory_sample:
    # extract datetime from transaction document
    d = json.loads(event)  # parse json as a dictionary
    dt = datetime.datetime.strptime(d['date_time'], '%Y-%m-%dT%H:%M:%S.000Z')

    # inventory snapshot transmission
    # -----------------------------------------------------------------------
    # Send every pending snapshot whose time the event time has reached. The
    # event datetime used to be parsed but never compared, so all snapshots
    # were uploaded on the very first event.
    while pending_snapshot_times:
        snapshot_dt, store_id = pending_snapshot_times[0]

        # pending list is ordered by date_time: once the next snapshot lies
        # after this event there is nothing more to transmit for now
        if dt < snapshot_dt:
            break

        # extract snapshot data for this dt
        snapshot_pd = (
            snapshots_inventory
            .filter(f.expr("store_id='{}' AND date_time='{}'".format(store_id, snapshot_dt)))
            .withColumn('date_time', f.expr("date_format(date_time, 'yyyy-MM-dd HH:mm:ss')"))  # render the timestamp as a plain string
            .toPandas()
        )

        # transmit to S3 as csv (no header row, no index column)
        snapshot_filename = 'inventory_snapshot_{}_{}.csv'.format(
            store_id, snapshot_dt.strftime('%Y-%m-%d_%H-%M-%S')
        )
        s3_client.put_object(
            Bucket="real-time-pos-msk",
            Key="inventory_snapshots/" + snapshot_filename,
            Body=snapshot_pd.to_csv(index=False, header=False)
        )

        # Print confirmation of file sent to S3
        print(f'Sent to S3: {snapshot_filename}')

        # this snapshot is done; drop it from the pending list
        pending_snapshot_times.pop(0)
        print('Loaded inventory snapshot for {}'.format(snapshot_dt.strftime('%Y-%m-%d %H:%M:%S')))
    # -----------------------------------------------------------------------

    # inventory change event transmission
    # -----------------------------------------------------------------------
    # fixed 1-second pause between events
    time.sleep(1)

    # Measure the message as the producer's JSON serializer will encode it
    # (UTF-8 bytes of json.dumps), not the character count of the Spark JSON.
    payload_size = len(json.dumps(d).encode('utf-8'))

    # send items individually if json document too large
    if payload_size > max_msg_size:
        items = d.pop('items')  # retrieve items collection
        for i, item in enumerate(items):  # for each item
            d['items'] = [item]  # add a one-item items-collection
            send_to_kafka(producer, 'ChangeInventoryData', d)  # send message

            # Print confirmation of message sent to Kafka
            print(f'Sent to Kafka: {json.dumps(d)}')

            if (i + 1) % 25 == 0:  # pause transmission every 25th item to avoid overwhelming MSK
                time.sleep(1)
    else:  # send whole transaction document
        send_to_kafka(producer, 'ChangeInventoryData', d)

        # Print confirmation of message sent to Kafka
        print(f'Sent to Kafka: {json.dumps(d)}')

    # -----------------------------------------------------------------------

In [0]:
# from kafka import KafkaConsumer
# import json

# # Initialize Kafka consumer
# consumer = KafkaConsumer(
#     'ChangeInventoryData',
#     bootstrap_servers=bootstrap_servers,
#     auto_offset_reset='latest',
#     enable_auto_commit=True,
#     group_id='my-group',
#     value_deserializer=lambda x: json.loads(x.decode('utf-8'))
# )

# # Function to consume data from Kafka topic and display it
# def consume_from_kafka(consumer):
#     for message in consumer:
#         data = message.value
#         display(data)

# # Start consuming data
# consume_from_kafka(consumer)