# 1. Producing the data
## 1.1 Process Event Producer

In [None]:
# Import all the required libraries
from time import sleep
from json import dumps
from kafka import KafkaProducer
import random
import csv
import datetime as dt
from pytz import timezone

In [None]:
# Define a function to read the csv file into a list of dictionaries (one per row)
def read_csv(fileName):
    """Load every data row of a CSV file as a dictionary.

    Args:
        fileName: Path of the CSV file to read. Its first row supplies the
            dictionary keys.

    Returns:
        A list holding one dictionary per data row, in file order.
    """
    # newline='' leaves line-ending handling to the csv module
    with open(fileName, newline='') as handle:
        return list(csv.DictReader(handle))

In [None]:
# Define a function to publish the final message
def publish_message(producer_instance, topic_name, data):
    """Send one payload through the producer and report the outcome on stdout.

    Args:
        producer_instance: Object exposing ``send(topic, value)``.
        topic_name: Topic handed to ``send``.
        data: Payload handed to ``send``.

    Any ``Exception`` raised while sending or reporting is printed instead of
    being propagated, so the function always returns ``None``.
    """
    try:
        producer_instance.send(topic_name, data)
        # Reported right after send() returns; kept inside the try so a
        # failing str(data) is handled like a failing send
        print('Message published successfully. Data: {!s}'.format(data))
    except Exception as error:
        print('Exception in publishing message.')
        print(error)

In [None]:
# Define a function to connect to the kafka producer
def connect_kafka_producer():
    """Create a Kafka producer that JSON-encodes every message value.

    The producer targets ``localhost:9092`` and serializes values with
    ``json.dumps`` followed by ASCII encoding.

    Returns:
        The ``KafkaProducer`` instance, or ``None`` when creating it raised an
        ``Exception`` (the error is printed rather than propagated).
    """
    # The result is returned from the try/except branches rather than from a
    # ``finally`` clause: a ``return`` inside ``finally`` would also discard
    # KeyboardInterrupt and SystemExit raised while connecting.
    try:
        return KafkaProducer(bootstrap_servers=['localhost:9092'],
                             # dumps() escapes non-ASCII characters by default,
                             # so encoding its output as ASCII is safe
                             value_serializer=lambda x: dumps(x).encode('ascii'),
                             api_version=(0, 10))
    except Exception as ex:
        print('Exception while connecting Kafka.')
        print(str(ex))
        return None

In [None]:
# Define a function to get the first lot of messages
def publish1(cRows, ts):
    """Take the opening batch of rows and stamp each one with ``ts``.

    Args:
        cRows: Sequence of row dictionaries to draw from.
        ts: Dictionary merged into every selected row (the rows are updated
            in place).

    Returns:
        A tuple ``(batch, cutoff)``: the selected rows and the randomly drawn
        batch size (10 to 50 inclusive), which the caller uses as the start
        offset of the next batch. ``cutoff`` may exceed ``len(cRows)``.
    """
    # Random batch size for this round
    cutoff = random.randint(10, 50)
    batch = cRows[:cutoff]
    # Stamp the selected rows; the slice shares its dictionaries with cRows
    for record in batch:
        record.update(ts)
    return batch, cutoff

# Define a function to get the messages after the first set of messages have been published
def publish2(cRows, start, ts):
    """Take the next batch of rows after ``start`` and stamp each with ``ts``.

    Args:
        cRows: Sequence of row dictionaries to draw from.
        start: Offset of the first row of this batch.
        ts: Dictionary merged into every selected row (the rows are updated
            in place).

    Returns:
        A tuple ``(batch, next_start)``: the selected rows (empty once
        ``start`` is past the end of ``cRows``) and the offset at which the
        following batch begins.
    """
    # The batch spans a random 10 to 50 rows beyond the current offset
    stop = start + random.randint(10, 50)
    window = cRows[start:stop]
    # Stamp the selected rows; the slice shares its dictionaries with cRows
    for record in window:
        record.update(ts)
    # The end of this batch is the start of the next one
    return window, stop

# Define a function to call the publish2 function created above for each machine filtered data and append it as returned
# Also, check if there is no data returned
def publish2_final(start, machine, data, ts):
    """Append one machine's next batch to ``data``, wrapping around at the end.

    Reads the module-level ``cRows`` list, keeps the rows whose ``'machine'``
    value equals ``machine`` and asks ``publish2`` for the batch beginning at
    ``start``. When that batch is empty the rows are exhausted, so the
    sequence restarts from the beginning through ``publish1``.

    Args:
        start: Offset into this machine's rows where the batch begins.
        machine: Machine id compared against each row's ``'machine'`` value.
        data: List of batches collected so far; the new batch is appended to
            it in place.
        ts: Dictionary merged into every selected row.

    Returns:
        A tuple ``(data, next_start)``: the same ``data`` list and the offset
        to pass as ``start`` on the next call for this machine.
    """
    machine_rows = [row for row in cRows if row['machine'] == machine]
    batch, next_start = publish2(machine_rows, start, ts)
    if not batch:
        # Nothing left past `start`: begin again from the first row
        batch, next_start = publish1(machine_rows, ts)
    data.append(batch)
    return data, next_start

In [None]:
if __name__ == '__main__':

    topic = 'process_stream'
    # Machine ids whose rows are streamed, in publishing order
    machines = ('4', '5', '6', '7', '8')
    # Must stay a module-level name: publish2_final reads `cRows` directly
    cRows = read_csv('data_and_models/Streaming_Linux_process.csv')

    print('Publishing records...')
    producer = connect_kafka_producer()

    if producer is None:
        # connect_kafka_producer already printed the cause; without a producer
        # every publish attempt would fail, so stop instead of looping forever
        print('No Kafka producer available; nothing was published.')
    else:
        # Per-machine offset of the next batch; a machine is absent until its
        # first batch has been published
        next_start = {}
        try:
            while True:
                # Define an empty list for the data to be published
                data = []
                # Generate a UTC timestamp to be added in the data
                ts = {'ts': int(dt.datetime.now(timezone('UTC')).timestamp())}

                for machine in machines:
                    if machine in next_start:
                        # Later rounds: continue from the stored offset
                        # (restarts from the first row once exhausted)
                        data, next_start[machine] = publish2_final(
                            next_start[machine], machine, data, ts)
                    else:
                        # First round: take the opening batch for this machine
                        machine_rows = [d for d in cRows if d['machine'] == machine]
                        first_batch, next_start[machine] = publish1(machine_rows, ts)
                        data.append(first_batch)

                # Create a flat list of all the machines' data to be published
                data = [item for sublist in data for item in sublist]
                # Publish the data
                publish_message(producer, topic, data)

                # Create a delay of 5 seconds between the messages
                sleep(5)
                print('---------------------------------------------------------------')
        finally:
            # The loop only ends through an exception such as an interrupt:
            # push out buffered messages and release the connection
            producer.flush()
            producer.close()