# 1. Producing the data
## 1.2 Memory Event Producer

In [None]:
# Import all the required libraries
from time import sleep
from json import dumps
from kafka import KafkaProducer
import random
import csv
import datetime as dt
from pytz import timezone

In [None]:
# Load a csv file as a list of per-row dictionaries
def read_csv(fileName):
    """Read ``fileName`` and return its rows as a list of dictionaries.

    ``csv.DictReader`` is used with its defaults, so the first line of the
    file supplies the keys and every value is kept as the string read from
    the file.
    """
    # newline='' leaves newline handling to the csv module
    with open(fileName, newline='') as handle:
        return list(csv.DictReader(handle))

In [None]:
# Send one message to a topic and report the outcome on stdout
def publish_message(producer_instance, topic_name, data):
    """Pass ``data`` to ``producer_instance.send`` for ``topic_name``.

    Prints a confirmation containing ``str(data)`` once ``send`` returns.
    If anything in that sequence raises an ``Exception``, a notice and the
    exception text are printed instead and the error is not propagated.
    Always returns ``None``.
    """
    try:
        producer_instance.send(topic_name, data)
        confirmation = 'Message published successfully. Data: ' + str(data)
        print(confirmation)
    except Exception as err:
        # Best effort: report the failure and let the caller carry on
        print('Exception in publishing message.')
        print(str(err))

In [None]:
# Define a function to connect to the kafka producer
def connect_kafka_producer():
    """Create a ``KafkaProducer`` for the broker at ``localhost:9092``.

    Message values are serialised with ``json.dumps`` and encoded to ASCII
    bytes before being sent.

    Returns:
        The producer instance, or ``None`` when constructing it raised an
        ``Exception`` (the error text is printed, not re-raised).
    """
    _producer = None
    try:
        _producer = KafkaProducer(bootstrap_servers = ['localhost:9092'],
                                  value_serializer = lambda x: dumps(x).encode('ascii'),
                                  api_version = (0, 10))
    except Exception as ex:
        print('Exception while connecting Kafka.')
        print(str(ex))
    # Return after the try statement instead of from a ``finally`` clause:
    # a ``return`` inside ``finally`` discards whatever exception is in
    # flight, which would also swallow KeyboardInterrupt / SystemExit.
    return _producer

In [None]:
# Select the first lot of X rows and stamp them with the timestamp
def publish1_x(cRows, ts):
    """Take a random-length prefix of ``cRows`` and merge ``ts`` into each row.

    The stop index comes from ``random.randint(20, 80)``. The selected row
    dictionaries are updated in place, so the same objects inside ``cRows``
    carry the new keys as well. Nothing is sent from here.

    Returns:
        (rows, stop): the selected rows and the drawn stop index, which the
        caller uses as the start point for the next selection.
    """
    stop = random.randint(20, 80)
    batch = cRows[:stop]
    for row in batch:
        row.update(ts)
    return batch, stop

# Select a follow-up lot of X rows, starting where the previous Y rows ended
def publish2_x(cRows, end_y, ts):
    """Take the rows of ``cRows`` from ``end_y`` onward and merge ``ts`` into them.

    The slice length is requested as ``random.randint(20, 80)`` rows; fewer
    (possibly none) come back when ``cRows`` runs out. The selected row
    dictionaries are updated in place. Nothing is sent from here.

    Returns:
        (rows, stop): the selected rows and ``end_y`` plus the drawn length,
        which the caller uses as the start point for the next selection.
    """
    start = end_y
    stop = start + random.randint(20, 80)
    batch = cRows[start:stop]
    for row in batch:
        row.update(ts)
    return batch, stop

# Select the Y rows that follow the most recent X rows
def publish_y(cRows, end_x, ts):
    """Take the rows of ``cRows`` from ``end_x`` onward and merge ``ts`` into them.

    The slice length is requested as ``random.randint(0, 5)`` rows, so the
    result can be empty; it is also cut short when ``cRows`` runs out. The
    selected row dictionaries are updated in place. Nothing is sent from here.

    Returns:
        (rows, stop): the selected rows and ``end_x`` plus the drawn length,
        which the caller uses as the start point for the next selection.
    """
    start = end_x
    stop = start + random.randint(0, 5)
    batch = cRows[start:stop]
    for row in batch:
        row.update(ts)
    return batch, stop

# Add the next lot of X rows for one machine to the outgoing data,
# starting over from the first rows once that machine's rows are used up
def publish2_x_final(end, machine, data, ts):
    """Append the next X selection for ``machine`` to ``data``.

    The rows are taken from the module-level ``cRows`` whose ``'machine'``
    value equals ``machine``. ``publish2_x`` is tried first from offset
    ``end``; if it yields no rows, the selection restarts from the beginning
    via ``publish1_x``. The chosen list is appended to ``data`` as one
    element (``data`` is modified in place).

    Returns:
        (data, end_ret): the same ``data`` list and the stop index reported
        by whichever selection function produced the appended rows.
    """
    machine_rows = [row for row in cRows if row['machine'] == machine]
    machine_data, end_ret = publish2_x(machine_rows, end, ts)
    if not machine_data:
        # Nothing left past ``end``: wrap around to the first sequence
        machine_data, end_ret = publish1_x(machine_rows, ts)
    data.append(machine_data)
    return data, end_ret

In [None]:
def _current_ts():
    """Return ``{'ts': <current UTC epoch seconds as int>}`` for merging into rows."""
    return {'ts': int(dt.datetime.now(timezone('UTC')).timestamp())}


def _publish_batches(producer_instance, topic_name, batches):
    """Flatten the per-machine row lists in ``batches`` and publish them as one message."""
    flat = [item for sublist in batches for item in sublist]
    publish_message(producer_instance, topic_name, flat)


if __name__ == '__main__':

    topic = 'memory_stream'
    # Machines whose rows are streamed, in the order they are added to each message
    machines = ['4', '5', '6', '7', '8']
    cRows = read_csv('data_and_models/Streaming_Linux_memory.csv')
    # Filter once up front instead of on every iteration: a row's 'machine'
    # value is not changed while streaming (only the 'ts' key is merged in)
    rows_by_machine = {
        machine: [d for d in cRows if d['machine'] == machine]
        for machine in machines
    }

    print('Publishing records..')
    producer = connect_kafka_producer()
    if producer is None:
        # connect_kafka_producer() has already printed the cause; without a
        # producer every publish attempt would fail, so stop here
        raise SystemExit('Kafka producer could not be created; stopping.')

    # Per-machine offsets: where the latest X selection and Y selection ended
    end_x = {}
    end_y = {}

    # First message: one X selection per machine, all sharing one timestamp
    ts = _current_ts()
    data = []
    for machine in machines:
        data_x, end_x[machine] = publish1_x(rows_by_machine[machine], ts)
        data.append(data_x)
    _publish_batches(producer, topic, data)

    # Every following message: after a 10 second pause, the Y rows that follow
    # the previous X selection plus the next X selection for each machine,
    # all stamped with the same freshly generated timestamp
    while True:
        # Create a delay of 10 seconds between the messages
        sleep(10)
        print('---------------------------------------------------------------')
        ts = _current_ts()
        data = []

        # Y rows first, for every machine
        for machine in machines:
            data_y, end_y[machine] = publish_y(rows_by_machine[machine], end_x[machine], ts)
            data.append(data_y)

        # Then the next X rows; publish2_x_final restarts a machine from its
        # first rows once its data is used up
        for machine in machines:
            data, end_x[machine] = publish2_x_final(end_y[machine], machine, data, ts)

        _publish_batches(producer, topic, data)