# Producer for the HealthCosts dataset(s)

Define the Kafka client error callback and the message delivery callback

In [None]:

def error_cb(err):
    """Handle a generic client-level error reported by the Kafka client.

    The error is always printed. Such errors are generally informational,
    because the client tries to recover from them automatically and the
    application normally has nothing to do. For this example two conditions
    are treated as fatal instead: no broker being reachable
    (_ALL_BROKERS_DOWN) and authentication failure (_AUTHENTICATION).

    Parameters:
        err: Error object handed over by the client; must expose ``code()``.

    Raises:
        KafkaException: If the error code is one of the two fatal codes.
    """
    print("Client error: {}".format(err))

    fatal_codes = (KafkaError._ALL_BROKERS_DOWN, KafkaError._AUTHENTICATION)
    if err.code() not in fatal_codes:
        return

    # Any exception raised from this callback is re-raised from the flush()
    # or poll() call that triggered it.
    raise KafkaException(err)


def acked(err, msg):
    """
        Report the outcome of a produce attempt by printing it.

        Parameters:
            err: None when the message was produced successfully, otherwise
                 the error describing the failure.
            msg: The message the report refers to; it is printed via str().
    """
    if err is None:
        print("Message produced: %s" % (str(msg)))
        return

    print("Failed to deliver message: %s: %s" % (str(msg), str(err)))

Import libraries and load the Confluent connection settings

In [None]:
from confluent_kafka import Consumer
from time import sleep
import uuid
from confluent_kafka import Producer, Consumer, KafkaError, KafkaException
from confluent_kafka.admin import AdminClient, NewTopic
import json

#KAFKA variables, get from your cluster and put into a config file
from config import confluentClusterName
from config import confluentBootstrapServers
from config import confluentTopicName
from config import schemaRegistryUrl
from config import confluentApiKey
from config import confluentSecret
from config import confluentRegistryApiKey
from config import confluentRegistrySecret

Create the Kafka admin client (used to create or delete topics)

In [None]:
# Create the admin client used for topic management (creating/deleting topics).
#
# Only connection and authentication settings are passed. 'group.id' and
# 'auto.offset.reset' are consumer-only properties that an admin client does
# not use, so they are intentionally left out.
admin_client = AdminClient({
    'bootstrap.servers': confluentBootstrapServers,
    'sasl.mechanism': 'PLAIN',
    'security.protocol': 'SASL_SSL',
    'sasl.username': confluentApiKey,
    'sasl.password': confluentSecret,
    'error_cb': error_cb,  # raises on all-brokers-down / authentication errors
})

In [None]:
# #If needed, we can delete a topic to "reset" it

# try:
#     topics =['insurance-capstone2']
#     fs = admin_client.delete_topics(topics, request_timeout=30)

#     for topic, f in fs.items():
#         try:
#             f.result()  # The result itself is None
#             print("Topic {} deleted".format(topic))
#         except Exception as e:
#             print("Failed to delete topic {}: {}".format(topic, e))
# except Exception as e:
#     print(e)


Add a topic

In [None]:
# # Add topic
# topic_list = []
# topic_list.append(NewTopic(confluentTopicName, 1, 3))
# admin_client.create_topics(topic_list)
# futures = admin_client.create_topics(topic_list)
# try:
#     record_metadata = []
#     for k, future in futures.items():
#         # f = i.get(timeout=10)
#         print(f"type(k): {type(k)}")
#         print(f"type(v): {type(future)}")
#         print(future.result())
# except KafkaError:
#     # Decide what to do if produce request failed...
#     print(traceback.format_exc())
#     result = 'Fail'
# finally:
#     print("finally")

Create the producer object

In [None]:
# Create the producer object that publishes the HealthCosts records.
#
# Only connection and authentication settings are passed. 'group.id' and
# 'auto.offset.reset' are consumer-only properties: a producer does not join
# a consumer group or reset offsets, so they are intentionally left out.
p = Producer({
    'bootstrap.servers': confluentBootstrapServers,
    'sasl.mechanism': 'PLAIN',
    'security.protocol': 'SASL_SSL',
    'sasl.username': confluentApiKey,
    'sasl.password': confluentSecret,
    'error_cb': error_cb,  # raises on all-brokers-down / authentication errors
})

Define the mount point used to read in the data files

In [None]:
# Mount the capstone container
from config import storageAccount
from config import storageContainer
from config import clientSecret
from config import clientid
from config import mount_point
    
    
# OAuth (client-credentials) settings passed to the mount as extra configs.
# NOTE(review): the tenant ID is hard-coded in the token endpoint below;
# consider moving it into config alongside the other credentials.
configs = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": clientid,
    "fs.azure.account.oauth2.client.secret": clientSecret,
    "fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/d46b54b2-a652-420b-aa5a-2ef7f8fc706e/oauth2/token",
    "fs.azure.createRemoteFileSystemDuringInitialization": "true",
}

# Best-effort unmount so the mount below starts from a clean state. A failure
# here (presumably because nothing is mounted yet) is reported but not fatal.
# Catching Exception instead of using a bare except keeps KeyboardInterrupt
# and SystemExit from being swallowed.
try:
    dbutils.fs.unmount(mount_point)
except Exception as exc:
    print(f"Skipping unmount of {mount_point}: {exc}")

# Mount the storage container at mount_point using the OAuth settings above.
dbutils.fs.mount(
    source=f"abfss://{storageContainer}@{storageAccount}.dfs.core.windows.net/",
    mount_point=mount_point,
    extra_configs=configs,
)

In [None]:
%fs
ls /mnt/capstone-group2-data/datain/rawdata/HealthCosts

In [None]:
import random
from time import sleep

# Load the insurance CSV from the mounted container; the file's first line
# is used as the header row.
df = spark.read.csv(
    '/mnt/capstone-group2-data/datain/rawdata/HealthCosts/insurance.csv',
    header='true',
)

# Dictionary that holds one HealthCosts record at a time.
combinedDict = {}

In [None]:
# Publish every row of the HealthCosts DataFrame to Kafka, one JSON message
# per row, pausing between messages.

# Collect the rows once up front. Calling df.collect() for every cell would
# pull the whole dataset to the driver rows x columns times.
rows = df.collect()
columns = df.columns
data_entries = len(rows)

for i, row in enumerate(rows):
    # To randomize the data feed from the file, pick the row at random instead:
    # row = rows[random.randint(0, data_entries - 1)]

    # Build the record for this row, keyed by column name in column order.
    combinedDict = {data_col: row[data_col] for data_col in columns}

    # acked prints the delivery result for each message; the callback is
    # served from flush(), so delivery failures are no longer silent.
    p.produce(confluentTopicName, json.dumps(combinedDict), callback=acked)
    p.flush()
    sleep(5)
    print(f'# {i}', combinedDict)
    