In [6]:
import json
from time import sleep

from bs4 import BeautifulSoup

In [7]:

from kafka import KafkaConsumer, KafkaProducer

This step consumes the raw HTML from the `raw_recipes` topic, parses it and transforms it into JSON, and then publishes the result to the `parsed_recipes` topic. The code below fetches the HTML data from `raw_recipes`, parses it, and feeds it into `parsed_recipes`.

In [8]:
def publish_message(producer_instance, topic_name, key, value):
    """Send one UTF-8 encoded key/value pair to ``topic_name`` and flush.

    ``key`` and ``value`` are converted to bytes with UTF-8 before being
    handed to ``producer_instance.send``; the producer is flushed right after.
    Any ``Exception`` raised while encoding, sending or flushing is caught and
    reported on stdout. Nothing is returned.
    """
    try:
        # Encode the key first, then the value, before touching the producer.
        payload = {
            'key': bytes(key, 'utf-8'),
            'value': bytes(value, 'utf-8'),
        }
        producer_instance.send(topic_name, **payload)
        producer_instance.flush()
        print('Message published successfully (producer).')
    except Exception as error:
        print('Exception in publishing message', error, sep='\n')


def connect_kafka_producer():
    """Create a ``KafkaProducer`` for the broker at ``localhost:9092``.

    Returns:
        The ``KafkaProducer`` instance, or ``None`` when constructing it
        raises an ``Exception`` (the error is printed to stdout).
    """
    try:
        return KafkaProducer(bootstrap_servers=['localhost:9092'], api_version=(0, 10))
    except Exception as ex:
        print('Exception while connecting Kafka')
        print(str(ex))
        # Return from here rather than from a ``finally`` clause: a ``return``
        # inside ``finally`` would also discard KeyboardInterrupt/SystemExit,
        # making it impossible to interrupt the cell while connecting.
        return None


def parse(markup):
    """Extract the title, calories and ingredients from a recipe page.

    Args:
        markup: HTML of the recipe page; passed to BeautifulSoup with the
            ``'lxml'`` parser.

    Returns:
        A JSON string with the keys ``title``, ``calories`` and
        ``ingredients``:

        * ``title`` - text of the first ``.recipe-title`` element, or ``'-'``.
        * ``calories`` - the part before the first ``';'`` of the first string
          found under ``.recipe-nutrition-section .section-body`` (for example
          ``'817 calories'``), or the integer ``0`` when there is none.
        * ``ingredients`` - a list of ``{'step': text}`` dicts, one per
          non-empty ``.ingredients-item-name`` element that does not contain
          ``'Add all ingredients to list'``.

        If parsing raises an ``Exception`` the error is printed and the JSON
        string ``'{}'`` is returned.
    """
    title = '-'
    calories = 0
    ingredients = []
    rec = {}

    try:
        soup = BeautifulSoup(markup, 'lxml')
        title_section = soup.select('.recipe-title')
        ingredients_section = soup.select('.ingredients-item-name')

        # Flatten every text fragment of the nutrition blocks into one list.
        nutrition_section = [
            text
            for div in soup.select('.recipe-nutrition-section .section-body')
            for text in div.stripped_strings
        ]
        print(nutrition_section)
        if nutrition_section:
            # The first fragment looks like '817 calories; protein 8.4g; ...';
            # keep only the leading calories part.
            calories = nutrition_section[0].split(';')[0]

        for ingredient in ingredients_section:
            ingredient_text = ingredient.text.strip()
            if ingredient_text and 'Add all ingredients to list' not in ingredient_text:
                ingredients.append({'step': ingredient_text})

        if title_section:
            title = title_section[0].text

        rec = {'title': title,  'calories': calories,
               'ingredients': ingredients}
        print(rec)
    except Exception as ex:
        print('Exception while parsing')
        print(str(ex))

    # Deliberately not in a ``finally`` block: returning from ``finally`` would
    # also swallow KeyboardInterrupt/SystemExit raised while parsing.
    return json.dumps(rec)




In [9]:
if __name__ == '__main__':
    # Stage 1: drain the raw HTML topic and parse every page into a JSON string.
    print('Running Consumer..')
    parsed_records = []
    topic_name = 'raw_recipes'
    parsed_topic_name = 'parsed_recipes'

    # auto_offset_reset='earliest' starts from the oldest available message;
    # consumer_timeout_ms=1000 lets the for-loop end instead of blocking
    # forever once no new message arrives (see the kafka-python docs).
    consumer = KafkaConsumer(topic_name, auto_offset_reset='earliest',
                             bootstrap_servers=['localhost:9092'], api_version=(0, 10), consumer_timeout_ms=1000)
    try:
        for msg in consumer:
            html = msg.value
            result = parse(html)
            print (result)
            parsed_records.append(result)
    finally:
        # Release the connection even if parsing or iteration fails.
        consumer.close()
    sleep(5)

    # Stage 2: publish the parsed records to the downstream topic.
    if parsed_records:
        print('Publishing records..')
        producer = connect_kafka_producer()
        if producer is None:
            # connect_kafka_producer() has already printed the reason.
            print('Kafka producer unavailable; no records published.')
        else:
            try:
                for rec in parsed_records:
                    publish_message(producer, parsed_topic_name, 'parsed', rec)
            finally:
                producer.close()

Running Consumer..
['817 calories; protein 8.4g; carbohydrates 144.3g; fat 26g; sodium 458.4mg.', 'Full Nutrition']
{'title': 'Thai Sweet Sticky Rice With Mango (Khao Neeo Mamuang)', 'calories': '817 calories', 'ingredients': [{'step': '2 cups water'}, {'step': '1\u2009½ cups uncooked short-grain white rice'}, {'step': '1\u2009½ cups coconut milk, divided'}, {'step': '1 cup white sugar'}, {'step': '¾ teaspoon salt, divided'}, {'step': '1 tablespoon white sugar'}, {'step': '1 tablespoon tapioca starch'}, {'step': '3 mangos, peeled and sliced'}, {'step': '1 tablespoon toasted sesame seeds'}]}
{"title": "Thai Sweet Sticky Rice With Mango (Khao Neeo Mamuang)", "calories": "817 calories", "ingredients": [{"step": "2 cups water"}, {"step": "1\u2009\u00bd cups uncooked short-grain white rice"}, {"step": "1\u2009\u00bd cups coconut milk, divided"}, {"step": "1 cup white sugar"}, {"step": "\u00be teaspoon salt, divided"}, {"step": "1 tablespoon white sugar"}, {"step": "1 tablespoon tapioca star

You may delete the buffer topic:

bin/kafka-topics.sh --zookeeper localhost:2181 --delete --topic parsed_recipes


In [10]:
import json
from time import sleep

from kafka import KafkaConsumer

if __name__ == '__main__':
    parsed_topic_name = 'parsed_recipes'
    # Notify if a recipe has more than 200 calories
    calories_threshold = 200

    consumer = KafkaConsumer(parsed_topic_name, auto_offset_reset='earliest',
                             bootstrap_servers=['localhost:9092'], api_version=(0, 10), consumer_timeout_ms=1000)
    try:
        for msg in consumer:
            record = json.loads(msg.value)
            print(record)

            title = record.get('title', '-')
            # 'calories' is normally a string such as '817 calories', but the
            # parsing step emits the integer 0 when a page has no nutrition
            # section and an empty record when parsing failed; str() and
            # .get() let all of these be handled without crashing the loop.
            try:
                calories = int(str(record.get('calories', 0)).split(' ')[0])
            except ValueError:
                print('Skipping record with unreadable calories: {}'.format(title))
                continue

            if calories > calories_threshold:
                print('Alert: {} calories count is {}'.format(title, calories))
            sleep(3)
    finally:
        # Always release the connection, even when a record cannot be decoded.
        consumer.close()

{'title': 'Thai Sweet Sticky Rice With Mango (Khao Neeo Mamuang)', 'calories': '817 calories', 'ingredients': [{'step': '2 cups water'}, {'step': '1\u2009½ cups uncooked short-grain white rice'}, {'step': '1\u2009½ cups coconut milk, divided'}, {'step': '1 cup white sugar'}, {'step': '¾ teaspoon salt, divided'}, {'step': '1 tablespoon white sugar'}, {'step': '1 tablespoon tapioca starch'}, {'step': '3 mangos, peeled and sliced'}, {'step': '1 tablespoon toasted sesame seeds'}]}
Alert: Thai Sweet Sticky Rice With Mango (Khao Neeo Mamuang) calories count is 817
{'title': 'Thai Sweet Sticky Rice With Mango (Khao Neeo Mamuang)', 'calories': '817 calories', 'ingredients': [{'step': '2 cups water'}, {'step': '1\u2009½ cups uncooked short-grain white rice'}, {'step': '1\u2009½ cups coconut milk, divided'}, {'step': '1 cup white sugar'}, {'step': '¾ teaspoon salt, divided'}, {'step': '1 tablespoon white sugar'}, {'step': '1 tablespoon tapioca starch'}, {'step': '3 mangos, peeled and sliced'}, {