Prerequisite:
Run test_kafka_producer.ipynb
before running this notebook.

In [67]:
import json
import re
from time import sleep

from bs4 import BeautifulSoup

In [68]:

from kafka import KafkaConsumer, KafkaProducer

This notebook first consumes data from the raw_recipes topic, parses it and transforms it into JSON, and then publishes the result to the parsed_recipes topic. The code below fetches the HTML data from the raw_recipes topic, parses it, and feeds the parsed records into the parsed_recipes topic.

In [70]:
def publish_message(producer_instance, topic_name, key, value):
    """Serialize one record and send it to a Kafka topic.

    The key is encoded as UTF-8; the value is dumped to JSON and then encoded
    as UTF-8. The producer is flushed right after the send. Any ``Exception``
    raised along the way is reported on stdout instead of being propagated.

    Args:
        producer_instance: Object exposing ``send(topic, key=..., value=...)``
            and ``flush()``.
        topic_name: Name of the topic to send to.
        key: Message key as ``str``.
        value: JSON-serializable payload.

    Returns:
        None.
    """
    try:
        encoded_key = bytes(key, 'utf-8')
        payload = bytes(json.dumps(value), 'utf-8')
        # Echo the exact bytes that go on the wire.
        print(payload)
        producer_instance.send(topic_name, value=payload, key=encoded_key)
        producer_instance.flush()
        print('Message published successfully (producer).')
    except Exception as err:
        # Best-effort publish: report the failure and carry on.
        print('Exception in publishing message')
        print(str(err))


def connect_kafka_producer():
    """Create a Kafka producer connected to the local broker.

    Returns:
        A ``KafkaProducer`` configured for ``localhost:9092``, or ``None`` if
        constructing it raised an ``Exception`` (the error is printed).
    """
    try:
        return KafkaProducer(bootstrap_servers=['localhost:9092'], api_version=(0, 10))
    except Exception as ex:
        print('Exception while connecting Kafka')
        print(str(ex))
        # Return from the handler rather than from a ``finally`` clause: a
        # ``return`` inside ``finally`` would also silently discard
        # KeyboardInterrupt / SystemExit raised during the connection attempt.
        return None


 

In [71]:


def parse_nutrition_facts(htmldoc):
    """Extract the nutrition summary from a recipe page.

    Looks for ``<table class="mm-recipes-nutrition-facts-summary__table">``
    elements and reads each row as ``(amount, label)``: the first cell holds
    the amount (for example ``"341"`` or ``"20g"``) and the second the label
    (for example ``"Calories"``). Any alphabetic unit suffix is dropped and
    the amount is stored as ``int``.

    Rows with fewer than two cells, or whose first cell is not an integer
    with an optional unit suffix, are skipped instead of raising, so one
    malformed row does not abort the caller's processing loop.

    Args:
        htmldoc: HTML markup of the page, as accepted by ``BeautifulSoup``.

    Returns:
        dict: ``{'title': <page title>, <label>: <int amount>, ...}``, or an
        empty dict when no nutrition row could be parsed. The title falls back
        to ``"No title found"`` when the page has no usable ``<title>``.
    """
    soup = BeautifulSoup(htmldoc, 'html.parser')

    title_tag = soup.title
    # ``.string`` is None when <title> is empty or contains nested markup.
    if title_tag is not None and title_tag.string:
        title = title_tag.string
    else:
        title = "No title found"

    tables = soup.find_all('table', class_="mm-recipes-nutrition-facts-summary__table")
    facts = {}
    for table in tables:
        for row in table.find_all('tr'):
            # Extract headers (th) or data (td)
            cells = [cell.get_text(strip=True) for cell in row.find_all(['th', 'td'])]
            if len(cells) < 2:
                continue
            # Integer amount followed by an optional unit such as "g" or "mg".
            amount = re.match(r'([+-]?\d+)\s*[A-Za-z]*$', cells[0])
            if amount is None:
                continue
            facts[cells[1]] = int(amount.group(1))

    if not facts:
        return {}
    return {'title': title, **facts}

         
 
 

In [72]:
if __name__ == '__main__':
    # Stage 1: read the raw HTML messages and parse each into a nutrition dict.
    # Stage 2: publish every parsed dict to the parsed topic.
    print('Running Consumer..')
    parsed_records = []
    topic_name = 'raw_recipes'
    parsed_topic_name = 'parsed_recipes'

    # consumer_timeout_ms makes the iteration below stop once no new message
    # has arrived for that long, so the loop terminates on its own.
    consumer = KafkaConsumer(topic_name, auto_offset_reset='earliest',
                             bootstrap_servers=['localhost:9092'], api_version=(0, 10), consumer_timeout_ms=1000)
    try:
        for msg in consumer:
            html = msg.value
            result = parse_nutrition_facts(html)
            print(result)
            parsed_records.append(result)
    finally:
        # Release the consumer even if parsing a message raised.
        consumer.close()
    sleep(5)

    if len(parsed_records) > 0:
        print('Publishing records..')
        producer = connect_kafka_producer()
        if producer is None:
            # connect_kafka_producer() already printed the connection error;
            # report once instead of failing separately for every record.
            print('Kafka producer unavailable; records were not published.')
        else:
            try:
                for rec in parsed_records:
                    print(rec)
                    publish_message(producer, parsed_topic_name, 'parsed', rec)
            finally:
                # Close the producer once all records are sent.
                producer.close()

Running Consumer..
{'title': 'Thai Chopped Chicken Bean Salad Recipe', 'Calories': 341, 'Fat': 20, 'Carbs': 26, 'Protein': 17}
{'title': 'Thai Peanut Butter Ramen Recipe', 'Calories': 503, 'Fat': 27, 'Carbs': 57, 'Protein': 15}
{'title': 'Turkey Tom Kha Gai', 'Calories': 441, 'Fat': 27, 'Carbs': 13, 'Protein': 39}
{'title': 'Thai Chopped Chicken Bean Salad Recipe', 'Calories': 341, 'Fat': 20, 'Carbs': 26, 'Protein': 17}
{'title': 'Thai Peanut Butter Ramen Recipe', 'Calories': 503, 'Fat': 27, 'Carbs': 57, 'Protein': 15}
{'title': 'Turkey Tom Kha Gai', 'Calories': 441, 'Fat': 27, 'Carbs': 13, 'Protein': 39}
{'title': 'Thai Chopped Chicken Bean Salad Recipe', 'Calories': 341, 'Fat': 20, 'Carbs': 26, 'Protein': 17}
{'title': 'Thai Peanut Butter Ramen Recipe', 'Calories': 503, 'Fat': 27, 'Carbs': 57, 'Protein': 15}
{'title': 'Turkey Tom Kha Gai', 'Calories': 441, 'Fat': 27, 'Carbs': 13, 'Protein': 39}
{'title': 'Thai Chopped Chicken Bean Salad Recipe', 'Calories': 341, 'Fat': 20, 'Carbs': 2

You may delete the buffer topic:

bin/kafka-topics.sh --zookeeper localhost:2181 --delete --topic parsed_recipes


In [73]:
import json
from time import sleep

from kafka import KafkaConsumer

if __name__ == '__main__':
    # Read parsed recipe records and print an alert for each recipe whose
    # calorie count exceeds the threshold.
    parsed_topic_name = 'parsed_recipes'
    # Notify if a recipe has more than 200 calories
    calories_threshold = 200

    consumer = KafkaConsumer(parsed_topic_name, auto_offset_reset='earliest',
                             bootstrap_servers=['localhost:9092'], api_version=(0, 10), consumer_timeout_ms=1000)
    try:
        for msg in consumer:
            record = json.loads(msg.value)
            print(record)
            # Check for the key before the first record['Calories'] access;
            # otherwise a record without it raises KeyError and stops the loop.
            if 'Calories' not in record:
                continue

            print(record['Calories'])
            print('2', record)
            calories = record['Calories']
            title = record['title']

            if calories > calories_threshold:
                print('Alert: {} calories count is {}'.format(title, calories))
            sleep(3)
    finally:
        # Release the consumer even if a message could not be processed.
        consumer.close()

{'title': 'Thai Chopped Chicken Bean Salad Recipe', 'Calories': 341, 'Fat': 20, 'Carbs': 26, 'Protein': 17}
341
2 {'title': 'Thai Chopped Chicken Bean Salad Recipe', 'Calories': 341, 'Fat': 20, 'Carbs': 26, 'Protein': 17}
Alert: Thai Chopped Chicken Bean Salad Recipe calories count is 341
{'title': 'Thai Peanut Butter Ramen Recipe', 'Calories': 503, 'Fat': 27, 'Carbs': 57, 'Protein': 15}
503
2 {'title': 'Thai Peanut Butter Ramen Recipe', 'Calories': 503, 'Fat': 27, 'Carbs': 57, 'Protein': 15}
Alert: Thai Peanut Butter Ramen Recipe calories count is 503
{'title': 'Turkey Tom Kha Gai', 'Calories': 441, 'Fat': 27, 'Carbs': 13, 'Protein': 39}
441
2 {'title': 'Turkey Tom Kha Gai', 'Calories': 441, 'Fat': 27, 'Carbs': 13, 'Protein': 39}
Alert: Turkey Tom Kha Gai calories count is 441
{'title': 'Thai Chopped Chicken Bean Salad Recipe', 'Calories': 341, 'Fat': 20, 'Carbs': 26, 'Protein': 17}
341
2 {'title': 'Thai Chopped Chicken Bean Salad Recipe', 'Calories': 341, 'Fat': 20, 'Carbs': 26, 'Pro