# 5.0 Event Streaming

###### Author: Gan Yee Jing, Yeap Jie Shen
###### Last Edited: 02/09/2024

## 5.1 Kafka Streaming (Getting Raw News)
### 5.1.1 Importing Necessary Libraries

In [1]:
from pyspark.sql import SparkSession

import sys

sys.path.append(r'/home/student/RDS2S3G4_CLO2_B')

from data_stores.hbaseClient import HBaseClient

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from fake_useragent import UserAgent
from kafka import KafkaProducer
import json
import time

### 5.1.2 Instantiate Spark Session and HBase Client

In [2]:
# Get (or reuse) the Spark session named 'Kafka Streaming' for this notebook
spark = (
    SparkSession.builder
    .appName('Kafka Streaming')
    .getOrCreate()
)

# Project HBase client, pointed at localhost:9090
hbase_client = HBaseClient(host='localhost', port=9090)

24/09/02 14:21:35 WARN Utils: Your hostname, Gan. resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/09/02 14:21:35 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/02 14:21:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/09/02 14:21:36 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/09/02 14:21:36 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


### 5.1.3 Defining Source URL

In [3]:
# Selangor Journal crime-category listing page: the entry point for scraping
url = (
    'https://selangorjournal.my'
    '/category/current/crime/'
)

### 5.1.4 Scraping News URLs

In [4]:
# Collect the article links shown on the category listing page
news_urls = []

# Create a UserAgent instance so the browser sends a randomly chosen user-agent
user_agent = UserAgent()

# Setup webdriver
options = webdriver.ChromeOptions()
options.add_argument(f'user-agent={user_agent.random}')

# Open Chrome on Ubuntu
driver = webdriver.Chrome(options = options)

try:
    # Load the listing page
    driver.get(url)

    # Locate all news article link elements
    url_elements = driver.find_elements(By.CSS_SELECTOR, '.penci-link-post.penci-image-holder.penci-disable-lazy')

    # Retrieve each article url. A dedicated name is used so the source
    # listing `url` is not overwritten (keeps this cell safe to re-run).
    for url_element in url_elements:
        news_url = url_element.get_attribute('href')

        # get_attribute returns None when the element has no href; skip those
        if news_url:
            news_urls.append(news_url)
finally:
    # Always close the browser, even if loading or locating elements failed
    driver.quit()

### 5.1.5 Verifying Existence of Duplicates

In [5]:
# Retrieving urls from HBase
# Look up rows k0 .. k6689 of the 'news' table, requesting only the cf1:url column
row_keys = [f'k{i}' for i in range(6690)]
stored_rows = hbase_client.read_keys('news', row_keys, ['cf1:url'])

# Decode the stored url of each returned record
# NOTE(review): record[1] is presumably the column dict of a (key, columns) pair -
# confirm against HBaseClient.read_keys
url_records = []
for stored_row in stored_rows:
    url_records.append(stored_row[1][b'cf1:url'].decode('utf-8'))

In [6]:
# Turn the freshly scraped urls and the stored urls into sets for quick membership checks
new_urls_set, old_urls_set = set(news_urls), set(url_records)

# Urls that were scraped just now but are already stored
common_urls = old_urls_set & new_urls_set

# Keep only the scraped urls that are not already stored
valid_news_urls = []
for candidate in new_urls_set:
    if candidate in common_urls:
        continue
    valid_news_urls.append(candidate)

### 5.1.6 Scraping News 

In [7]:
# List to store all news data
data = []

# Retrieve the data of each new article. Every article gets its own browser
# session with a freshly randomised user-agent, and that session is always
# closed before moving on so no Chrome processes are left behind.
for news_url in valid_news_urls:
    # Create a UserAgent instance
    user_agent = UserAgent()

    # Setup webdriver
    options = webdriver.ChromeOptions()
    options.add_argument(f'user-agent={user_agent.random}')

    # Open Chrome on Ubuntu
    driver = webdriver.Chrome(options = options)

    try:
        # Get Request
        driver.get(news_url)

        # Retrieve headline
        headline = driver.find_element(By.TAG_NAME, "h1").text

        # Retrieve date of published
        date = driver.find_element(By.TAG_NAME, "time").get_attribute('datetime')

        # Retrieve news text: every paragraph followed by a single space
        content_wrapper = driver.find_element(By.CLASS_NAME,"dable-content-wrapper")
        texts = content_wrapper.find_elements(By.TAG_NAME, "p")
        article_content = ''.join(text.text + ' ' for text in texts)

        # Create a dictionary for single news article
        news_item = {
            'url' : news_url,
            'headline' : headline,
            'datetime' : date,
            'content' : article_content,
            'publisher' : 'Selangor Journal',
            'author' : ''
        }

        # Append to final list
        data.append(news_item)

    except Exception as e:
        # Best-effort scraping: report which article failed and why, then move on
        print(f'Failed to scrape {news_url}: {e!r}')

    finally:
        # Close this article's browser whether scraping succeeded or not
        driver.quit()

print(len(data))

12


### 5.1.7 Publishing Scraped News

In [19]:
# Initialise producer
# To reset the topic before publishing, run:
# kafka-topics.sh --delete --bootstrap-server localhost:9092 --topic CrimeNews
# kafka-topics.sh --create --bootstrap-server localhost:9092 --replication-factor 1 --partitions 1 --topic CrimeNews

# Each news dictionary is serialised to UTF-8 encoded JSON before sending
producer = KafkaProducer(value_serializer=lambda v: json.dumps(v).encode('utf-8'), bootstrap_servers='localhost:9092')

topic = 'CrimeNews'

try:
    # Publish one article every 3 seconds
    for news in data:
        producer.send(topic=topic, value=news)
        time.sleep(3)

    # send() only queues the record; block until everything queued is delivered
    producer.flush()
finally:
    # Release the producer's network resources even if publishing failed
    producer.close()

In [9]:
# Stop the Spark session created at the top of the notebook
spark.stop()