# 1.0 Data Collection (Data Crawling)

###### Author: Gan Yee Jing
###### Last Edited: 25/07/2024

## 1.1 Newsdata io
### 1.1.1 Importing Necessary Libraries and Instantiating the Spark Session

In [1]:
import csv
import os
import pickle
import sys

from newsdataapi import NewsDataApiClient
from pyspark.sql import SparkSession

# The project root must be on sys.path before the local data_stores imports below.
sys.path.append(r'/home/student/RDS2S3G4_CLO2_B')

from data_stores.hdfsClient import HdfsClient
from data_stores.redisClient import RedisClient

# Reuse the active Spark session if one exists, otherwise start a new one
# registered under the application name 'Newsdata io'.
spark = (
    SparkSession.builder
    .appName('Newsdata io')
    .getOrCreate()
)

### 1.1.2 Initialising API Client

In [2]:
# Read the NewsData.io API key from the environment instead of committing it
# to the notebook, where it would leak through version control and sharing.
api_key = os.environ.get('NEWSDATA_API_KEY')
if not api_key:
    raise RuntimeError(
        'NEWSDATA_API_KEY is not set; export your NewsData.io API key '
        'before running this notebook.'
    )

# Client used by the crawling cell to query the news endpoint.
api = NewsDataApiClient(apikey = api_key)

### 1.1.3 Crawling Data

In [3]:
# Upper bound on the number of result pages to request. Each page may hold
# several articles, so the final article count is usually larger than this.
MAX_PAGES = 50

# Accumulates the article dictionaries from every page fetched.
news_list = []

# Pagination cursor: None means "first page"; afterwards it holds the
# `nextPage` token taken from the previous response.
next_page = None

for _ in range(MAX_PAGES):
    # Only send `page` once a cursor is available, so the first request is
    # exactly the plain query.
    page_args = {'page': next_page} if next_page else {}
    response = api.news_api(country = 'us', category = 'crime', language = 'en', **page_args)

    # Articles returned on this page.
    news_list.extend(response['results'])

    # Forward the cursor; without it every request would return the same
    # first page and the list would fill up with duplicates.
    # NOTE(review): assumes the response is a dict exposing `nextPage` - confirm
    # against the installed newsdataapi version.
    next_page = response.get('nextPage')
    if not next_page:
        # No further pages are available for this query.
        break

print(len(news_list))

### 1.1.4 Exporting Data to HDFS

In [4]:
# Union of every field name that appears in at least one article dictionary;
# used later as the column set for the CSV export.
all_keys = set().union(*(article.keys() for article in news_list))

print(all_keys)

In [5]:
# Lifetime of the cached copy in Redis: 30 minutes, expressed in seconds.
CACHE_TTL_SECONDS = 30 * 60

hdfs_client = HdfsClient()
redis_client = RedisClient(host = 'localhost', port = 6379, db = 0, start_now = True)

# Build a Spark DataFrame from the crawled article dictionaries.
df = spark.createDataFrame(news_list)

# Persist the articles to HDFS in CSV format.
hdfs_client.write_file(dataframe = df, file_format = 'csv', destination_path = r'newsdata_io.csv')

# Collect the rows to the driver exactly once and cache the pickled result
# in Redis. The previous version collected twice and discarded the first result.
rows = df.collect()
redis_client.set_key_value('newsdata_io_list', pickle.dumps(rows), seconds = CACHE_TTL_SECONDS)

In [6]:
# Write a local CSV copy of the crawled articles.
# newline='' is required by the csv module so that it controls line endings
# itself (otherwise blank rows can appear on some platforms), and an explicit
# UTF-8 encoding keeps non-ASCII article text from failing on platforms whose
# default encoding is narrower.
with open(r'../data/newsdata_io.csv', 'w', newline = '', encoding = 'utf-8') as csvfile:
    # Sort the field names so the column order is the same on every run;
    # iterating the set directly gives an arbitrary order.
    writer = csv.DictWriter(csvfile, fieldnames = sorted(all_keys))
    writer.writeheader()
    # Articles missing a field get an empty cell (DictWriter's default restval).
    writer.writerows(news_list)

In [7]:
# Release external resources. The Spark session is stopped in `finally` so it
# is shut down even if stopping the Redis service raises.
try:
    redis_client.stop_service()
finally:
    spark.stop()