In [19]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
import json
from pyspark.sql.functions import explode, collect_list, to_timestamp, date_format


In [3]:
# Initialize SparkSession
spark = SparkSession.builder \
    .appName("BBC_Data_Analysis") \
    .getOrCreate()


In [4]:
# Reading dictionary from a JSON file
with open('scraping_news_data.json', 'r') as json_file:
    data = json.load(json_file)


# Flatten the dictionary
flatten_data = []
for key, inner_dict in data.items():
    flatten_data.append(Row( **inner_dict))

# Create a DataFrame
df = spark.createDataFrame(flatten_data)

# Show the DataFrame
df.show()



+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------+--------------------+--------------------+
|                link|            subtitle|                time|               title|              topics|              images|videos|              author|                text|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------+--------------------+--------------------+
|https://www.bbc.c...|                    |                null|China holds milit...|[Asia, China, Tai...|[https://ichef.bb...|    []|Kelly Ng in Singa...|[China has starte...|
|https://www.bbc.c...|                    |                null|Singapore Airline...|[Aviation acciden...|[https://ichef.bb...|    []|  Jacqueline Howard,|[More than 20 peo...|
|https://www.bbc.c...|Dash-cam footage ...|2024-05-23T20:25:...|Golfer Scottie Sc...|                  []|         

In [5]:
df.dtypes

[('link', 'string'),
 ('subtitle', 'string'),
 ('time', 'string'),
 ('title', 'string'),
 ('topics', 'array<string>'),
 ('images', 'array<string>'),
 ('videos', 'array<string>'),
 ('author', 'string'),
 ('text', 'array<string>')]

In [6]:
df = df.withColumn("time", to_timestamp("time"))

In [7]:
df.dtypes

[('link', 'string'),
 ('subtitle', 'string'),
 ('time', 'timestamp'),
 ('title', 'string'),
 ('topics', 'array<string>'),
 ('images', 'array<string>'),
 ('videos', 'array<string>'),
 ('author', 'string'),
 ('text', 'array<string>')]

In [11]:
df.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------+--------------------+--------------------+
|                link|            subtitle|                time|               title|              topics|              images|videos|              author|                text|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------+--------------------+--------------------+
|https://www.bbc.c...|                    |                null|China holds milit...|[Asia, China, Tai...|[https://ichef.bb...|    []|Kelly Ng in Singa...|[China has starte...|
|https://www.bbc.c...|                    |                null|Singapore Airline...|[Aviation acciden...|[https://ichef.bb...|    []|  Jacqueline Howard,|[More than 20 peo...|
|https://www.bbc.c...|Dash-cam footage ...|2024-05-23 20:25:...|Golfer Scottie Sc...|                  []|         

In [20]:
# Filter rows where date is May 23, 2024
df_filtered = df.filter(date_format(df["time"], "yyyy-MM-dd") == "2024-05-23")
df_filtered.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                link|            subtitle|                time|               title|              topics|              images|              videos|              author|                text|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|https://www.bbc.c...|Dash-cam footage ...|2024-05-23 20:25:...|Golfer Scottie Sc...|                  []|                  []|                  []|                    |                  []|
|https://www.bbc.c...|There could be up...|2024-05-23 16:09:...|Hurricane season ...|[Climate, Hurrica...|[https://ichef.bb...|                  []|      Mark Poynting,|[The North Atlant...|
|https://www.bbc.c...|Kenyan President ...|20

In [24]:
# Perform explode on topics column
df_exploded = df.withColumn("topic", explode("topics"))
#Extracting trending topics
trending_topics=df_exploded.groupBy("Topic" ).count()
trending_topics=trending_topics.orderBy('count',ascending=0)
trending_topics.show()

+--------------------+-----+
|               Topic|count|
+--------------------+-----+
|              Africa|    4|
|       United States|    4|
|           Singapore|    3|
|          Air travel|    3|
|           Aerospace|    2|
|              Travel|    2|
|              Europe|    2|
|         Rishi Sunak|    2|
|                 Art|    2|
|             Bangkok|    2|
|                Asia|    2|
|               Music|    2|
|               Kenya|    2|
|London Heathrow A...|    1|
|             Beijing|    1|
|          Hurricanes|    1|
|              Taiwan|    1|
|Aviation accident...|    1|
|             Climate|    1|
|              Russia|    1|
+--------------------+-----+
only showing top 20 rows



In [35]:
#Grouping by "Topic" and aggregating the titles
classified_articles = df_exploded.groupBy("Topic").agg(collect_list("Title").alias("Articles"))
#Adding 'count' column ti classified_articles:
join_df=classified_articles.join(trending_topics,classified_articles.Topic==trending_topics.Topic)
#Ordring by popularity of the topic
join_df=join_df.orderBy('count',ascending=0)
#Showing the result
join_df.show()



+--------------------+--------------------+--------------------+-----+
|               Topic|            Articles|               Topic|count|
+--------------------+--------------------+--------------------+-----+
|              Africa|[US names Kenya a...|              Africa|    4|
|       United States|[US names Kenya a...|       United States|    4|
|           Singapore|[Singapore Airlin...|           Singapore|    3|
|          Air travel|[Singapore Airlin...|          Air travel|    3|
|           Aerospace|[What is it like ...|           Aerospace|    2|
|              Travel|[Singapore Airlin...|              Travel|    2|
|              Europe|[Rosenberg: Russi...|              Europe|    2|
|         Rishi Sunak|[How Rishi Sunak ...|         Rishi Sunak|    2|
|                 Art|[Ace-Liam Ankrah:...|                 Art|    2|
|             Bangkok|[Singapore Airlin...|             Bangkok|    2|
|                Asia|[China holds mili...|                Asia|    2|
|     

Row(Topic='London Heathrow Airport', Articles=['Singapore Airlines: More than twenty treated for spinal injuries after turbulence flight'])

In [27]:
join_df=classified_articles.join(trending_topics,classified_articles.Topic==trending_topics.Topic)

In [28]:
join_df=join_df.orderBy('count',ascending=0)

In [30]:
join_df.show()

+--------------------+--------------------+--------------------+-----+
|               Topic|            Articles|               Topic|count|
+--------------------+--------------------+--------------------+-----+
|              Africa|[US names Kenya a...|              Africa|    4|
|       United States|[US names Kenya a...|       United States|    4|
|           Singapore|[Singapore Airlin...|           Singapore|    3|
|          Air travel|[Singapore Airlin...|          Air travel|    3|
|           Aerospace|[What is it like ...|           Aerospace|    2|
|              Travel|[Singapore Airlin...|              Travel|    2|
|              Europe|[Rosenberg: Russi...|              Europe|    2|
|         Rishi Sunak|[How Rishi Sunak ...|         Rishi Sunak|    2|
|                 Art|[Ace-Liam Ankrah:...|                 Art|    2|
|             Bangkok|[Singapore Airlin...|             Bangkok|    2|
|                Asia|[China holds mili...|                Asia|    2|
|     