In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
import json
from pyspark.sql.functions import explode, collect_list, to_timestamp, date_format
from pyspark.sql.types import StructType, StructField, StringType, ArrayType


In [2]:
# Using zipfile module in a Jupyter notebook
#import zipfile
#import os

# Path to the zip file
#zip_file_path = '2024_data.zip'

# Path to extract
#extraction_path = ''


# Extract the zip file
#with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
#    zip_ref.extractall(extraction_path)

#print(f"Extracted all files to {extraction_path}")



In [3]:
# Create the Spark session used by every later cell (getOrCreate returns
# an already-running session if one exists).
spark = (
    SparkSession.builder
    .appName("BBC_Data_Analysis")
    .getOrCreate()
)


In [4]:
# Load the scraped articles: the JSON file holds a dictionary whose values
# are per-article dictionaries.
# NOTE(review): open() uses the platform default text encoding — confirm the
# file is ASCII/UTF-8 if this notebook is run on another machine.
with open('scraping_news_data.json', 'r') as json_file:
    data = json.load(json_file)

# One Row per article; the outer dictionary keys are not kept.
flatten_data = [Row(**inner_dict) for inner_dict in data.values()]

# No schema is passed here, so Spark infers the column types from the rows.
df = spark.createDataFrame(flatten_data)

# Preview the first rows.
df.show()



+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------+--------------------+--------------------+
|                link|            subtitle|                time|               title|              topics|              images|videos|              author|                text|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------+--------------------+--------------------+
|https://www.bbc.c...|                    |                null|China holds milit...|[Asia, China, Tai...|[https://ichef.bb...|    []|Kelly Ng in Singa...|[China has starte...|
|https://www.bbc.c...|                    |                null|Singapore Airline...|[Aviation acciden...|[https://ichef.bb...|    []|  Jacqueline Howard,|[More than 20 peo...|
|https://www.bbc.c...|Dash-cam footage ...|2024-05-23T20:25:...|Golfer Scottie Sc...|                  []|         

In [5]:
# Explicit schema for the article records: scalar fields are strings and
# multi-valued fields are arrays of strings; every field is nullable.
# Field order matters because later cells union DataFrames by position.
schema = (
    StructType()
    .add("link", StringType(), True)
    .add("subtitle", StringType(), True)
    .add("time", StringType(), True)
    .add("title", StringType(), True)
    .add("topics", ArrayType(StringType()), True)
    .add("images", ArrayType(StringType()), True)
    .add("videos", ArrayType(StringType()), True)
    .add("author", StringType(), True)
    .add("text", ArrayType(StringType()), True)
)

In [6]:
# Number of articles loaded so far (only scraping_news_data.json at this point).
df.count()

39

In [7]:
# Append the paginated per-category article files to `df`.
#
# Every page is read first and its articles gathered into one list, so only a
# single DataFrame is created and a single union is performed. Building one
# DataFrame and one union per page makes the query plan grow with the number
# of files, and a missing file would leave `df` partially extended.
#
# NOTE: this cell rebinds `df`, so running it twice appends the same rows
# twice — restart the kernel and run all cells for a clean count.
# NOTE(review): 'buisness' is kept exactly as written because it is part of
# the file names being opened — confirm against the files on disk before
# correcting the spelling.
categories = ['earth', 'buisness', 'innovation', 'travel']

category_rows = []
for category in categories:
    for page in range(1, 12):  # pages 1..11 of each category
        page_path = category + '_data_page_' + str(page) + '.json'
        with open(page_path, 'r') as json_file:
            page_data = json.load(json_file)

        # Each value is one article dictionary; the outer keys are not kept.
        category_rows.extend(Row(**article) for article in page_data.values())

# Single DataFrame for all pages, typed with the explicit schema, then one union.
category_df = spark.createDataFrame(category_rows, schema)
df = df.union(category_df)

# Preview the combined data and report the new row count.
df.show()
df.count()


+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------+--------------------+--------------------+
|                link|            subtitle|                time|               title|              topics|              images|videos|              author|                text|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------+--------------------+--------------------+
|https://www.bbc.c...|                    |                null|China holds milit...|[Asia, China, Tai...|[https://ichef.bb...|    []|Kelly Ng in Singa...|[China has starte...|
|https://www.bbc.c...|                    |                null|Singapore Airline...|[Aviation acciden...|[https://ichef.bb...|    []|  Jacqueline Howard,|[More than 20 peo...|
|https://www.bbc.c...|Dash-cam footage ...|2024-05-23T20:25:...|Golfer Scottie Sc...|                  []|         

435

In [8]:
# Append the paginated "culture" article files to `df`.
#
# All pages are read first and their articles gathered into one list, so only
# a single DataFrame is created and a single union is performed (instead of
# one DataFrame and one union per page). A missing file now fails before `df`
# is modified.
#
# NOTE: this cell rebinds `df`, so running it twice appends the same rows
# twice — restart the kernel and run all cells for a clean count.
categories = ['culture']

culture_rows = []
for category in categories:
    for page in range(1, 11):  # pages 1..10
        page_path = category + '_data_page_' + str(page) + '.json'
        with open(page_path, 'r') as json_file:
            page_data = json.load(json_file)

        # Each value is one article dictionary; the outer keys are not kept.
        culture_rows.extend(Row(**article) for article in page_data.values())

# Single DataFrame for all pages, typed with the explicit schema, then one union.
culture_df = spark.createDataFrame(culture_rows, schema)
df = df.union(culture_df)

# Preview the combined data and report the new row count.
df.show()
df.count()


+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------+--------------------+--------------------+
|                link|            subtitle|                time|               title|              topics|              images|videos|              author|                text|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------+--------------------+--------------------+
|https://www.bbc.c...|                    |                null|China holds milit...|[Asia, China, Tai...|[https://ichef.bb...|    []|Kelly Ng in Singa...|[China has starte...|
|https://www.bbc.c...|                    |                null|Singapore Airline...|[Aviation acciden...|[https://ichef.bb...|    []|  Jacqueline Howard,|[More than 20 peo...|
|https://www.bbc.c...|Dash-cam footage ...|2024-05-23T20:25:...|Golfer Scottie Sc...|                  []|         

525

In [9]:
# Append the paginated "2024" article files to `df`.
#
# All 219 pages are read first and their articles gathered into one list, so
# only a single DataFrame is created and a single union is performed. Chaining
# one union per page would stack hundreds of unions into the query plan, and a
# missing file would leave `df` partially extended.
#
# NOTE: this cell rebinds `df`, so running it twice appends the same rows
# twice — restart the kernel and run all cells for a clean count.
categories = ['2024']

rows_2024 = []
for category in categories:
    for page in range(1, 220):  # pages 1..219
        page_path = category + '_data_page_' + str(page) + '.json'
        with open(page_path, 'r') as json_file:
            page_data = json.load(json_file)

        # Each value is one article dictionary; the outer keys are not kept.
        rows_2024.extend(Row(**article) for article in page_data.values())

# Single DataFrame for all pages, typed with the explicit schema, then one union.
df_2024 = spark.createDataFrame(rows_2024, schema)
df = df.union(df_2024)

# Preview the combined data and report the new row count.
df.show()
df.count()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------+--------------------+--------------------+
|                link|            subtitle|                time|               title|              topics|              images|videos|              author|                text|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------+--------------------+--------------------+
|https://www.bbc.c...|                    |                null|China holds milit...|[Asia, China, Tai...|[https://ichef.bb...|    []|Kelly Ng in Singa...|[China has starte...|
|https://www.bbc.c...|                    |                null|Singapore Airline...|[Aviation acciden...|[https://ichef.bb...|    []|  Jacqueline Howard,|[More than 20 peo...|
|https://www.bbc.c...|Dash-cam footage ...|2024-05-23T20:25:...|Golfer Scottie Sc...|                  []|         

2368

In [10]:
# Inspect the column types of the combined DataFrame before converting `time`.
df.dtypes

[('link', 'string'),
 ('subtitle', 'string'),
 ('time', 'string'),
 ('title', 'string'),
 ('topics', 'array<string>'),
 ('images', 'array<string>'),
 ('videos', 'array<string>'),
 ('author', 'string'),
 ('text', 'array<string>')]

In [11]:
# Replace the string `time` column with a timestamp column. No explicit format
# is given, so Spark's default timestamp parsing is used.
df = df.withColumn("time", to_timestamp(df["time"]))

In [12]:
# Re-check the column types after the `time` conversion.
df.dtypes

[('link', 'string'),
 ('subtitle', 'string'),
 ('time', 'timestamp'),
 ('title', 'string'),
 ('topics', 'array<string>'),
 ('images', 'array<string>'),
 ('videos', 'array<string>'),
 ('author', 'string'),
 ('text', 'array<string>')]

In [13]:
# Preview the rows after the `time` conversion.
df.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------+--------------------+--------------------+
|                link|            subtitle|                time|               title|              topics|              images|videos|              author|                text|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------+--------------------+--------------------+
|https://www.bbc.c...|                    |                null|China holds milit...|[Asia, China, Tai...|[https://ichef.bb...|    []|Kelly Ng in Singa...|[China has starte...|
|https://www.bbc.c...|                    |                null|Singapore Airline...|[Aviation acciden...|[https://ichef.bb...|    []|  Jacqueline Howard,|[More than 20 peo...|
|https://www.bbc.c...|Dash-cam footage ...|2024-05-23 20:25:...|Golfer Scottie Sc...|                  []|         

In [40]:
# Keep only the articles whose timestamp falls on 23 May 2024: the timestamp
# is rendered as a yyyy-MM-dd string and compared with the target day.
target_day = "2024-05-23"
published_day = date_format(df["time"], "yyyy-MM-dd")
df_filtered = df.where(published_day == target_day)
df_filtered.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                link|            subtitle|                time|               title|              topics|              images|              videos|              author|                text|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|https://www.bbc.c...|Dash-cam footage ...|2024-05-23 20:25:...|Golfer Scottie Sc...|                  []|                  []|                  []|                    |                  []|
|https://www.bbc.c...|There could be up...|2024-05-23 16:09:...|Hurricane season ...|[Climate, Hurrica...|[https://ichef.bb...|                  []|      Mark Poynting,|[The North Atlant...|
|https://www.bbc.c...|Kenyan President ...|20

In [41]:
# Explode `topics` so that each (article, topic) pair becomes its own row,
# stored in a new `topic` column.
df_exploded = df.withColumn("topic", explode(df["topics"]))

# Trending topics: number of rows per topic, most frequent first.
trending_topics = (
    df_exploded
    .groupBy("topic")
    .count()
    .orderBy("count", ascending=False)
)
trending_topics.show()

+--------------------+-----+
|               topic|count|
+--------------------+-----+
|England and Wales...|  122|
|General election ...|  116|
|               Music|   55|
|       United States|   51|
|          Air travel|   50|
|           Transport|   50|
|               Wales|   45|
|         Environment|   35|
|         Photography|   34|
|Wildlife photography|   34|
|               India|   30|
|         Rishi Sunak|   30|
|            Features|   29|
|             Bristol|   29|
|                Asia|   27|
|           Berkshire|   26|
|        Labour Party|   26|
|              London|   25|
|              Nature|   24|
|    Northern Ireland|   23|
+--------------------+-----+
only showing top 20 rows



In [42]:
# Group by topic and collect the titles of the articles filed under it.
# The column is referenced as "title", matching the casing used in the schema.
classified_articles = df_exploded.groupBy("topic").agg(
    collect_list("title").alias("Articles")
)

# Add each topic's article count from `trending_topics`, then order by
# popularity. Both DataFrames are derived from `df_exploded`, so the join is
# done on the column *name*: this avoids an ambiguous
# `classified_articles.topic == trending_topics.topic` self-join condition and
# yields a single `topic` column in the result.
join_df = (
    classified_articles
    .join(trending_topics, on="topic")
    .select("topic", "Articles", "count")
    .orderBy("count", ascending=False)
)

# Show the result.
join_df.show()



+--------------------+--------------------+-----+
|               topic|            Articles|count|
+--------------------+--------------------+-----+
|England and Wales...|[North Yorkshire ...|  122|
|General election ...|[General election...|  116|
|               Music|[Billie Eilish an...|   55|
|       United States|[Elon Musk says h...|   51|
|          Air travel|[Ryanair sees ris...|   50|
|           Transport|[Aircraft turbule...|   50|
|               Wales|[Rare monkeys: Fo...|   45|
|         Environment|[Straw bale Ely C...|   35|
|         Photography|[Dorset's Big Pic...|   34|
|Wildlife photography|[Dorset's Big Pic...|   34|
|               India|[Assam: The 'doub...|   30|
|         Rishi Sunak|[Sunak says it'll...|   30|
|            Features|[Some Parisians s...|   29|
|             Bristol|[Geoff Kitchen: B...|   29|
|                Asia|[World's most exp...|   27|
|           Berkshire|[Berkshire's Big ...|   26|
|        Labour Party|[Rachel Reeves pr...|   26|


In [27]:
#different_articles=join_df.distinct().count()
