# Silver Layer: Cleaning, Normalization and Validation

In [0]:
from pyspark.sql.functions import coalesce, col, explode_outer, expr, to_timestamp

# --- SETUP ---
# Create the target schema for Silver tables if it doesn't already exist.
# IF NOT EXISTS keeps this cell safe to re-run.
print("Ensuring schema 'cve_silver' exists...")
spark.sql("CREATE SCHEMA IF NOT EXISTS cve_silver")
print("Schema setup complete.")

# --- LOAD DATA ---
# Load the Bronze table that was created in the bronze notebook.
# NOTE: this is a lazy table reference and is NOT cached; every downstream
# action (the preview below and each Silver write) re-reads the Bronze table.
bronze_df = spark.table("cve_bronze.records")

# The message previously claimed the data was "cached", which was never true.
print("\nSuccessfully loaded the Bronze data.")
# Preview only a handful of rows; the nested structs make full rows very wide.
display(bronze_df.limit(5))

Ensuring schema 'cve_silver' exists...
Schema setup complete.

Successfully loaded and cached the Bronze data.


dataType,dataVersion,cveMetadata,containers
CVE_RECORD,5.1,"List(CVE-2010-10011, 2024-01-12T19:31:04.066Z, 2025-06-03T14:04:08.764Z)","List(List(Acritum Femitter Server path traversal, List(List(A vulnerability, which was classified as problematic, was found in Acritum Femitter Server 1.04. Affected is an unknown function. The manipulation leads to path traversal. It is possible to launch the attack remotely. The exploit has been disclosed to the public and may be used. VDB-250446 is the identifier assigned to this vulnerability.), List(Es wurde eine Schwachstelle in Acritum Femitter Server 1.04 gefunden. Sie wurde als problematisch eingestuft. Hiervon betroffen ist ein unbekannter Codeblock. Durch das Beeinflussen mit unbekannten Daten kann eine path traversal-Schwachstelle ausgenutzt werden. Der Angriff kann über das Netzwerk angegangen werden. Der Exploit steht zur öffentlichen Verfügung.)), List(List(Acritum, Femitter Server)), List(List(List(4.3, MEDIUM), null), List(null, null), List(null, null))))"
CVE_RECORD,5.1,"List(CVE-2011-10005, 2024-01-16T08:00:05.823Z, 2025-06-02T15:11:59.735Z)","List(List(EasyFTP MKD Command buffer overflow, List(List(A vulnerability, which was classified as critical, was found in EasyFTP 1.7.0.2. Affected is an unknown function of the component MKD Command Handler. The manipulation leads to buffer overflow. It is possible to launch the attack remotely. The exploit has been disclosed to the public and may be used. The identifier of this vulnerability is VDB-250716.), List(Es wurde eine Schwachstelle in EasyFTP 1.7.0.2 gefunden. Sie wurde als kritisch eingestuft. Hiervon betroffen ist ein unbekannter Codeblock der Komponente MKD Command Handler. Durch Manipulation mit unbekannten Daten kann eine buffer overflow-Schwachstelle ausgenutzt werden. Der Angriff kann über das Netzwerk angegangen werden. Der Exploit steht zur öffentlichen Verfügung.)), List(List(n/a, EasyFTP)), List(List(List(6.3, MEDIUM), null), List(null, null), List(null, null))))"
CVE_RECORD,5.1,"List(CVE-2011-10006, 2024-04-08T13:00:05.786Z, 2024-08-07T00:30:46.944Z)","List(List(GamerZ WP-PostRatings wp-postratings.php cross site scripting, List(List(A vulnerability was found in GamerZ WP-PostRatings up to 1.64. It has been classified as problematic. This affects an unknown part of the file wp-postratings.php. The manipulation leads to cross site scripting. It is possible to initiate the attack remotely. Upgrading to version 1.65 is able to address this issue. The identifier of the patch is 6182a5682b12369ced0becd3b505439ce2eb8132. It is recommended to upgrade the affected component. The identifier VDB-259629 was assigned to this vulnerability.), List(Es wurde eine Schwachstelle in GamerZ WP-PostRatings bis 1.64 ausgemacht. Sie wurde als problematisch eingestuft. Betroffen hiervon ist ein unbekannter Ablauf der Datei wp-postratings.php. Durch das Manipulieren mit unbekannten Daten kann eine cross site scripting-Schwachstelle ausgenutzt werden. Umgesetzt werden kann der Angriff über das Netzwerk. Ein Aktualisieren auf die Version 1.65 vermag dieses Problem zu lösen. Der Patch wird als 6182a5682b12369ced0becd3b505439ce2eb8132 bezeichnet. Als bestmögliche Massnahme wird das Einspielen eines Upgrades empfohlen.)), List(List(GamerZ, WP-PostRatings)), List(List(List(3.5, LOW), null), List(null, null), List(null, null))))"
CVE_RECORD,5.1,"List(CVE-2012-10018, 2024-10-16T06:43:33.160Z, 2024-10-16T18:05:36.335Z)","List(List(Mapplic Lite and Mapplic <= (Various Versions) - Server Side Request Forgery to Cross-Site Scirpting, List(List(The Mapplic and Mapplic Lite plugins for WordPress are vulnerable to Server-Side Request Forgery in versions up to, and including 6.1, 1.0 respectively. This makes it possible for attackers to forgery requests coming from a vulnerable site's server and ultimately perform an XSS attack if requesting an SVG file.)), List(List(sekler, Mapplic Lite), List(sekler, Mapplic - Custom Interactive Map WordPress Plugin)), List(List(List(8.3, HIGH), null))))"
CVE_RECORD,5.1,"List(CVE-2012-6664, 2024-06-21T00:00:00, 2024-09-15T19:39:03.375Z)","List(List(null, List(List(Multiple directory traversal vulnerabilities in the TFTP Server in Distinct Intranet Servers 3.10 and earlier allow remote attackers to read or write arbitrary files via a .. (dot dot) in the (1) get or (2) put commands.)), List(List(n/a, n/a)), null))"


In [0]:
# --- 1. Create the Core CVE Table ---

# Path to the CNA metrics array. Each element is a struct; this notebook reads
# its `cvssV4_0` and `cvssV3_1` sub-structs (baseScore / baseSeverity).
METRICS_PATH = "containers.cna.metrics"


def first_cvss_value(field):
    """Return a Column holding the preferred CVSS `field` for each CVE.

    Searches ALL entries of the metrics array instead of only element 0, so a
    record whose first metrics entry has neither version populated still gets
    its score from a later entry. CVSS v4.0 is preferred; v3.1 is the fallback.

    Args:
        field: Name of the field inside the CVSS struct, e.g. "baseScore".

    Returns:
        A Column that is null when no metrics entry carries the field.
    """
    def first_non_null(version):
        # `<array of structs>.<field>` yields an array of that field's values.
        # Drop the nulls and take the first survivor. try_element_at returns
        # NULL (rather than raising) when the filtered array is empty, also
        # under ANSI mode.
        # NOTE(review): try_element_at needs Spark 3.3+ -- confirm the runtime.
        return expr(
            f"try_element_at(filter({METRICS_PATH}.{version}.{field}, v -> v IS NOT NULL), 1)"
        )

    return coalesce(first_non_null("cvssV4_0"), first_non_null("cvssV3_1"))


# Select and flatten the necessary fields from the Bronze DataFrame.
silver_cves_df = bronze_df.select(
    col("cveMetadata.cveId").alias("cve_id"),
    # Standardize the date string to a proper timestamp format
    to_timestamp(col("cveMetadata.datePublished")).alias("date_published"),
    col("containers.cna.title").alias("title"),
    # Extract the first description from the descriptions array
    col("containers.cna.descriptions")[0]["value"].alias("description"),
    # Score and severity use the same search order (v4.0, then v3.1), so they
    # come from the same CVSS version whenever that version supplies both.
    first_cvss_value("baseScore").alias("cvss_score"),
    first_cvss_value("baseSeverity").alias("cvss_severity"),
)

# Write the clean, flattened data to a new Silver table.
# overwrite + overwriteSchema makes the cell re-runnable even if columns change.
(silver_cves_df.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("cve_silver.cves")
)

print("Successfully created the 'cve_silver.cves' table.")
display(spark.table("cve_silver.cves"))

Successfully created the 'cve_silver.cves' table.


cve_id,date_published,title,description,cvss_score,cvss_severity
CVE-2005-10003,2024-10-17T14:00:16.571Z,mikexstudios Xcomic os command injection,A vulnerability classified as critical has been found in mikexstudios Xcomic up to 0.8.2. This affects an unknown part. The manipulation of the argument cmd leads to os command injection. It is possible to initiate the attack remotely. The complexity of an attack is rather high. The exploitability is told to be difficult. The exploit has been disclosed to the public and may be used. Upgrading to version 0.8.3 is able to address this issue. The patch is named 6ed8e3cc336e29f09c7e791863d0559939da98bf. It is recommended to upgrade the affected component.,6.3,MEDIUM
CVE-2010-10011,2024-01-12T19:31:04.066Z,Acritum Femitter Server path traversal,"A vulnerability, which was classified as problematic, was found in Acritum Femitter Server 1.04. Affected is an unknown function. The manipulation leads to path traversal. It is possible to launch the attack remotely. The exploit has been disclosed to the public and may be used. VDB-250446 is the identifier assigned to this vulnerability.",4.3,MEDIUM
CVE-2011-10005,2024-01-16T08:00:05.823Z,EasyFTP MKD Command buffer overflow,"A vulnerability, which was classified as critical, was found in EasyFTP 1.7.0.2. Affected is an unknown function of the component MKD Command Handler. The manipulation leads to buffer overflow. It is possible to launch the attack remotely. The exploit has been disclosed to the public and may be used. The identifier of this vulnerability is VDB-250716.",6.3,MEDIUM
CVE-2011-10006,2024-04-08T13:00:05.786Z,GamerZ WP-PostRatings wp-postratings.php cross site scripting,A vulnerability was found in GamerZ WP-PostRatings up to 1.64. It has been classified as problematic. This affects an unknown part of the file wp-postratings.php. The manipulation leads to cross site scripting. It is possible to initiate the attack remotely. Upgrading to version 1.65 is able to address this issue. The identifier of the patch is 6182a5682b12369ced0becd3b505439ce2eb8132. It is recommended to upgrade the affected component. The identifier VDB-259629 was assigned to this vulnerability.,3.5,LOW
CVE-2012-10018,2024-10-16T06:43:33.160Z,Mapplic Lite and Mapplic <= (Various Versions) - Server Side Request Forgery to Cross-Site Scirpting,"The Mapplic and Mapplic Lite plugins for WordPress are vulnerable to Server-Side Request Forgery in versions up to, and including 6.1, 1.0 respectively. This makes it possible for attackers to forgery requests coming from a vulnerable site's server and ultimately perform an XSS attack if requesting an SVG file.",8.3,HIGH
CVE-2012-6664,2024-06-21T00:00:00.000Z,,Multiple directory traversal vulnerabilities in the TFTP Server in Distinct Intranet Servers 3.10 and earlier allow remote attackers to read or write arbitrary files via a .. (dot dot) in the (1) get or (2) put commands.,,
CVE-2014-125110,2024-03-31T23:31:04.748Z,wp-file-upload Plugin wfu_ajaxactions.php wfu_ajax_action_callback cross site scripting,A vulnerability has been found in wp-file-upload Plugin up to 2.4.3 on WordPress and classified as problematic. Affected by this vulnerability is the function wfu_ajax_action_callback of the file lib/wfu_ajaxactions.php. The manipulation leads to cross site scripting. The attack can be launched remotely. Upgrading to version 2.4.4 is able to address this issue. The identifier of the patch is c846327df030a0a97da036a2f07c769ab9284ddb. It is recommended to upgrade the affected component. The identifier VDB-258781 was assigned to this vulnerability.,3.5,LOW
CVE-2014-125111,2024-04-08T13:00:07.717Z,namithjawahar Wp-Insert cross site scripting,A vulnerability was found in namithjawahar Wp-Insert up to 2.0.8 and classified as problematic. Affected by this issue is some unknown functionality. The manipulation leads to cross site scripting. The attack may be launched remotely. Upgrading to version 2.0.9 is able to address this issue. The name of the patch is a07b7b08084b9b85859f3968ce7fde0fd1fcbba3. It is recommended to upgrade the affected component. The identifier of this vulnerability is VDB-259628.,3.5,LOW
CVE-2014-5470,2024-06-21T00:00:00.000Z,,Actual Analyzer through 2014-08-29 allows code execution via shell metacharacters because untrusted input is used for part of the input data passed to an eval operation.,,
CVE-2015-10123,2024-03-13T08:31:55.341Z,Wago: Buffer Copy without Checking Size of Input in wbm of multiple products,An unautheticated remote attacker could send specifically crafted packets to a affected device. If an authenticated user then views that data in a specific page of the web-based management a buffer overflow will be triggered to gain full access of the device.,8.8,HIGH


In [0]:
# --- 2. Create the Affected Products Table ---

# Build one row per (CVE, affected entry). explode_outer is used so that CVEs
# with a null or empty 'affected' array still appear, with a null entry.
cve_id_col = col("cveMetadata.cveId").alias("cve_id")
affected_entry_col = explode_outer(col("containers.cna.affected")).alias("affected_product")
exploded_affected_df = bronze_df.select(cve_id_col, affected_entry_col)

# Flatten the exploded struct down to the two attributes kept in Silver.
silver_affected_products_df = exploded_affected_df.select(
    "cve_id",
    col("affected_product.vendor").alias("vendor"),
    col("affected_product.product").alias("product"),
)

# Persist the one-to-many table, replacing data and schema from earlier runs.
affected_products_writer = (
    silver_affected_products_df.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
affected_products_writer.saveAsTable("cve_silver.affected_products")

print("Successfully created the 'cve_silver.affected_products' table.")
display(spark.table("cve_silver.affected_products"))

Successfully created the 'cve_silver.affected_products' table.


cve_id,vendor,product
CVE-2005-10003,mikexstudios,Xcomic
CVE-2010-10011,Acritum,Femitter Server
CVE-2011-10005,,EasyFTP
CVE-2011-10006,GamerZ,WP-PostRatings
CVE-2012-10018,sekler,Mapplic Lite
CVE-2012-10018,sekler,Mapplic - Custom Interactive Map WordPress Plugin
CVE-2012-6664,,
CVE-2014-125110,,wp-file-upload Plugin
CVE-2014-125111,namithjawahar,Wp-Insert
CVE-2014-5470,,


In [0]:
# --- 3. Data Quality Checks on the Silver Table ---
print("--- Running Data Quality Checks on the 'cve_silver.cves' Table ---")

# Read the table back so the checks run against what was actually persisted.
final_silver_table = spark.table("cve_silver.cves")
cve_id_is_missing = col("cve_id").isNull()
cve_ids = final_silver_table.select("cve_id")

# Check 1 -- volume: the Silver table must hold at least 30,000 records
# (the same threshold required of the Bronze layer).
total_count = final_silver_table.count()
print(f"Total 2024 CVEs loaded into Silver table: {total_count}")
assert total_count >= 30000, f"DATA QUALITY FAILED: Expected >= 30,000 records, found {total_count}."
print("✅ Quality Check Passed: Record count is above threshold.")

# Check 2 -- completeness: every record must carry a cve_id.
null_id_count = final_silver_table.where(cve_id_is_missing).count()
print(f"Number of records with null cve_id: {null_id_count}")
assert null_id_count == 0, "DATA QUALITY FAILED: Found records with null cve_id."
print("✅ Quality Check Passed: No null CVE IDs found.")

# Check 3 -- uniqueness: the number of distinct ids must equal the row count.
distinct_id_count = cve_ids.distinct().count()
print(f"Total count: {total_count} | Distinct CVE IDs: {distinct_id_count}")
assert total_count == distinct_id_count, "DATA QUALITY FAILED: Found duplicate CVE IDs."
print("✅ Quality Check Passed: All CVE IDs are unique.")

print("\n--- All data quality checks passed! Silver layer is complete. ---")

--- Running Data Quality Checks on the 'cve_silver.cves' Table ---
Total 2024 CVEs loaded into Silver table: 40274
✅ Quality Check Passed: Record count is above threshold.
Number of records with null cve_id: 0
✅ Quality Check Passed: No null CVE IDs found.
Total count: 40274 | Distinct CVE IDs: 40274
✅ Quality Check Passed: All CVE IDs are unique.

--- All data quality checks passed! Silver layer is complete. ---
