In [1]:
import os
import math

import altair as alt
import netaddr
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


import plotly.express as px
import plotly.graph_objects as go

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.types import ShortType

In [2]:
df_path = "F:\Datasets\CSV datasets\cybersecurity_threat_detection_logs.csv"

In [3]:
os.environ['SPARK_HOME'] = r'C:\spark\spark-3.5.5-bin-hadoop3'
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
os.environ['PYSPARK_PYTHON'] = 'python'

In [4]:
spark = (
    SparkSession.builder
    .appName('MOMA art collection - Optimized Local')
    .master('local[*]')
    .config("spark.driver.memory", "60g")
    .config("spark.driver.maxResultSize", "4g")
    .config('spark.sql.adaptive.enabled', 'true')
    .config('spark.sql.adaptive.coalescePartitions.enabled', 'true')
    .config('spark.sql.adaptive.advisoryPartitionSizeInBytes', '128mb')
    .config('spark.sql.shuffle.partitions', '100')
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config('spark.sql.autoBroadcastJoinThreshold', '256mb')
    .getOrCreate()
)

print(f"SparkSession configured with Driver Memory: {spark.conf.get('spark.driver.memory')}")

SparkSession configured with Driver Memory: 60g


In [5]:
df = spark.read.option(
    "header", "true"
).option(
    "inferSchema", "true"
).csv(df_path)

In [6]:
df.show(truncate=False)

+-------------------+--------------+-------------+--------+-------+------------+-----------+-----------------+---------------------------------------------------------------------------------------------------------------------+-------------+
|timestamp          |source_ip     |dest_ip      |protocol|action |threat_label|log_type   |bytes_transferred|user_agent                                                                                                           |request_path |
+-------------------+--------------+-------------+--------+-------+------------+-----------+-----------------+---------------------------------------------------------------------------------------------------------------------+-------------+
|2024-05-01 00:00:00|192.168.1.125 |192.168.1.124|TCP     |blocked|benign      |firewall   |10889            |Nmap Scripting Engine                                                                                                |/            |
|2024-07-18 00:00:00|192.168

In [7]:
df.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- source_ip: string (nullable = true)
 |-- dest_ip: string (nullable = true)
 |-- protocol: string (nullable = true)
 |-- action: string (nullable = true)
 |-- threat_label: string (nullable = true)
 |-- log_type: string (nullable = true)
 |-- bytes_transferred: integer (nullable = true)
 |-- user_agent: string (nullable = true)
 |-- request_path: string (nullable = true)



In [8]:
from pyspark.sql.types import ArrayType, StringType
import pyspark.sql.functions as F
# Ensure 'netaddr' is installed on all worker nodes if you rely on a global import.
# If not, import it inside the UDF.

@F.udf(returnType=ArrayType(StringType()))
def get_ip_address_type(ip_address):
    import netaddr # Import inside UDF is safer unless worker env is guaranteed

    if ip_address is None:
        return None

    try:
        ip_obj = netaddr.IPAddress(ip_address)
        ip_types = []

        if ip_obj.is_ipv4_private_use():
             ip_types.append('Private')
        if ip_obj.is_reserved(): ip_types.append('Reserved')
        if ip_obj.is_loopback(): ip_types.append('Loopback')
        if ip_obj.is_multicast(): ip_types.append('Multicast')
        if ip_obj.is_link_local(): ip_types.append('Link Local')
        if ip_obj.is_global(): ip_types.append('Global')

        if not ip_types:
            ip_types.append('Other Valid')

        return ip_types

    except netaddr.AddrFormatError: # This means netaddr was imported successfully
        return ['Invalid Format']
    except Exception as e:
        # THIS WILL NOW SHOW YOU THE REAL ERROR
        return [f'Actual Error: {str(e)}']

In [9]:
df.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- source_ip: string (nullable = true)
 |-- dest_ip: string (nullable = true)
 |-- protocol: string (nullable = true)
 |-- action: string (nullable = true)
 |-- threat_label: string (nullable = true)
 |-- log_type: string (nullable = true)
 |-- bytes_transferred: integer (nullable = true)
 |-- user_agent: string (nullable = true)
 |-- request_path: string (nullable = true)



In [10]:
df = df.withColumn(
    'source_ip_type',
    get_ip_address_type(F.col('source_ip'))
)

In [11]:
df = df.withColumn(
    'dst_ip_type',
    get_ip_address_type(F.col('dest_ip'))
)

In [12]:
df.select(F.max('timestamp').alias('Min')).show(), df.select(F.min('timestamp').alias('Max')).show()

+-------------------+
|                Min|
+-------------------+
|2024-12-30 00:00:00|
+-------------------+

+-------------------+
|                Max|
+-------------------+
|2024-01-01 00:00:00|
+-------------------+



(None, None)

In [13]:
df.select(
    'source_ip'
).distinct().count()

354

In [14]:
df = df.withColumn(
    'timestamp',
    F.to_date(F.col('timestamp'))
)

In [15]:
df.show(truncate=False)

+----------+--------------+-------------+--------+-------+------------+-----------+-----------------+---------------------------------------------------------------------------------------------------------------------+-------------+--------------+-----------+
|timestamp |source_ip     |dest_ip      |protocol|action |threat_label|log_type   |bytes_transferred|user_agent                                                                                                           |request_path |source_ip_type|dst_ip_type|
+----------+--------------+-------------+--------+-------+------------+-----------+-----------------+---------------------------------------------------------------------------------------------------------------------+-------------+--------------+-----------+
|2024-05-01|192.168.1.125 |192.168.1.124|TCP     |blocked|benign      |firewall   |10889            |Nmap Scripting Engine                                                                                               

In [16]:
df.groupBy(
    F.col('timestamp')
).count().orderBy(
    F.reverse('count')
).show()

+----------+-----+
| timestamp|count|
+----------+-----+
|2024-01-14|16400|
|2024-05-09|16600|
|2024-02-15|16310|
|2024-01-20|16410|
|2024-03-15|16510|
|2024-10-22|16610|
|2024-10-27|16610|
|2024-03-17|16220|
|2024-12-05|16520|
|2024-09-05|16520|
|2024-06-08|16430|
|2024-11-12|16430|
|2024-01-01|16630|
|2024-08-31|16630|
|2024-06-28|16240|
|2024-07-09|16340|
|2024-09-06|16440|
|2024-08-25|16640|
|2024-12-20|16050|
|2024-09-12|16450|
+----------+-----+
only showing top 20 rows



In [25]:
df.groupBy(
    F.col('source_ip')
).count().orderBy(F.col('count').desc()).show()

+---------------+-----+
|      source_ip|count|
+---------------+-----+
|   59.211.9.207|18295|
|109.106.120.222|18273|
|    88.72.40.56|18252|
| 185.225.185.68|18239|
| 122.63.201.122|18229|
| 229.140.23.152|18203|
|  44.137.187.63|18202|
|114.207.221.220|18193|
|  61.72.172.125|18193|
| 166.19.156.163|18178|
|     109.9.8.24|18176|
|  25.169.111.91|18173|
|  55.139.34.186|18164|
|   13.221.29.40|18146|
| 187.14.173.168|18137|
|  208.223.2.195|18133|
|  19.182.162.16|18126|
| 103.172.167.96|18126|
|   207.79.62.15|18105|
| 144.138.68.192|18105|
+---------------+-----+
only showing top 20 rows

