In [1]:
import os
import math

import altair as alt
import netaddr
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


import plotly.express as px
import plotly.graph_objects as go

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.types import ShortType

In [2]:
# Location of the threat-detection log CSV.
# Raw string: the Windows backslashes must stay literal. In a normal string
# "\D", "\C" and "\c" are invalid escape sequences, which Python warns about
# and will eventually reject.
df_path = r"F:\Datasets\CSV datasets\cybersecurity_threat_detection_logs.csv"

In [3]:
# Environment variables PySpark consults: the Spark installation directory
# and the Python executables to use. Set in one place, before the session
# is built in the next cell.
os.environ.update({
    'SPARK_HOME': r'C:\spark\spark-3.5.5-bin-hadoop3',
    'PYSPARK_DRIVER_PYTHON': 'jupyter',
    'PYSPARK_DRIVER_PYTHON_OPTS': 'lab',
    'PYSPARK_PYTHON': 'python',
})

In [4]:
# Build (or reuse) the local Spark session for this notebook.
# The application name is what appears in the Spark UI and logs, so it names
# the dataset analysed here rather than an unrelated project.
spark = (
    SparkSession.builder
    .appName('Cybersecurity threat detection logs - Optimized Local')
    .master('local[*]')
    # Driver sizing (machine-specific values).
    .config("spark.driver.memory", "60g")
    .config("spark.driver.maxResultSize", "4g")
    # Adaptive query execution and partition sizing.
    .config('spark.sql.adaptive.enabled', 'true')
    .config('spark.sql.adaptive.coalescePartitions.enabled', 'true')
    .config('spark.sql.adaptive.advisoryPartitionSizeInBytes', '128mb')
    .config('spark.sql.shuffle.partitions', '100')
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config('spark.sql.autoBroadcastJoinThreshold', '256mb')
    # NOTE: getOrCreate() returns an already-running session if one exists;
    # restart the kernel to be sure the settings above are the ones in effect.
    .getOrCreate()
)

# Echo the driver memory setting as a quick sanity check of the configuration.
print(f"SparkSession configured with Driver Memory: {spark.conf.get('spark.driver.memory')}")

SparkSession configured with Driver Memory: 60g


In [5]:
# Read the log CSV: the first row supplies the column names and Spark infers
# each column's type from the data.
df = spark.read.options(header="true", inferSchema="true").csv(df_path)

In [7]:
# Preview the first 20 rows with full (untruncated) cell contents.
df.show(n=20, truncate=False)

+-------------------+--------------+-------------+--------+-------+------------+-----------+-----------------+---------------------------------------------------------------------------------------------------------------------+-------------+
|timestamp          |source_ip     |dest_ip      |protocol|action |threat_label|log_type   |bytes_transferred|user_agent                                                                                                           |request_path |
+-------------------+--------------+-------------+--------+-------+------------+-----------+-----------------+---------------------------------------------------------------------------------------------------------------------+-------------+
|2024-05-01 00:00:00|192.168.1.125 |192.168.1.124|TCP     |blocked|benign      |firewall   |10889            |Nmap Scripting Engine                                                                                                |/            |
|2024-07-18 00:00:00|192.168

In [8]:
# Print the column names, types and nullability of the loaded DataFrame.
df.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- source_ip: string (nullable = true)
 |-- dest_ip: string (nullable = true)
 |-- protocol: string (nullable = true)
 |-- action: string (nullable = true)
 |-- threat_label: string (nullable = true)
 |-- log_type: string (nullable = true)
 |-- bytes_transferred: integer (nullable = true)
 |-- user_agent: string (nullable = true)
 |-- request_path: string (nullable = true)



In [9]:
# The declared return type must match what the function returns: a list of
# strings. It is given as a DDL type string so no extra type imports are needed.
@F.udf(returnType='array<string>')
def get_ip_address_type(ip_address):
    """Classify an IP address into one or more category labels.

    Args:
        ip_address: The address to classify, in any form accepted by
            ``netaddr.IPAddress``.

    Returns:
        A list of strings. For a parseable address it holds every matching
        label, in this order: 'Private', 'Reserved', 'Loopback', 'Multicast',
        'Link Local', 'Global'; or ['Unassigned'] when none match.
        ['Invalid Format'] is returned when netaddr rejects the address, and
        ['Unknown Error: <message>'] for any other failure, so one bad row
        never aborts the whole Spark job.
    """
    # (netaddr predicate method, label) pairs; the order fixes the label order.
    checks = (
        ('is_ipv4_private_use', 'Private'),
        ('is_reserved', 'Reserved'),
        ('is_loopback', 'Loopback'),
        ('is_multicast', 'Multicast'),
        ('is_link_local', 'Link Local'),
        ('is_global', 'Global'),
    )

    try:
        ip_obj = netaddr.IPAddress(ip_address)

        ip_types = [
            label
            for predicate, label in checks
            if getattr(ip_obj, predicate)()
        ]

        # Fall back to a single marker label when no category applies.
        return ip_types or ['Unassigned']

    except netaddr.AddrFormatError:
        return ['Invalid Format']
    except Exception as e:
        # Best-effort: report the failure as a label instead of failing the task.
        return [f'Unknown Error: {e}']