In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import GBTClassificationModel
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col
from pyspark.sql.types import *
# ✅ 1. Initialize Spark session
spark = SparkSession.builder \
    .appName("PhishingDetection") \
    .master("local[8]") \
    .config("spark.sql.shuffle.partitions", "8") \
    .getOrCreate()

In [9]:
from pyspark.sql.functions import monotonically_increasing_id, row_number, lit, concat_ws
from pyspark.sql.window import Window
import os

class Model_work:
  def __init__(self,model_path):
    self.model_path = model_path
    self.model = GBTClassificationModel.load(self.model_path)
    self.features = ['url_length', 'n_dots', 'n_hypens', 'n_underline', 'n_slash',
            'n_questionmark', 'n_equal', 'n_at', 'n_and', 'n_exclamation',
            'n_space', 'n_tilde', 'n_comma', 'n_plus', 'n_asterisk', 'n_hastag',
            'n_dollar', 'n_percent', 'n_redirection']
  def model_working(self,path):
    # Extract filename without extension
    file_name = os.path.splitext(os.path.basename(path))[0]  # e.g., "data_1"

    # Load CSV
    df = spark.read.csv(path, header=True, inferSchema=True)

    # Assemble features
    assembler = VectorAssembler(inputCols=self.features, outputCol="features")
    df_assembled = assembler.transform(df)

    # Make predictions
    predictions = self.model.transform(df_assembled)

    # Add row number per file (to create key)
    window_spec = Window.orderBy(monotonically_increasing_id())
    predictions_with_index = predictions.withColumn(
        "row_num", row_number().over(window_spec)
    )

    # Create key column as "file_row1", "file_row2", ...
    predictions_with_key = predictions_with_index.withColumn(
        "key", concat_ws("_", lit(file_name), lit("row"), predictions_with_index.row_num)
    )

    return predictions_with_key.select("key","prediction")
  def stack_and_index_predictions(self,predictions_list):
    # Step 1: Combine all DataFrames into one using union
    combined_df = predictions_list[0]
    for df in predictions_list[1:]:
        combined_df = combined_df.union(df)

    # Step 2: Add sample index starting from 1
    window_spec = Window.orderBy(monotonically_increasing_id())
    combined_df = combined_df.withColumn("sample", row_number().over(window_spec))

    # Step 3: Select and order the result
    return combined_df.select("key","prediction")
  def dirproccessing(self,files,dirpath = '/content/Streaming_input'):
   predictions = []
   for file in files:
    predictions.append(self.model_working(f'{dirpath}/{file}'))
   return predictions
  def model_fullywork(self,path,files):
   predictions = self.dirproccessing(files,path)
   combined_df = self.stack_and_index_predictions(predictions)
   combined_df.show()
   return combined_df

In [3]:
with open("test_phishing.csv", "w") as f:
    f.write("""url_length,n_dots,n_hypens,n_underline,n_slash,n_questionmark,n_equal,n_at,n_and,n_exclamation,n_space,n_tilde,n_comma,n_plus,n_asterisk,n_hastag,n_dollar,n_percent,n_redirection
75,3,1,0,5,1,2,0,0,0,0,0,0,0,0,0,0,0,1
45,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
98,4,2,1,6,1,3,1,1,1,0,1,0,0,1,0,0,0,2""")


In [10]:
model = Model_work("/content/drive/MyDrive/School/Scalable/best_phishing_model")

In [12]:
path = f"/content/Streaming_input"
files = os.listdir(path)
combined_df = model.model_fullywork(path,files)

+------------+----------+
|         key|prediction|
+------------+----------+
|data_1_row_1|       1.0|
|data_1_row_2|       0.0|
|data_5_row_1|       1.0|
|data_5_row_2|       1.0|
|data_2_row_1|       1.0|
|data_2_row_2|       1.0|
|data_4_row_1|       1.0|
|data_4_row_2|       1.0|
|data_3_row_1|       1.0|
|data_3_row_2|       1.0|
+------------+----------+



In [5]:
prediction = model.model_working("/content/test_phishing.csv")

In [17]:
prediction.show()

+-------------------+----------+
|                key|prediction|
+-------------------+----------+
|test_phishing_row_1|       1.0|
|test_phishing_row_2|       1.0|
|test_phishing_row_3|       1.0|
+-------------------+----------+



In [8]:
import os
import time
import random
import threading

def generate_csv_files(output_dir="/content/Streaming_input",count = 0):
    os.makedirs(output_dir, exist_ok=True)
    filename = f"{output_dir}/data_{count}.csv"
    features = ['url_length', 'n_dots', 'n_hypens', 'n_underline', 'n_slash',
            'n_questionmark', 'n_equal', 'n_at', 'n_and', 'n_exclamation',
            'n_space', 'n_tilde', 'n_comma', 'n_plus', 'n_asterisk', 'n_hastag',
            'n_dollar', 'n_percent', 'n_redirection']
    with open(filename, "w") as f:
            # Write header
            f.write(",".join(features) + "\n")

            # Write 3 rows of random values
            for _ in range(2):
                values = [str(random.randint(0, 10)) for _ in features]
                f.write(",".join(values) + "\n")


def thread_generatefile(interval_Time = 15,num_generate = 20):
  for i in range(num_generate):
    generate_csv_files(count = i+1)
    time.sleep(interval_Time)

thread_generatefile(5,5)

In [15]:
class Streamworking:
  def __init__(self,model_path):
    self.model =  Model_work(model_path)
  def sort_files_by(self,files, base_path="", key_func=None, reverse=True):
    """
    Sort a list of files using a custom criterion.

    Args:
        files (list): List of filenames (not full paths).
        base_path (str): Base path to prefix filenames, if needed.
        key_func (callable): Function that takes a full file path and returns a sort key.
        reverse (bool): Sort descending (True) or ascending (False).

    Returns:
        List of filenames sorted by the key.
    """
    if key_func is None:
        # Default: sort by modification time
        key_func = lambda f: os.path.getmtime(os.path.join(base_path, f))

    return sorted(files, key=lambda f: key_func(os.path.join(base_path, f)), reverse=reverse)
  def stream_dir_work(self,path,interval = 5):
    processed_files = []
    while True:
      try:
        fullfiles = os.listdir(path)
        work_files = [file for file in fullfiles if file not in processed_files]
        if work_files == []:
            print('No new files')
            time.sleep(interval)
            continue
        work_files = self.sort_files_by(work_files,path)
        print(f'Detect new {len(work_files)} files')
        self.model.model_fullywork(path,work_files)
        processed_files.extend(work_files)
        time.sleep(interval)
        continue
      except KeyboardInterrupt:
        print("\n🛑 Stream processing stopped by user.")
        break

In [16]:
stream = Streamworking("/content/drive/MyDrive/School/Scalable/best_phishing_model")
stream.stream_dir_work("/content/Streaming_input")

Detect new 5 files
+------------+----------+
|         key|prediction|
+------------+----------+
|data_5_row_1|       1.0|
|data_5_row_2|       1.0|
|data_4_row_1|       1.0|
|data_4_row_2|       1.0|
|data_3_row_1|       1.0|
|data_3_row_2|       1.0|
|data_2_row_1|       1.0|
|data_2_row_2|       1.0|
|data_1_row_1|       1.0|
|data_1_row_2|       0.0|
+------------+----------+

No new files
No new files

🛑 Stream processing stopped by user.


In [22]:
from IPython.display import display, HTML, Audio

def alert_user(prediction_value: int):
    if prediction_value == 1:
        display(HTML("<h2 style='color:red;'>🚨 PHISHING ALERT DETECTED!</h2>"))
    else:
        display(HTML("<h3 style='color:green;'>✅ No phishing detected.</h3>"))
alert_user(1)