In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import GBTRegressionModel
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col
from pyspark.sql.types import *
# ✅ 1. Initialize Spark session
spark = SparkSession.builder \
    .appName("PhishingDetection") \
    .master("local[1]") \
    .config("spark.sql.shuffle.partitions", "8") \
    .getOrCreate()

# 2. Load model
model_path = "/content/drive/MyDrive/School/Scalable/Project/best_model_GBT"
best_model = GBTRegressionModel.load(model_path)

In [2]:
# Selected numerical and indexed categorical features
features = [
    "QUANTITYORDERED",        # numerical
    "PRICEEACH",              # numerical
    "ORDERLINENUMBER",        # numerical
    "SALES",                  # numerical
    "QTR_ID",                 # numerical
    "MONTH_ID",               # numerical
    "YEAR_ID",                # numerical
    "MSRP",                   # numerical
    "STATUS_index",           # categorical (StringIndexed)
    "PRODUCTLINE_index",      # categorical (StringIndexed)
    "DEALSIZE_index"          # categorical (StringIndexed)
]


In [3]:
import csv
import random

with open("test_sales_10000000.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow([
        "QUANTITYORDERED", "PRICEEACH", "ORDERLINENUMBER", "SALES",
        "QTR_ID", "MONTH_ID", "YEAR_ID", "MSRP",
        "STATUS_index", "PRODUCTLINE_index", "DEALSIZE_index"
    ])

    for _ in range(10000000):
        quantity_ordered = random.randint(10, 100)
        price_each = round(random.uniform(50, 120), 2)
        order_line_number = random.randint(1, 10)
        sales = round(quantity_ordered * price_each, 2)
        qtr_id = random.randint(1, 4)
        month_id = random.randint(1, 12)
        year_id = random.choice([2003, 2004, 2005])
        msrp = random.randint(80, 150)
        status_index = round(random.uniform(0, 3), 1)
        productline_index = round(random.uniform(0, 5), 1)
        dealsize_index = round(random.uniform(0, 2), 1)

        writer.writerow([
            quantity_ordered, price_each, order_line_number, sales,
            qtr_id, month_id, year_id, msrp,
            status_index, productline_index, dealsize_index
        ])


In [5]:
import time
from pyspark.ml.feature import VectorAssembler

# Define features
features = [
    "QUANTITYORDERED", "PRICEEACH", "ORDERLINENUMBER", "SALES",
    "QTR_ID", "MONTH_ID", "YEAR_ID", "MSRP",
    "STATUS_index", "PRODUCTLINE_index", "DEALSIZE_index"
]

# ✅ Normal Job
start_normal = time.time()

df_normal = spark.read.csv("test_sales_10000000.csv", header=True, inferSchema=True)
assembler = VectorAssembler(inputCols=features, outputCol="features")
df_assembled = assembler.transform(df_normal)
predictions_normal = best_model.transform(df_assembled)
predictions_normal.select([col for col in predictions_normal.columns if col != 'features']).show(5, truncate=False)

end_normal = time.time()
print(f"⏱️ Normal Job Time: {end_normal - start_normal:.4f} seconds\n")



+---------------+---------+---------------+-------+------+--------+-------+----+------------+-----------------+--------------+------------------+
|QUANTITYORDERED|PRICEEACH|ORDERLINENUMBER|SALES  |QTR_ID|MONTH_ID|YEAR_ID|MSRP|STATUS_index|PRODUCTLINE_index|DEALSIZE_index|prediction        |
+---------------+---------+---------------+-------+------+--------+-------+----+------------+-----------------+--------------+------------------+
|76             |67.98    |6              |5166.48|2     |10      |2005   |109 |1.2         |1.6              |1.0           |244.53674724488337|
|58             |55.17    |1              |3199.86|1     |3       |2004   |115 |0.1         |2.2              |1.4           |242.77825954975057|
|27             |95.52    |8              |2579.04|2     |3       |2005   |133 |1.6         |2.8              |1.8           |244.88885868777874|
|53             |67.75    |6              |3590.75|1     |7       |2004   |114 |1.8         |0.9              |2.0          

In [10]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import GBTRegressionModel
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col
from pyspark.sql.types import *
# ✅ 1. Initialize Spark session
spark = SparkSession.builder.config(
    "spark.sql.shuffle.partitions", "8"  # Match number of cores
).config(
    "spark.default.parallelism", "8"     # RDD default parallelism
).appName("OptimizedPrediction") \
 .master("local[8]") \
 .getOrCreate()




# 2. Load model
model_path = "/content/drive/MyDrive/School/Scalable/Project/best_model_GBT"
best_model = GBTRegressionModel.load(model_path)

In [11]:
from pyspark import StorageLevel
import time

# ✅ Improved Optimized Job
start_opt = time.time()

# Load and persist input
df_opt = spark.read.csv("test_sales_10000000.csv", header=True, inferSchema=True) \
    .persist(StorageLevel.MEMORY_AND_DISK)

# Assemble features without unnecessary repartitioning
df_opt_assembled = assembler.transform(df_opt)

# Predict without extra caching
predictions_opt = best_model.transform(df_opt_assembled)

# Display results
predictions_opt.select([col for col in predictions_opt.columns if col != 'features']).show(5, truncate=False)

end_opt = time.time()
print(f"⚡ Optimized Job Time: {end_opt - start_opt:.4f} seconds")


+---------------+---------+---------------+-------+------+--------+-------+----+------------+-----------------+--------------+------------------+
|QUANTITYORDERED|PRICEEACH|ORDERLINENUMBER|SALES  |QTR_ID|MONTH_ID|YEAR_ID|MSRP|STATUS_index|PRODUCTLINE_index|DEALSIZE_index|prediction        |
+---------------+---------+---------------+-------+------+--------+-------+----+------------+-----------------+--------------+------------------+
|76             |67.98    |6              |5166.48|2     |10      |2005   |109 |1.2         |1.6              |1.0           |244.53674724488337|
|58             |55.17    |1              |3199.86|1     |3       |2004   |115 |0.1         |2.2              |1.4           |242.77825954975057|
|27             |95.52    |8              |2579.04|2     |3       |2005   |133 |1.6         |2.8              |1.8           |244.88885868777874|
|53             |67.75    |6              |3590.75|1     |7       |2004   |114 |1.8         |0.9              |2.0          