In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType
from pyspark.sql.functions import col, to_date, dayofmonth, dayofweek, month, when, lit, isnull, count
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler, Imputer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from datetime import datetime
import time

# Initialize Spark Session with optimized configurations
spark = SparkSession.builder \
    .appName("FraudDetection") \
    .config("spark.sql.debug.maxToStringFields", "100") \
    .config("spark.sql.autoBroadcastJoinThreshold", "-1") \
    .getOrCreate()

# Define schema
schema = StructType([
    StructField("Transaction ID", StringType(), True),
    StructField("Customer ID", StringType(), True),
    StructField("Transaction Amount", FloatType(), True),
    StructField("Transaction Date", StringType(), True),
    StructField("Payment Method", StringType(), True),
    StructField("Product Category", StringType(), True),
    StructField("Quantity", IntegerType(), True),
    StructField("Customer Age", IntegerType(), True),
    StructField("Customer Location", StringType(), True),
    StructField("Device Used", StringType(), True),
    StructField("IP Address", StringType(), True),
    StructField("Shipping Address", StringType(), True),
    StructField("Billing Address", StringType(), True),
    StructField("Is Fraudulent", IntegerType(), True),
    StructField("Account Age Days", IntegerType(), True),
    StructField("Transaction Hour", IntegerType(), True)
])

# Load data from local file with error handling
try:
    local_path = "Fraudulent_E-Commerce_Transaction_Data.csv"
    
    df = spark.read.option("header", "true") \
                   .option("multiLine", "true") \
                   .schema(schema) \
                   .csv(local_path)
    
    print("Data loaded successfully. Row count:", df.count())
except Exception as e:
    print(f"Error loading data: {str(e)}")
    spark.stop()
    exit()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/03 18:50:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[Stage 0:>                                                          (0 + 1) / 1]

Data loaded successfully. Row count: 1472952


                                                                                

In [2]:
# Data Quality Check
print("\nData Quality Check:")
print("Null values per column:")
null_counts = df.select([count(when(isnull(c), c)).alias(c) for c in df.columns])
null_counts.show(vertical=True)


Data Quality Check:
Null values per column:


                                                                                

-RECORD 0-----------------
 Transaction ID     | 0   
 Customer ID        | 0   
 Transaction Amount | 0   
 Transaction Date   | 0   
 Payment Method     | 0   
 Product Category   | 0   
 Quantity           | 0   
 Customer Age       | 0   
 Customer Location  | 0   
 Device Used        | 0   
 IP Address         | 0   
 Shipping Address   | 0   
 Billing Address    | 0   
 Is Fraudulent      | 0   
 Account Age Days   | 0   
 Transaction Hour   | 0   



In [3]:
# Data Preprocessing
print("\nStarting Data Preprocessing...")
start_time = time.time()
from pyspark.sql.functions import col, to_date, dayofmonth, dayofweek, month, when

# Convert Transaction Date and extract features
df = df.withColumn("Transaction Date", to_date(col("Transaction Date")))
df = df.withColumn("DayOfMonth", dayofmonth(col("Transaction Date")))
df = df.withColumn("DayOfWeek", dayofweek(col("Transaction Date")))
df = df.withColumn("Month", month(col("Transaction Date")))

# Create feature: is shipping different from billing
df = df.withColumn("AddressMismatch", 
                  when(col("Shipping Address") != col("Billing Address"), 1).otherwise(0))

# Create feature: transaction amount per quantity
df = df.withColumn("AmountPerQuantity", 
                  col("Transaction Amount") / col("Quantity"))

# Handle class imbalance
fraud_count = df.filter(col("Is Fraudulent") == 1).count()
non_fraud_count = df.filter(col("Is Fraudulent") == 0).count()
fraud_ratio = fraud_count / (fraud_count + non_fraud_count)

print(f"\nFraudulent transactions: {fraud_count} ({fraud_ratio:.2%})")
print(f"Non-fraudulent transactions: {non_fraud_count}")


Starting Data Preprocessing...


[Stage 6:>                                                          (0 + 1) / 1]


Fraudulent transactions: 73838 (5.01%)
Non-fraudulent transactions: 1399114


                                                                                

In [5]:
# 1. Calculate class weights (to handle imbalance)
fraud_weight = non_fraud_count / (fraud_count + non_fraud_count)  # Weight for fraud class
non_fraud_weight = fraud_count / (fraud_count + non_fraud_count)  # Weight for non-fraud

df = df.withColumn("classWeight", 
                  when(col("Is Fraudulent") == 1, fraud_weight)
                  .otherwise(non_fraud_weight))

# 2. Fill nulls in categorical columns (defensive programming)
categorical_cols = ["Payment Method", "Product Category", "Customer Location", "Device Used"]
for col_name in categorical_cols:
    df = df.fillna("unknown", subset=[col_name])

print("\nClass weights applied and categorical nulls handled (if any existed).")
print("Sample of weights (fraud cases should have higher weight):")
df.select("Is Fraudulent", "classWeight").show(5)


Class weights applied and categorical nulls handled (if any existed).
Sample of weights (fraud cases should have higher weight):
+-------------+-------------------+
|Is Fraudulent|        classWeight|
+-------------+-------------------+
|            0|0.05012926422585393|
|            0|0.05012926422585393|
|            0|0.05012926422585393|
|            0|0.05012926422585393|
|            0|0.05012926422585393|
+-------------+-------------------+
only showing top 5 rows



In [4]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler, Imputer

# Define pipeline stages
stages = []

# 1. Categorical encoding
categorical_cols = ["Payment Method", "Product Category", "Customer Location", "Device Used"]
for col_name in categorical_cols:
    # Convert strings to numerical indices
    string_indexer = StringIndexer(inputCol=col_name, outputCol=f"{col_name}_Index", handleInvalid="keep")
    # One-hot encode indices
    encoder = OneHotEncoder(inputCols=[f"{col_name}_Index"], outputCols=[f"{col_name}_OHE"], handleInvalid="keep")
    stages += [string_indexer, encoder]

# 2. Numerical columns (impute missing values with median)
numerical_cols = ["Transaction Amount", "Quantity", "Customer Age", 
                 "Account Age Days", "Transaction Hour", "DayOfMonth",
                 "DayOfWeek", "Month", "AddressMismatch", "AmountPerQuantity"]

for num_col in numerical_cols:
    imputer = Imputer(inputCol=num_col, outputCol=f"{num_col}_imputed", strategy="median")
    stages.append(imputer)
    numerical_cols[numerical_cols.index(num_col)] = f"{num_col}_imputed"  # Update column name

# 3. Assemble all features into a vector
assembler_inputs = [f"{c}_OHE" for c in categorical_cols] + numerical_cols
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="rawFeatures", handleInvalid="keep")
stages.append(assembler)

# 4. Scale features (mean=0, std=1)
scaler = StandardScaler(inputCol="rawFeatures", outputCol="features", withStd=True, withMean=True)
stages.append(scaler)

# 5. Random Forest with class weights
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="Is Fraudulent",
    weightCol="classWeight",  # Critical for imbalance!
    numTrees=50,
    maxDepth=5,
    seed=42
)
stages.append(rf)

# Create the pipeline
pipeline = Pipeline(stages=stages)
print("\nPipeline built successfully. Ready for training!")


Pipeline built successfully. Ready for training!


In [6]:
# Split data (80% train, 20% test)
train, test = df.randomSplit([0.8, 0.2], seed=42)
print(f"\nTrain rows: {train.count()}, Test rows: {test.count()}")

# Train the model
print("\nTraining started...")
start_time = time.time()
model = pipeline.fit(train)
print(f"Training completed in {time.time() - start_time:.2f} seconds")

# Generate predictions on test set
predictions = model.transform(test)
print("\nPredictions ready for evaluation.")

                                                                                


Train rows: 1178563, Test rows: 294389

Training started...


25/04/03 18:58:52 WARN DAGScheduler: Broadcasting large task binary with size 16.1 MiB
25/04/03 18:59:18 WARN DAGScheduler: Broadcasting large task binary with size 16.1 MiB
25/04/03 18:59:39 WARN DAGScheduler: Broadcasting large task binary with size 17.6 MiB
25/04/03 18:59:56 WARN DAGScheduler: Broadcasting large task binary with size 17.6 MiB
25/04/03 19:27:40 WARN DAGScheduler: Broadcasting large task binary with size 18.5 MiB
25/04/03 20:07:09 WARN DAGScheduler: Broadcasting large task binary with size 18.9 MiB
25/04/03 20:07:29 WARN MemoryStore: Not enough space to cache rdd_218_0 in memory! (computed 151.6 MiB so far)
25/04/03 20:07:29 WARN BlockManager: Persisting block rdd_218_0 to disk instead.
25/04/03 20:22:00 ERROR Inbox: Ignoring error                       (0 + 1) / 1]
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(Thre

ConnectionRefusedError: [Errno 111] Connection refused

In [None]:
# AUC-ROC
evaluator_auc = BinaryClassificationEvaluator(labelCol="Is Fraudulent", metricName="areaUnderROC")
auc = evaluator_auc.evaluate(predictions)
print(f"AUC-ROC: {auc:.4f}")

# Precision, Recall, F1
evaluator_multi = MulticlassClassificationEvaluator(labelCol="Is Fraudulent")
for metric in ["weightedPrecision", "weightedRecall", "f1"]:
    score = evaluator_multi.evaluate(predictions, {evaluator_multi.metricName: metric})
    print(f"{metric}: {score:.4f}")

In [None]:
print('f')