In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType
from pyspark.sql.functions import col, to_date, dayofmonth, dayofweek, month, when, lit, isnull, count
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler, Imputer
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression, GBTClassifier, DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from datetime import datetime
import time
import pandas as pd
import numpy as np

# Initialize Spark Session optimized for local Jupyter execution
spark = SparkSession.builder \
    .appName("FraudDetection") \
    .master("local[*]") \
    .config("spark.sql.debug.maxToStringFields", "100") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "8g") \
    .config("spark.driver.maxResultSize", "4g") \
    .getOrCreate()

print("Spark session created successfully!")

# Define schema
schema = StructType([
    StructField("Transaction ID", StringType(), True),
    StructField("Customer ID", StringType(), True),
    StructField("Transaction Amount", FloatType(), True),
    StructField("Transaction Date", StringType(), True),
    StructField("Payment Method", StringType(), True),
    StructField("Product Category", StringType(), True),
    StructField("Quantity", IntegerType(), True),
    StructField("Customer Age", IntegerType(), True),
    StructField("Customer Location", StringType(), True),
    StructField("Device Used", StringType(), True),
    StructField("IP Address", StringType(), True),
    StructField("Shipping Address", StringType(), True),
    StructField("Billing Address", StringType(), True),
    StructField("Is Fraudulent", IntegerType(), True),
    StructField("Account Age Days", IntegerType(), True),
    StructField("Transaction Hour", IntegerType(), True)
])

# Load data with error handling
try:
    # For local Jupyter environment, use local file path - adjust this to your file location
    try:
        # First try local file
        file_path = "transaction_data.csv"  # Change this to your local file path
        df = spark.read.option("header", "true") \
                     .option("multiLine", "true") \
                     .schema(schema) \
                     .csv(file_path)
    except:
        # If local fails, try HDFS path
        today = datetime.today()
        hdfs_path = f"hdfs://namenode:9000/user/root/transactions/YYYY={today.year}/MM={today.month:02d}/DD={today.day:02d}/transaction_data.csv"
        df = spark.read.option("header", "true") \
                     .option("multiLine", "true") \
                     .schema(schema) \
                     .csv(hdfs_path)
    
    print("Data loaded successfully. Row count:", df.count())
except Exception as e:
    print(f"Error loading data: {str(e)}")
    print("Please update the file path to point to your transaction data.")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/20 19:35:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/04/20 19:35:50 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Spark session created successfully!


[Stage 0:>                                                          (0 + 1) / 1]

Data loaded successfully. Row count: 23634


                                                                                

In [2]:
# Data Quality Check
print("\nData Quality Check:")
print("Null values per column:")
null_counts = df.select([count(when(isnull(c), c)).alias(c) for c in df.columns])
null_counts.show(vertical=True)

# Show sample data
print("\nSample data:")
df.show(5)

# Check class distribution
fraud_count = df.filter(col("Is Fraudulent") == 1).count()
non_fraud_count = df.filter(col("Is Fraudulent") == 0).count()
total = fraud_count + non_fraud_count
fraud_ratio = fraud_count / total

print(f"\nFraudulent transactions: {fraud_count} ({fraud_ratio:.2%})")
print(f"Non-fraudulent transactions: {non_fraud_count} ({1-fraud_ratio:.2%})")
print(f"Imbalance ratio: 1:{non_fraud_count/fraud_count:.1f}")

# Data Preprocessing
print("\nStarting Data Preprocessing...")
start_time = time.time()

# Convert Transaction Date and extract features
df = df.withColumn("Transaction Date", to_date(col("Transaction Date")))
df = df.withColumn("DayOfMonth", dayofmonth(col("Transaction Date")))
df = df.withColumn("DayOfWeek", dayofweek(col("Transaction Date")))
df = df.withColumn("Month", month(col("Transaction Date")))

# Create feature: is shipping different from billing
df = df.withColumn("AddressMismatch", 
                 when(col("Shipping Address") != col("Billing Address"), 1).otherwise(0))

# Create feature: transaction amount per quantity
df = df.withColumn("AmountPerQuantity", 
                 when(col("Quantity") > 0, col("Transaction Amount") / col("Quantity")).otherwise(col("Transaction Amount")))

# Create time-based features
df = df.withColumn("IsWeekend", 
                 when((col("DayOfWeek") == 1) | (col("DayOfWeek") == 7), 1).otherwise(0))

df = df.withColumn("IsNightTime", 
                 when((col("Transaction Hour") >= 22) | (col("Transaction Hour") <= 5), 1).otherwise(0))

# Calculate class weights (to handle imbalance)
# Higher weight for minority class (fraud)
weight_multiplier = 5.0  # Increase this for better fraud detection
fraud_weight = (non_fraud_count / total) * weight_multiplier
non_fraud_weight = (fraud_count / total)

df = df.withColumn("classWeight", 
                 when(col("Is Fraudulent") == 1, fraud_weight)
                 .otherwise(non_fraud_weight))

# Fill nulls in categorical columns
categorical_cols = ["Payment Method", "Product Category", "Customer Location", "Device Used"]
for col_name in categorical_cols:
    df = df.fillna("unknown", subset=[col_name])

# Define numerical columns for imputation
numerical_cols = ["Transaction Amount", "Quantity", "Customer Age", 
                 "Account Age Days", "Transaction Hour", "DayOfMonth", 
                 "DayOfWeek", "Month", "AddressMismatch", "AmountPerQuantity",
                 "IsWeekend", "IsNightTime"]

print(f"\nPreprocessing completed in {time.time() - start_time:.2f} seconds")
print("Sample of weights (fraud cases should have higher weight):")
df.select("Is Fraudulent", "classWeight").distinct().orderBy("Is Fraudulent").show(5)

# Show all features
print("\nAll features after preprocessing:")
df.printSchema()


Data Quality Check:
Null values per column:
-RECORD 0-----------------
 Transaction ID     | 0   
 Customer ID        | 0   
 Transaction Amount | 0   
 Transaction Date   | 0   
 Payment Method     | 0   
 Product Category   | 0   
 Quantity           | 0   
 Customer Age       | 0   
 Customer Location  | 0   
 Device Used        | 0   
 IP Address         | 0   
 Shipping Address   | 0   
 Billing Address    | 0   
 Is Fraudulent      | 0   
 Account Age Days   | 0   
 Transaction Hour   | 0   


Sample data:
+--------------------+--------------------+------------------+-------------------+--------------+----------------+--------+------------+-------------------+-----------+---------------+--------------------+--------------------+-------------+----------------+----------------+
|      Transaction ID|         Customer ID|Transaction Amount|   Transaction Date|Payment Method|Product Category|Quantity|Customer Age|  Customer Location|Device Used|     IP Address|    Shipping Address| 

In [3]:
# Define preprocessing pipeline stages (without the classifier)
preprocessing_stages = []

# 1. Categorical encoding
categorical_cols = ["Payment Method", "Product Category", "Customer Location", "Device Used"]
categorical_features = []

for col_name in categorical_cols:
    # Convert strings to numerical indices
    string_indexer = StringIndexer(inputCol=col_name, outputCol=f"{col_name}_Index", handleInvalid="keep")
    preprocessing_stages.append(string_indexer)
    
    # One-hot encode indices
    encoder = OneHotEncoder(inputCols=[f"{col_name}_Index"], outputCols=[f"{col_name}_OHE"], handleInvalid="keep")
    preprocessing_stages.append(encoder)
    categorical_features.append(f"{col_name}_OHE")

# 2. Numerical columns (impute missing values with median)
numerical_features = []
for num_col in numerical_cols:
    imputer = Imputer(inputCol=num_col, outputCol=f"{num_col}_imputed", strategy="median")
    preprocessing_stages.append(imputer)
    numerical_features.append(f"{num_col}_imputed")

# 3. Assemble all features into a vector
assembler = VectorAssembler(inputCols=categorical_features + numerical_features, 
                           outputCol="rawFeatures", 
                           handleInvalid="keep")
preprocessing_stages.append(assembler)

# 4. Scale features (mean=0, std=1)
scaler = StandardScaler(inputCol="rawFeatures", outputCol="features", withStd=True, withMean=True)
preprocessing_stages.append(scaler)

print("Feature pipeline built with stages:")
for i, stage in enumerate(preprocessing_stages):
    print(f"{i+1}. {type(stage).__name__}")

# Create preprocessing pipeline
preprocessing_pipeline = Pipeline(stages=preprocessing_stages)

Feature pipeline built with stages:
1. StringIndexer
2. OneHotEncoder
3. StringIndexer
4. OneHotEncoder
5. StringIndexer
6. OneHotEncoder
7. StringIndexer
8. OneHotEncoder
9. Imputer
10. Imputer
11. Imputer
12. Imputer
13. Imputer
14. Imputer
15. Imputer
16. Imputer
17. Imputer
18. Imputer
19. Imputer
20. Imputer
21. VectorAssembler
22. StandardScaler


In [4]:
# Split data (80% train, 20% test)
train, test = df.randomSplit([0.8, 0.2], seed=42)

# Check class distribution in splits
train_fraud_ratio = train.filter(col("Is Fraudulent") == 1).count() / train.count()
test_fraud_ratio = test.filter(col("Is Fraudulent") == 1).count() / test.count()
print(f"Train fraud ratio: {train_fraud_ratio:.2%}")
print(f"Test fraud ratio: {test_fraud_ratio:.2%}")

# Remove any conflicting columns if they exist
columns_to_check = ["prediction", "rawFeatures", "features"] + [f"{c}_Index" for c in categorical_cols] + [f"{c}_OHE" for c in categorical_cols] + [f"{c}_imputed" for c in numerical_cols]
for column in columns_to_check:
    if column in train.columns:
        train = train.drop(column)
    if column in test.columns:
        test = test.drop(column)

# Apply preprocessing to get a consistent dataset
print("\nApplying preprocessing pipeline...")
preprocessor_model = preprocessing_pipeline.fit(train)
train_preprocessed = preprocessor_model.transform(train)
test_preprocessed = preprocessor_model.transform(test)

# Define parameter combinations for Random Forest
rf_params = [
    {"numTrees": 50, "maxDepth": 5, "minInstancesPerNode": 1, "impurity": "gini"},
    {"numTrees": 100, "maxDepth": 10, "minInstancesPerNode": 2, "impurity": "entropy"},
    {"numTrees": 200, "maxDepth": 15, "minInstancesPerNode": 4, "impurity": "gini"}
]

rf_results = []
evaluator = BinaryClassificationEvaluator(labelCol="Is Fraudulent", metricName="areaUnderROC")
print("\nTraining Random Forest with multiple parameter combinations...")

for params in rf_params:
    start_time = time.time()
    
    # Create Random Forest with specific parameters
    rf = RandomForestClassifier(
        featuresCol="features", 
        labelCol="Is Fraudulent", 
        weightCol="classWeight",
        **params
    )
    
    try:
        # Train model directly on preprocessed data
        rf_model = rf.fit(train_preprocessed)
        
        # Make predictions
        predictions = rf_model.transform(test_preprocessed)
        
        # Evaluate
        auc = evaluator.evaluate(predictions)
        recall_eval = MulticlassClassificationEvaluator(
            labelCol="Is Fraudulent", 
            metricName="weightedRecall"
        )
        recall = recall_eval.evaluate(predictions)
        
        training_time = time.time() - start_time
        
        # Store results
        rf_results.append({
            "numTrees": params["numTrees"],
            "maxDepth": params["maxDepth"],
            "minInstancesPerNode": params["minInstancesPerNode"],
            "impurity": params["impurity"],
            "AUC": auc,
            "Recall": recall,
            "Time": training_time,
            "Model": rf_model  # Store the model itself for later use
        })
        
        print(f"RF with {params} - AUC: {auc:.4f}, Recall: {recall:.4f}, Time: {training_time:.1f}s")
    
    except Exception as e:
        print(f"Error during training with {params}: {str(e)}")

# Check if any models were successfully trained
if not rf_results:
    print("\nNo models were successfully trained. Please check the pipeline and data.")
else:
    # Find best model
    best_rf_params = max(rf_results, key=lambda x: x["AUC"])
    print("\nBest Random Forest parameters:")
    print(f"Number of trees: {best_rf_params['numTrees']}")
    print(f"Max depth: {best_rf_params['maxDepth']}")
    print(f"Min instances per node: {best_rf_params['minInstancesPerNode']}")
    print(f"Impurity: {best_rf_params['impurity']}")
    print(f"AUC: {best_rf_params['AUC']:.4f}")
    print(f"Recall: {best_rf_params['Recall']:.4f}")
    
    # Save best model for comparison
    best_rf_model = best_rf_params["Model"]

                                                                                

Train fraud ratio: 5.30%
Test fraud ratio: 4.66%

Applying preprocessing pipeline...


25/04/20 19:37:34 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
25/04/20 19:37:31 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB



Training Random Forest with multiple parameter combinations...


25/04/20 19:37:34 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
25/04/20 19:37:34 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
25/04/20 19:37:38 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
25/04/20 19:38:25 WARN DAGScheduler: Broadcasting large task binary with size 2.9 MiB
25/04/20 19:38:28 WARN DAGScheduler: Broadcasting large task binary with size 2.9 MiB
25/04/20 19:38:30 WARN DAGScheduler: Broadcasting large task binary with size 2.9 MiB
25/04/20 19:38:32 WARN DAGScheduler: Broadcasting large task binary with size 2.9 MiB
25/04/20 19:38:33 WARN DAGScheduler: Broadcasting large task binary with size 2.9 MiB
25/04/20 19:38:36 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
25/04/20 19:38:37 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
                                                                                

RF with {'numTrees': 50, 'maxDepth': 5, 'minInstancesPerNode': 1, 'impurity': 'gini'} - AUC: 0.7171, Recall: 0.0466, Time: 64.6s


25/04/20 19:38:38 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
25/04/20 19:38:38 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
25/04/20 19:38:40 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
25/04/20 19:39:26 WARN DAGScheduler: Broadcasting large task binary with size 2.9 MiB
25/04/20 19:39:32 WARN DAGScheduler: Broadcasting large task binary with size 2.9 MiB
25/04/20 19:39:35 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB
25/04/20 19:39:38 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB
25/04/20 19:39:40 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB
25/04/20 19:39:43 WARN DAGScheduler: Broadcasting large task binary with size 3.1 MiB
25/04/20 19:39:46 WARN DAGScheduler: Broadcasting large task binary with size 3.1 MiB
25/04/20 19:39:49 WARN DAGScheduler: Broadcasting large task binary with size 3.1 MiB
25/04/20 19:39:50 WARN DAGScheduler: Broadcasting larg

RF with {'numTrees': 100, 'maxDepth': 10, 'minInstancesPerNode': 2, 'impurity': 'entropy'} - AUC: 0.7791, Recall: 0.0466, Time: 80.9s


25/04/20 19:39:59 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
25/04/20 19:39:59 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
25/04/20 19:40:01 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
25/04/20 19:40:47 WARN DAGScheduler: Broadcasting large task binary with size 2.9 MiB
25/04/20 19:40:55 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB
25/04/20 19:41:00 WARN DAGScheduler: Broadcasting large task binary with size 3.1 MiB
25/04/20 19:41:05 WARN DAGScheduler: Broadcasting large task binary with size 3.1 MiB
25/04/20 19:41:09 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
25/04/20 19:41:14 WARN DAGScheduler: Broadcasting large task binary with size 3.3 MiB
25/04/20 19:41:17 WARN DAGScheduler: Broadcasting large task binary with size 3.3 MiB
25/04/20 19:41:22 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB
25/04/20 19:41:27 WARN DAGScheduler: Broadcasting larg

RF with {'numTrees': 200, 'maxDepth': 15, 'minInstancesPerNode': 4, 'impurity': 'gini'} - AUC: 0.7797, Recall: 0.0466, Time: 126.4s

Best Random Forest parameters:
Number of trees: 200
Max depth: 15
Min instances per node: 4
Impurity: gini
AUC: 0.7797
Recall: 0.0466


                                                                                

In [5]:
# Define models to test
try:
    # Only proceed if we have a best RF model from previous step
    models = {
        "Logistic Regression": LogisticRegression(
            featuresCol="features", 
            labelCol="Is Fraudulent", 
            weightCol="classWeight",
            maxIter=20,
            regParam=0.01,
            elasticNetParam=0.5
        ),
        "Random Forest": RandomForestClassifier(
            featuresCol="features", 
            labelCol="Is Fraudulent", 
            weightCol="classWeight",
            numTrees=best_rf_params["numTrees"],
            maxDepth=best_rf_params["maxDepth"],
            minInstancesPerNode=best_rf_params["minInstancesPerNode"],
            impurity=best_rf_params["impurity"]
        ),
        "Gradient-Boosted Trees": GBTClassifier(
            featuresCol="features", 
            labelCol="Is Fraudulent", 
            weightCol="classWeight",
            maxIter=50,
            maxDepth=8,
            stepSize=0.1
        ),
        "Decision Tree": DecisionTreeClassifier(
            featuresCol="features", 
            labelCol="Is Fraudulent", 
            weightCol="classWeight",
            maxDepth=10,
            impurity="entropy"
        )
    }

    # Metrics to evaluate
    metrics = {
        "AUC": BinaryClassificationEvaluator(labelCol="Is Fraudulent", metricName="areaUnderROC"),
        "PR AUC": BinaryClassificationEvaluator(labelCol="Is Fraudulent", metricName="areaUnderPR"),
        "Recall": MulticlassClassificationEvaluator(labelCol="Is Fraudulent", metricName="weightedRecall"),
        "Precision": MulticlassClassificationEvaluator(labelCol="Is Fraudulent", metricName="weightedPrecision"),
        "F1": MulticlassClassificationEvaluator(labelCol="Is Fraudulent", metricName="f1")
    }

    results = []

    # Train and evaluate each model
    for model_name, model in models.items():
        print(f"\nTraining {model_name}...")
        start_time = time.time()
        
        try:
            # Train model directly on preprocessed data
            trained_model = model.fit(train_preprocessed)
            predictions = trained_model.transform(test_preprocessed)
            
            # Evaluate with all metrics
            model_results = {"Model": model_name}
            for metric_name, evaluator in metrics.items():
                score = evaluator.evaluate(predictions)
                model_results[metric_name] = score
            
            training_time = time.time() - start_time
            model_results["Time (s)"] = training_time
            
            results.append(model_results)
            
            # Print model performance
            print(f"{model_name} evaluation:")
            for metric, value in model_results.items():
                if metric != "Model" and metric != "Time (s)":
                    print(f"  - {metric}: {value:.4f}")
            print(f"  - Training time: {training_time:.1f}s")
        
        except Exception as e:
            print(f"Error training {model_name}: {str(e)}")

    # Show results table if we have any
    if results:
        results_df = spark.createDataFrame(results).orderBy("F1", ascending=False)
        print("\nModel Performance Comparison:")
        results_df.show(truncate=False)
        
        # Convert to pandas for visualization (if you want to plot the results)
        results_pd = results_df.toPandas()
        print("Results converted to pandas DataFrame for visualization")
    else:
        print("No models were successfully trained for comparison")

except Exception as e:
    print(f"Error in model comparison step: {str(e)}")
    print("Make sure parameter tuning step completed successfully")


Training Logistic Regression...


25/04/20 19:47:28 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
25/04/20 19:47:38 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/04/20 19:47:38 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
25/04/20 19:47:38 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
25/04/20 19:47:47 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
25/04/20 19:47:48 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
25/04/20 19:47:49 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
25/04/20 19:47:49 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
25/04/20 19:47:50 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
25/04/20 19:47:50 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
25/04/20 19:47:51 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
25/04/20 

Logistic Regression evaluation:
  - AUC: 0.7817
  - PR AUC: 0.2915
  - Recall: 0.3684
  - Precision: 0.9441
  - F1: 0.4898
  - Training time: 40.2s

Training Random Forest...


25/04/20 19:48:08 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
25/04/20 19:48:08 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
25/04/20 19:48:08 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
25/04/20 19:49:04 WARN DAGScheduler: Broadcasting large task binary with size 2.9 MiB
25/04/20 19:49:19 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB
25/04/20 19:49:24 WARN DAGScheduler: Broadcasting large task binary with size 3.1 MiB
25/04/20 19:49:30 WARN DAGScheduler: Broadcasting large task binary with size 3.1 MiB
25/04/20 19:49:34 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
25/04/20 19:49:39 WARN DAGScheduler: Broadcasting large task binary with size 3.3 MiB
25/04/20 19:49:45 WARN DAGScheduler: Broadcasting large task binary with size 3.3 MiB
25/04/20 19:49:50 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB
25/04/20 19:49:56 WARN DAGScheduler: Broadcasting larg

Random Forest evaluation:
  - AUC: 0.7797
  - PR AUC: 0.2603
  - Recall: 0.0466
  - Precision: 0.0022
  - F1: 0.0042
  - Training time: 152.6s

Training Gradient-Boosted Trees...


25/04/20 19:50:41 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
25/04/20 19:50:41 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
25/04/20 19:50:43 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
25/04/20 19:51:37 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
25/04/20 19:51:46 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
25/04/20 19:51:48 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
25/04/20 19:51:52 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
25/04/20 19:51:52 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
25/04/20 19:51:54 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
25/04/20 19:51:57 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
25/04/20 19:52:00 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
25/04/20 19:52:03 WARN DAGScheduler: Broadcasting larg

Gradient-Boosted Trees evaluation:
  - AUC: 0.7452
  - PR AUC: 0.3432
  - Recall: 0.6596
  - Precision: 0.9375
  - F1: 0.7573
  - Training time: 978.2s

Training Decision Tree...


25/04/20 20:06:59 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
25/04/20 20:06:59 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
25/04/20 20:07:01 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
25/04/20 20:07:48 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
25/04/20 20:07:53 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
25/04/20 20:07:56 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
25/04/20 20:07:58 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
25/04/20 20:08:01 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
25/04/20 20:08:03 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
25/04/20 20:08:05 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
25/04/20 20:08:06 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
25/04/20 20:08:08 WARN DAGScheduler: Broadcasting larg

Decision Tree evaluation:
  - AUC: 0.5856
  - PR AUC: 0.1181
  - Recall: 0.4833
  - Precision: 0.9254
  - F1: 0.6115
  - Training time: 77.4s

Model Performance Comparison:


                                                                                

+------------------+-------------------+----------------------+-------------------+--------------------+-------------------+-----------------+
|AUC               |F1                 |Model                 |PR AUC             |Precision           |Recall             |Time (s)         |
+------------------+-------------------+----------------------+-------------------+--------------------+-------------------+-----------------+
|0.7451786652653702|0.7572958081084157 |Gradient-Boosted Trees|0.34315477559817115|0.9374502246353624  |0.6596015260703688 |978.1850979328156|
|0.5855653017502728|0.6115360455555233 |Decision Tree         |0.11814795945223107|0.9253916354037492  |0.48325561678677403|77.37584209442139|
|0.7816686203969474|0.48975980455963225|Logistic Regression   |0.29146732606134446|0.9440829498593696  |0.36837643069097076|40.21971654891968|
|0.7796657100125286|0.00415495510158951|Random Forest         |0.2603105106423401 |0.002174350179276071|0.04662992793556592|152.6174280643463|

In [6]:
# Determine the best model based on F1 score from results_df
try:
    # If we have valid results
    if results and not results_df.isEmpty():
        # Get the best model by F1 score
        best_model_row = results_df.first()
        best_model_name = best_model_row["Model"]
        
        print(f"\nThe best performing model is: {best_model_name}")
        print(f"F1 Score: {best_model_row['F1']:.4f}")
        print(f"AUC: {best_model_row['AUC']:.4f}")
        print(f"PR AUC: {best_model_row['PR AUC']:.4f}")
        print(f"Precision: {best_model_row['Precision']:.4f}")
        print(f"Recall: {best_model_row['Recall']:.4f}")
        
        # Get the trained model object (assuming it's stored in the variable named according to the model name)
        best_model = None
        if best_model_name == "Gradient-Boosted Trees":
            best_model = models["Gradient-Boosted Trees"].fit(train_preprocessed)
        elif best_model_name == "Random Forest":
            best_model = models["Random Forest"].fit(train_preprocessed)
        elif best_model_name == "Logistic Regression":
            best_model = models["Logistic Regression"].fit(train_preprocessed)
        elif best_model_name == "Decision Tree":
            best_model = models["Decision Tree"].fit(train_preprocessed)
        
        if best_model:
            # Generate HDFS path for model saving
            from datetime import datetime
            today = datetime.today()
            year = today.year
            month = f"{today.month:02d}"
            day = f"{today.day:02d}"
            base_path = "hdfs://namenode:9000/user/root/model"
            date_path = f"YYYY={year}/MM={month}/DD={day}"
            model_path = f"{base_path}/fraud_detection/{date_path}/{best_model_name.replace(' ', '_').lower()}"
            
            # Save the model to HDFS
            try:
                print(f"\nSaving {best_model_name} model to HDFS at: {model_path}")
                best_model.write().overwrite().save(model_path)
                print(f"Model successfully saved!")
                
                # Save model metadata as JSON for reference
                from pyspark.sql.functions import lit, to_json, struct
                
                # Create metadata DataFrame with model metrics and details
                metadata = spark.createDataFrame([{
                    "model_name": best_model_name,
                    "training_date": today.strftime("%Y-%m-%d"),
                    "metrics": {
                        "f1": float(best_model_row["F1"]),
                        "auc": float(best_model_row["AUC"]),
                        "pr_auc": float(best_model_row["PR AUC"]),
                        "precision": float(best_model_row["Precision"]),
                        "recall": float(best_model_row["Recall"]),
                        "training_time_seconds": float(best_model_row["Time (s)"])
                    }
                }])
                
                # Save metadata alongside model
                metadata_path = f"{model_path}_metadata"
                metadata.write.mode("overwrite").json(metadata_path)
                print(f"Model metadata saved to: {metadata_path}")
                
            except Exception as e:
                print(f"Error saving model to HDFS: {str(e)}")
        else:
            print(f"Could not retrieve the trained {best_model_name} model for saving.")
    else:
        print("No valid model results available to determine the best model.")
        
except Exception as e:
    print(f"Error in selecting and saving the best model: {str(e)}")


The best performing model is: Gradient-Boosted Trees
F1 Score: 0.7573
AUC: 0.7452
PR AUC: 0.3432
Precision: 0.9375
Recall: 0.6596


25/04/20 20:26:46 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
25/04/20 20:26:46 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
25/04/20 20:26:49 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
25/04/20 20:27:41 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
25/04/20 20:27:49 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
25/04/20 20:27:51 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
25/04/20 20:27:54 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
25/04/20 20:27:56 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
25/04/20 20:27:57 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
25/04/20 20:28:00 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
25/04/20 20:28:02 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
25/04/20 20:28:05 WARN DAGScheduler: Broadcasting larg


Saving Gradient-Boosted Trees model to HDFS at: hdfs://namenode:9000/user/root/model/fraud_detection/YYYY=2025/MM=04/DD=20/gradient-boosted_trees


                                                                                

Model successfully saved!




Model metadata saved to: hdfs://namenode:9000/user/root/model/fraud_detection/YYYY=2025/MM=04/DD=20/gradient-boosted_trees_metadata


                                                                                