# Course: Modern Architectures for Big Data II
## Contributors:
- Miguel Hidalgo (Sr. Analytical Consultant)
- Francisco Esteves (Analytics & Data Science Specialist)
- Benjamin Castro (Software Engineer)

##### Original Dataset: https://www.kaggle.com/datasets/kelvinkelue/credit-card-fraud-prediction/data?select=fraud+test.csv

## Environment setup

#### Import Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder

from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.window import Window
from pyspark.sql.functions import sqrt

# Show every column when a pandas DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

#### Configure Spark Session

In [2]:
# Build (or reuse) the Spark session shared by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("classifier")
    .config("spark.hadoop.home.dir", "C:/hadoop")
    # NOTE(review): presumably required by the 'dd/MM/yyyy' patterns parsed
    # in the cleaning step — confirm before removing.
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
    .getOrCreate()
)
print(f"This cluster relies on Spark '{spark.version}'")

25/02/10 05:48:28 WARN Utils: Your hostname, osbdet resolves to a loopback address: 127.0.0.1; using 10.0.2.15 instead (on interface enp0s3)
25/02/10 05:48:28 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/10 05:48:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/02/10 05:48:35 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


This cluster relies on Spark '3.5.0'


## Feature Engineering

#### Load Data

In [3]:
# Read the transactions CSV: the first row provides the column names and
# Spark infers each column's type from the data.
train_csv_path = "../data/transactions_train.csv"
df = spark.read.csv(train_csv_path, header=True, inferSchema=True)
df.printSchema()

                                                                                

root
 |-- cc_num: long (nullable = true)
 |-- trans_num: string (nullable = true)
 |-- trans_date_trans_time: string (nullable = true)
 |-- merchant: string (nullable = true)
 |-- category: string (nullable = true)
 |-- amt: double (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- street: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- city_pop: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- unix_time: integer (nullable = true)
 |-- merch_lat: double (nullable = true)
 |-- merch_long: double (nullable = true)
 |-- is_fraud: integer (nullable = true)



#### Clean Data

In [4]:
# Clean the raw transactions:
#   * parse the transaction timestamp and the date of birth,
#   * derive hour / day-of-week / month / year / weekend / customer_age,
#   * encode gender as 1 (male) or 0 (anything else, including null),
#   * drop descriptive columns that are not used downstream,
#   * discard every row that still contains a null.
df_clean = (
    df
    .withColumn("trans_date_trans_time", f.to_timestamp("trans_date_trans_time", "dd/MM/yyyy HH:mm"))
    .withColumn("dob", f.to_date(f.col("dob"), "dd/MM/yyyy"))
    .withColumn("hour", f.hour("trans_date_trans_time"))
    .withColumn("day", f.dayofweek("trans_date_trans_time"))
    .withColumn("month", f.month("trans_date_trans_time"))
    .withColumn("year", f.year("trans_date_trans_time"))
    # dayofweek() numbers Sunday as 1 and Saturday as 7.
    .withColumn("weekend", f.when(f.col("day").isin(1, 7), 1).otherwise(0))
    # Age in whole years, approximated as days / 365 rounded to the nearest integer.
    .withColumn(
        "customer_age",
        f.round(f.datediff(f.col("trans_date_trans_time"), f.col("dob")) / 365, 0).cast('Integer'),
    )
    # Accept both the single-letter code ("M") and the spelled-out label
    # ("Male"): matching only "Male" turns the feature into a constant 0
    # when the file uses the M/F coding.
    .withColumn("gender", f.when(f.col("gender").isin("M", "Male"), 1).otherwise(0))
    .withColumn("unix_time", f.col("unix_time").cast("long"))
    .drop("merchant", "first", "last", "job", "dob", "street", "city", "state", "zip", "city_pop")
    .dropna()
)

df_clean.printSchema()

root
 |-- cc_num: long (nullable = true)
 |-- trans_num: string (nullable = true)
 |-- trans_date_trans_time: timestamp (nullable = true)
 |-- category: string (nullable = true)
 |-- amt: double (nullable = true)
 |-- gender: integer (nullable = false)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- unix_time: long (nullable = true)
 |-- merch_lat: double (nullable = true)
 |-- merch_long: double (nullable = true)
 |-- is_fraud: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- weekend: integer (nullable = false)
 |-- customer_age: integer (nullable = true)



#### Historical Statistics

##### Amounts

In [5]:
# Per-card window ordered by transaction time; later feature cells reuse it.
# With an orderBy and no explicit frame, Spark's default frame runs from the
# start of the partition to the current row, so every statistic below also
# includes the current transaction.
window_cc_num_timestamp = Window.partitionBy("cc_num").orderBy("trans_date_trans_time")

# Running amount statistics per card, appended in one projection.
amt_over_history = {
    "historical_mean_amt": f.mean("amt"),
    "historical_std_amt": f.stddev("amt"),
    "historical_max_amt": f.max("amt"),
    "historical_min_amt": f.min("amt"),
}
df_amt = df_clean.select(
    "*",
    *[
        aggregate.over(window_cc_num_timestamp).alias(column_name)
        for column_name, aggregate in amt_over_history.items()
    ],
)

df_amt.printSchema()

root
 |-- cc_num: long (nullable = true)
 |-- trans_num: string (nullable = true)
 |-- trans_date_trans_time: timestamp (nullable = true)
 |-- category: string (nullable = true)
 |-- amt: double (nullable = true)
 |-- gender: integer (nullable = false)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- unix_time: long (nullable = true)
 |-- merch_lat: double (nullable = true)
 |-- merch_long: double (nullable = true)
 |-- is_fraud: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- weekend: integer (nullable = false)
 |-- customer_age: integer (nullable = true)
 |-- historical_mean_amt: double (nullable = true)
 |-- historical_std_amt: double (nullable = true)
 |-- historical_max_amt: double (nullable = true)
 |-- historical_min_amt: double (nullable = true)



##### Average Time Between Transactions

In [6]:
# Seconds elapsed since the same card's previous transaction
# (lag() yields null for a card's first row, so the gap is null there).
current_ts = f.unix_timestamp("trans_date_trans_time")
previous_ts = f.unix_timestamp(f.lag("trans_date_trans_time", 1).over(window_cc_num_timestamp))
df_time_diff = df_amt.withColumn("time_diff", current_ts - previous_ts)

# Running mean of those gaps per card; the raw gap column is removed afterwards.
running_avg_gap = f.mean("time_diff").over(window_cc_num_timestamp)
df_avg_time_diff = (
    df_time_diff
    .withColumn("historical_avg_time_diff", running_avg_gap)
    .drop("time_diff")
)

df_avg_time_diff.printSchema()

root
 |-- cc_num: long (nullable = true)
 |-- trans_num: string (nullable = true)
 |-- trans_date_trans_time: timestamp (nullable = true)
 |-- category: string (nullable = true)
 |-- amt: double (nullable = true)
 |-- gender: integer (nullable = false)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- unix_time: long (nullable = true)
 |-- merch_lat: double (nullable = true)
 |-- merch_long: double (nullable = true)
 |-- is_fraud: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- weekend: integer (nullable = false)
 |-- customer_age: integer (nullable = true)
 |-- historical_mean_amt: double (nullable = true)
 |-- historical_std_amt: double (nullable = true)
 |-- historical_max_amt: double (nullable = true)
 |-- historical_min_amt: double (nullable = true)
 |-- historical_avg_time_diff: double (nullable = true)



##### Distance from Merchant

In [7]:
# Straight-line distance between the customer and merchant coordinates,
# computed directly on latitude/longitude values (i.e. in degrees).
lat_delta = f.col("lat") - f.col("merch_lat")
long_delta = f.col("long") - f.col("merch_long")
df_distance = df_avg_time_diff.withColumn("distance", f.sqrt(lat_delta**2 + long_delta**2))

# Running mean and standard deviation of that distance per card.
distance_mean = f.mean("distance").over(window_cc_num_timestamp)
distance_std = f.stddev("distance").over(window_cc_num_timestamp)
df_avg_distance = (
    df_distance
    .withColumn("historical_avg_distance_from_merchant", distance_mean)
    .withColumn("historical_std_distance_from_merchant", distance_std)
)

df_avg_distance.printSchema()

root
 |-- cc_num: long (nullable = true)
 |-- trans_num: string (nullable = true)
 |-- trans_date_trans_time: timestamp (nullable = true)
 |-- category: string (nullable = true)
 |-- amt: double (nullable = true)
 |-- gender: integer (nullable = false)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- unix_time: long (nullable = true)
 |-- merch_lat: double (nullable = true)
 |-- merch_long: double (nullable = true)
 |-- is_fraud: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- weekend: integer (nullable = false)
 |-- customer_age: integer (nullable = true)
 |-- historical_mean_amt: double (nullable = true)
 |-- historical_std_amt: double (nullable = true)
 |-- historical_max_amt: double (nullable = true)
 |-- historical_min_amt: double (nullable = true)
 |-- historical_avg_time_diff: double (nullable = true)
 |-- distance: double 

##### Number of Unique Transaction Categories

In [8]:
# Count of distinct categories collected over the per-card window.
categories_seen = f.collect_set("category").over(window_cc_num_timestamp)
df_historical_num_categories = df_avg_distance.withColumn(
    "historical_num_categories",
    f.size(categories_seen),
)

## Machine Learning

### Data Preparation

In [9]:
# Discard every row that has a null in any column.
df_no_nulls = df_historical_num_categories.na.drop()

# Fit a StringIndexer that maps each category string to a numeric index,
# then add that index as a new column.
indexer = StringIndexer().setInputCol("category").setOutputCol("category_index")
indexer_model = indexer.fit(df_no_nulls)
df_indexed = indexer_model.transform(df_no_nulls)

25/02/10 05:49:31 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [10]:
# Persist the fitted StringIndexer model, overwriting any copy already at this path.
string_indexer_path = "/home/osbdet/notebooks/real-time-analytics/classifier/string_indexer"
indexer_model.write().overwrite().save(string_indexer_path)

                                                                                

In [11]:
# One-hot encode the numeric category index into a vector column.
# NOTE(review): the fitted encoder is not saved in the visible cells, unlike
# the StringIndexer model — confirm whether inference needs it persisted too.
encoder = OneHotEncoder(inputCol="category_index", outputCol="category_encoded")
encoder_model = encoder.fit(df_indexed)
df_encoded = encoder_model.transform(df_indexed)

In [12]:
# Final modelling frame: remove the intermediate index column now that the
# encoded vector column exists.
df_prepared = df_encoded.drop("category_index")

In [13]:
# Columns kept out of the feature vector: transaction id, raw timestamp,
# the un-encoded category string, and the label.
non_feature_columns = {"trans_num", "trans_date_trans_time", "category", "is_fraud"}
feature_columns = [name for name in df_prepared.columns if name not in non_feature_columns]
# NOTE(review): cc_num (card identifier) and unix_time are still part of
# feature_columns — confirm that feeding them to the model is intended.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Random split: 80% training / 20% validation, with a fixed seed.
splits = df_prepared.randomSplit([0.8, 0.2], seed=42)
df_train_split, df_validate_split = splits

# Assemble the feature vector for both subsets.
df_train = assembler.transform(df_train_split)
df_validate = assembler.transform(df_validate_split)

df_train.printSchema()

root
 |-- cc_num: long (nullable = true)
 |-- trans_num: string (nullable = true)
 |-- trans_date_trans_time: timestamp (nullable = true)
 |-- category: string (nullable = true)
 |-- amt: double (nullable = true)
 |-- gender: integer (nullable = false)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- unix_time: long (nullable = true)
 |-- merch_lat: double (nullable = true)
 |-- merch_long: double (nullable = true)
 |-- is_fraud: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- weekend: integer (nullable = false)
 |-- customer_age: integer (nullable = true)
 |-- historical_mean_amt: double (nullable = true)
 |-- historical_std_amt: double (nullable = true)
 |-- historical_max_amt: double (nullable = true)
 |-- historical_min_amt: double (nullable = true)
 |-- historical_avg_time_diff: double (nullable = true)
 |-- distance: double 

### Classifier Implementation

In [14]:
# Random forest that predicts `is_fraud` from the assembled `features` vector.
# The seed is pinned (matching the seed used for the train/validation split)
# so that re-running the notebook trains the same forest; without it the
# bootstrap sampling and feature subsetting are not reproducible across runs.
rf = RandomForestClassifier(
    labelCol="is_fraud",
    featuresCol="features",
    numTrees=50,
    maxDepth=10,
    maxBins=64,
    seed=42,
)

# Train the classifier on the training subset.
rf_model = rf.fit(df_train)

25/02/10 05:56:16 WARN DAGScheduler: Broadcasting large task binary with size 1342.2 KiB
25/02/10 05:56:31 WARN DAGScheduler: Broadcasting large task binary with size 1954.3 KiB
25/02/10 05:56:49 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB

### Save Classifier

In [15]:
# Persist the trained random forest, overwriting any model already at this path.
rf_model_path = "/home/osbdet/notebooks/real-time-analytics/classifier/model"
rf_model.write().overwrite().save(rf_model_path)

                                                                                

In [16]:
# Reload the persisted model from disk to check that the saved artefact is readable.
saved_model_path = "/home/osbdet/notebooks/real-time-analytics/classifier/model"
loaded_rf_model = RandomForestClassificationModel.load(saved_model_path)

                                                                                

In [17]:
# Number of features the reloaded model was trained on (length of the input vector).
loaded_rf_model.numFeatures

36

25/02/10 06:15:29 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 529692 ms exceeds timeout 120000 ms
25/02/10 06:15:30 WARN SparkContext: Killing executors is not supported by current scheduler.
25/02/10 06:15:32 WARN Executor: Issue communicating with driver in heartbeater
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:101)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:85)
	at org.apache.spark.storage.BlockManagerMaster.registerBlockManager(BlockManagerMaster.scala:80)
	at org.apache.spark.storage.BlockManager.reregister(BlockManager.scala:642)
	at org.apache.spark.executor.Executor.reportHeartBeat(Executor.scala:1223)
	at o