In [1]:
#Data Preprocessing
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.types import DoubleType

# Build the Spark session for this notebook (getOrCreate hands back an existing session
# when one is already active), labelled with the project's application name.
spark = (
    SparkSession.builder
    .appName("Vehicle Insurance Claim Fraud Detection")
    .getOrCreate()
)

# Read the raw claims CSV into a DataFrame, taking column names from the header row.
# No schema is supplied and schema inference is not enabled, so the columns arrive as strings.
df = spark.read.option("header", "true").csv("../dataset/vehicle_insurance_claim_fraud_data.csv")

# Add an 'Imputed_Age' column: keep 'Age' when it is not 0, otherwise use the midpoint of the
# age band given in 'AgeOfPolicyHolder'.
# Fix: the previous expression added the band's first token to itself, so it yielded the lower
# bound of the band instead of the average of its two bounds.
# The bounds are taken as the first and the last integer appearing in the band text, so a band
# that contains a single number resolves to that number.
# NOTE(review): assumes band labels look like "<low> to <high>" -- confirm against the data.
age_band = col("AgeOfPolicyHolder")
band_low = regexp_extract(age_band, r"(\d+)", 1).cast("int")
band_high = regexp_extract(age_band, r"(\d+)\D*$", 1).cast("int")
df = df.withColumn(
    "Imputed_Age",
    when(col("Age") == 0, (band_low + band_high) / 2).otherwise(col("Age")),
)

# Cast the columns that hold numeric values to DoubleType.
# Fix: the loop variable used to be named `col`, which rebound the `col` function brought in by
# `from pyspark.sql.functions import *` to a plain string for the rest of the notebook. A
# distinct loop name keeps `col(...)` usable after this loop.
numerical_cols = ['WeekOfMonth', 'WeekOfMonthClaimed', 'Age', 'Imputed_Age', 'FraudFound_P', 'RepNumber', 'Deductible', 'DriverRating', 'Year']
for numeric_name in numerical_cols:
    df = df.withColumn(numeric_name, df[numeric_name].cast(DoubleType()))

# Randomly partition the rows into training and test sets using 0.8 / 0.2 weights and a fixed
# seed of 42.
(training, test) = df.randomSplit(weights=[0.8, 0.2], seed=42)

# String-index and one-hot encode every categorical column. The pipeline is fitted on the
# training set only and the fitted model is then applied to both the training and test sets.
categorical_cols = [
    'Month', 'DayOfWeek', 'Make', 'AccidentArea', 'DayOfWeekClaimed', 'MonthClaimed',
    'Sex', 'MaritalStatus', 'Fault', 'PolicyType', 'VehicleCategory', 'VehiclePrice',
    'Days_Policy_Accident', 'Days_Policy_Claim', 'PastNumberOfClaims', 'AgeOfVehicle',
    'AgeOfPolicyHolder', 'PoliceReportFiled', 'WitnessPresent', 'AgentType',
    'NumberOfSuppliments', 'AddressChange_Claim', 'NumberOfCars', 'BasePolicy',
]

# One indexer per column writes '<name>_indexed'; handleInvalid="keep" is set on each indexer.
indexers = [
    StringIndexer(inputCol=cat_name, outputCol=cat_name + "_indexed", handleInvalid="keep")
    for cat_name in categorical_cols
]
# One encoder per column reads '<name>_indexed' and writes '<name>_encoded'.
encoders = [
    OneHotEncoder(inputCol=cat_name + "_indexed", outputCol=cat_name + "_encoded")
    for cat_name in categorical_cols
]

# All indexers run before all encoders.
pipeline = Pipeline(stages=indexers + encoders)
pipelineModel = pipeline.fit(training)

# After encoding, the raw categorical columns and 'PolicyNumber' are removed from both sets.
cols_to_drop = categorical_cols + ["PolicyNumber"]
training_encoded = pipelineModel.transform(training).drop(*cols_to_drop)
test_encoded = pipelineModel.transform(test).drop(*cols_to_drop)
print('Data is processed.')

Data is processed.


In [2]:
# Display the column names of the encoded training set (numeric columns, then the
# '<name>_indexed' and '<name>_encoded' columns produced by the encoding pipeline).
training_encoded.columns

['WeekOfMonth',
 'WeekOfMonthClaimed',
 'Age',
 'FraudFound_P',
 'RepNumber',
 'Deductible',
 'DriverRating',
 'Year',
 'Imputed_Age',
 'Month_indexed',
 'DayOfWeek_indexed',
 'Make_indexed',
 'AccidentArea_indexed',
 'DayOfWeekClaimed_indexed',
 'MonthClaimed_indexed',
 'Sex_indexed',
 'MaritalStatus_indexed',
 'Fault_indexed',
 'PolicyType_indexed',
 'VehicleCategory_indexed',
 'VehiclePrice_indexed',
 'Days_Policy_Accident_indexed',
 'Days_Policy_Claim_indexed',
 'PastNumberOfClaims_indexed',
 'AgeOfVehicle_indexed',
 'AgeOfPolicyHolder_indexed',
 'PoliceReportFiled_indexed',
 'WitnessPresent_indexed',
 'AgentType_indexed',
 'NumberOfSuppliments_indexed',
 'AddressChange_Claim_indexed',
 'NumberOfCars_indexed',
 'BasePolicy_indexed',
 'Month_encoded',
 'DayOfWeek_encoded',
 'Make_encoded',
 'AccidentArea_encoded',
 'DayOfWeekClaimed_encoded',
 'MonthClaimed_encoded',
 'Sex_encoded',
 'MaritalStatus_encoded',
 'Fault_encoded',
 'PolicyType_encoded',
 'VehicleCategory_encoded',
 'Vehi

In [6]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorAssembler

# Input columns for the feature vector, in this order: the numeric columns, then the
# '<name>_indexed' column of every categorical, then the '<name>_encoded' column of every
# categorical. The label column 'FraudFound_P' is not part of the list.
numeric_feature_cols = [
    'WeekOfMonth', 'WeekOfMonthClaimed', 'Age', 'RepNumber',
    'Deductible', 'DriverRating', 'Year', 'Imputed_Age',
]
categorical_base_cols = [
    'Month', 'DayOfWeek', 'Make', 'AccidentArea', 'DayOfWeekClaimed', 'MonthClaimed',
    'Sex', 'MaritalStatus', 'Fault', 'PolicyType', 'VehicleCategory', 'VehiclePrice',
    'Days_Policy_Accident', 'Days_Policy_Claim', 'PastNumberOfClaims', 'AgeOfVehicle',
    'AgeOfPolicyHolder', 'PoliceReportFiled', 'WitnessPresent', 'AgentType',
    'NumberOfSuppliments', 'AddressChange_Claim', 'NumberOfCars', 'BasePolicy',
]
features = (
    numeric_feature_cols
    + [base_name + '_indexed' for base_name in categorical_base_cols]
    + [base_name + '_encoded' for base_name in categorical_base_cols]
)

# Combine every column listed in `features` into a single vector column named "features",
# then keep only that vector and the 'FraudFound_P' column for each of the two sets.
assembler = VectorAssembler(inputCols=features, outputCol="features", handleInvalid="keep")
model_input_cols = ["features", "FraudFound_P"]
train_df = assembler.transform(training_encoded).select(*model_input_cols)
test_df = assembler.transform(test_encoded).select(*model_input_cols)

# Decision tree classifier reading the assembled "features" vector, with 'FraudFound_P' as label.
dt = DecisionTreeClassifier(featuresCol="features", labelCol="FraudFound_P")

# Search grid: three maxDepth values crossed with three minInstancesPerNode values.
paramGrid = (
    ParamGridBuilder()
    .addGrid(dt.maxDepth, [2, 5, 10])
    .addGrid(dt.minInstancesPerNode, [1, 5, 10])
    .build()
)

# Candidate models are compared on weighted precision.
evaluator = MulticlassClassificationEvaluator(
    labelCol="FraudFound_P",
    predictionCol="prediction",
    metricName="weightedPrecision",
)

# 5-fold cross-validation over the grid with a fixed seed of 42.
# NOTE(review): nothing in this call requests stratified folds -- confirm whether the folds
# need to be stratified on the label.
cv = CrossValidator(
    estimator=dt,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
    seed=42,
)

# Fit the cross-validator on the training data.
cvModel = cv.fit(train_df)

# Report the two tuned parameters of the best model, read from the wrapped Java model object.
best_tree_java = cvModel.bestModel._java_obj
print("Best maxDepth: ", best_tree_java.getMaxDepth())
print("Best minInstancesPerNode: ", best_tree_java.getMinInstancesPerNode())


Best maxDepth:  5
Best minInstancesPerNode:  5


In [7]:
# Score the test set with the cross-validated model.
predictions = cvModel.transform(test_df)

# Fix 1: each metric is requested through a per-call param override instead of setMetricName().
# The shared `evaluator` used to be left set to 'f1' at the end of this cell, so running the
# cell a second time printed the F1 value under the "Precision" label.
precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})
f1_score = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1_score)

# Fix 2: featureImportances has one entry per slot of the assembled vector, not one per input
# column. A one-hot encoded column occupies several slots, so zipping the importances with the
# `features` column list attached wrong names to every slot after the first vector-valued
# column and silently discarded the remaining slots.
# Slot names are read from the ML attribute metadata on the "features" column; a slot with no
# metadata entry is shown as 'slot_<index>'.
ml_attr_groups = predictions.schema["features"].metadata.get("ml_attr", {}).get("attrs", {})
slot_names = {
    attr["idx"]: attr.get("name", "slot_%d" % attr["idx"])
    for attr_group in ml_attr_groups.values()
    for attr in attr_group
}
slot_importances = cvModel.bestModel.featureImportances.toArray().tolist()
[(slot_names.get(slot, "slot_%d" % slot), weight) for slot, weight in enumerate(slot_importances)]

+----------+
|prediction|
+----------+
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
+----------+
only showing top 20 rows

Precision = 0.9501916389138795
Recall = 0.9474206349206349
F1 Score = 0.9239699406348035


[('WeekOfMonth', 0.0),
 ('WeekOfMonthClaimed', 0.0),
 ('Age', 0.0),
 ('RepNumber', 0.0),
 ('Deductible', 0.0),
 ('DriverRating', 0.0),
 ('Year', 0.0),
 ('Imputed_Age', 0.0),
 ('Month_indexed', 0.0),
 ('DayOfWeek_indexed', 0.0),
 ('Make_indexed', 0.0),
 ('AccidentArea_indexed', 0.0),
 ('DayOfWeekClaimed_indexed', 0.0),
 ('MonthClaimed_indexed', 0.0),
 ('Sex_indexed', 0.0),
 ('MaritalStatus_indexed', 0.0),
 ('Fault_indexed', 0.3540980997607374),
 ('PolicyType_indexed', 0.0),
 ('VehicleCategory_indexed', 0.0),
 ('VehiclePrice_indexed', 0.0),
 ('Days_Policy_Accident_indexed', 0.0),
 ('Days_Policy_Claim_indexed', 0.0),
 ('PastNumberOfClaims_indexed', 0.0),
 ('AgeOfVehicle_indexed', 0.0),
 ('AgeOfPolicyHolder_indexed', 0.17620110331104266),
 ('PoliceReportFiled_indexed', 0.0),
 ('WitnessPresent_indexed', 0.0),
 ('AgentType_indexed', 0.0),
 ('NumberOfSuppliments_indexed', 0.0),
 ('AddressChange_Claim_indexed', 0.2199345649033783),
 ('NumberOfCars_indexed', 0.0),
 ('BasePolicy_indexed', 0.2497