In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.types import DoubleType

spark = SparkSession\
        .builder\
        .appName("Vehicle Insurance Claim Fraud Detection")\
        .getOrCreate()

# Load the dataset of vehicle insurance claim fraud
df = spark.read.format("csv").option("header", "true").load("../dataset/vehicle_insurance_claim_fraud_data.csv")

# Split the AgeOfPolicyHolder column into two columns, representing the minimum and maximum age values
df = df.withColumn("AgeOfPolicyHolder_Min", split(col("AgeOfPolicyHolder"), " ")[0].cast("int"))
df = df.withColumn("AgeOfPolicyHolder_Max", split(col("AgeOfPolicyHolder"), " ")[2].cast("int"))

# Compute the average age range for each row in the AgeOfPolicyHolder column
avg_age_range = df.select(avg((col("AgeOfPolicyHolder_Min") + col("AgeOfPolicyHolder_Max")) / 2)).collect()[0][0]

# Add a new column that copies the Age column value when it is not 0, or the average age range otherwise
df = df.withColumn("New_Age", when(col("Age") == 0, avg_age_range).otherwise(col("Age")))

# Convert the columns having numerical values to Double type
numerical_cols = ['WeekOfMonth', 'WeekOfMonthClaimed', 'Age', 'New_Age', 'FraudFound_P', 'RepNumber', 'Deductible', 'DriverRating', 'Year']
for col in numerical_cols:
    df = df.withColumn(col, df[col].cast(DoubleType()))

# Divide the dataset into training and test sets
training, test = df.randomSplit([0.8, 0.2], seed=42)

# Perform one-hot encoding for categorical features on the training set
categorical_cols = ['Month', 'DayOfWeek', 'Make', 'AccidentArea', 'DayOfWeekClaimed',
       'MonthClaimed', 'Sex', 'MaritalStatus', 'Fault', 'PolicyType',
       'VehicleCategory', 'VehiclePrice', 'Days_Policy_Accident',
       'Days_Policy_Claim', 'PastNumberOfClaims', 'AgeOfVehicle',
       'AgeOfPolicyHolder', 'PoliceReportFiled', 'WitnessPresent', 'AgentType',
       'NumberOfSuppliments', 'AddressChange_Claim', 'NumberOfCars',
       'BasePolicy']
indexers = [StringIndexer(inputCol=col, outputCol=col+"_indexed", handleInvalid="keep") for col in categorical_cols]
encoders = [OneHotEncoder(inputCol=col+"_indexed", outputCol=col+"_encoded") for col in categorical_cols]
pipeline = Pipeline(stages=indexers + encoders)
pipelineModel = pipeline.fit(training)
training_encoded = pipelineModel.transform(training).drop(*categorical_cols, *["PolicyNumber"])
test_encoded = pipelineModel.transform(test).drop(*categorical_cols, *["PolicyNumber"])
print('Data is processed.')
training_encoded.show(n=1)

Data is processed.
+-----------+------------------+----+------------+---------+----------+------------+------+---------------------+---------------------+-------+-------------+-----------------+------------+--------------------+------------------------+--------------------+-----------+---------------------+-------------+------------------+-----------------------+--------------------+----------------------------+-------------------------+--------------------------+--------------------+-------------------------+-------------------------+----------------------+-----------------+---------------------------+---------------------------+--------------------+------------------+--------------+-----------------+--------------+--------------------+------------------------+--------------------+-------------+---------------------+-------------+------------------+-----------------------+--------------------+----------------------------+-------------------------+--------------------------+-----------

In [16]:
training_encoded.columns

['WeekOfMonth',
 'WeekOfMonthClaimed',
 'Age',
 'FraudFound_P',
 'RepNumber',
 'Deductible',
 'DriverRating',
 'Year',
 'AgeOfPolicyHolder_Min',
 'AgeOfPolicyHolder_Max',
 'New_Age',
 'Month_indexed',
 'DayOfWeek_indexed',
 'Make_indexed',
 'AccidentArea_indexed',
 'DayOfWeekClaimed_indexed',
 'MonthClaimed_indexed',
 'Sex_indexed',
 'MaritalStatus_indexed',
 'Fault_indexed',
 'PolicyType_indexed',
 'VehicleCategory_indexed',
 'VehiclePrice_indexed',
 'Days_Policy_Accident_indexed',
 'Days_Policy_Claim_indexed',
 'PastNumberOfClaims_indexed',
 'AgeOfVehicle_indexed',
 'AgeOfPolicyHolder_indexed',
 'PoliceReportFiled_indexed',
 'WitnessPresent_indexed',
 'AgentType_indexed',
 'NumberOfSuppliments_indexed',
 'AddressChange_Claim_indexed',
 'NumberOfCars_indexed',
 'BasePolicy_indexed',
 'Month_encoded',
 'DayOfWeek_encoded',
 'Make_encoded',
 'AccidentArea_encoded',
 'DayOfWeekClaimed_encoded',
 'MonthClaimed_encoded',
 'Sex_encoded',
 'MaritalStatus_encoded',
 'Fault_encoded',
 'PolicyT

In [18]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorAssembler

features = ['WeekOfMonth',
 'WeekOfMonthClaimed',
 'Age',
 'RepNumber',
 'Deductible',
 'DriverRating',
 'Year',
 'AgeOfPolicyHolder_Min',
 'AgeOfPolicyHolder_Max',
 'New_Age',
 'Month_indexed',
 'DayOfWeek_indexed',
 'Make_indexed',
 'AccidentArea_indexed',
 'DayOfWeekClaimed_indexed',
 'MonthClaimed_indexed',
 'Sex_indexed',
 'MaritalStatus_indexed',
 'Fault_indexed',
 'PolicyType_indexed',
 'VehicleCategory_indexed',
 'VehiclePrice_indexed',
 'Days_Policy_Accident_indexed',
 'Days_Policy_Claim_indexed',
 'PastNumberOfClaims_indexed',
 'AgeOfVehicle_indexed',
 'AgeOfPolicyHolder_indexed',
 'PoliceReportFiled_indexed',
 'WitnessPresent_indexed',
 'AgentType_indexed',
 'NumberOfSuppliments_indexed',
 'AddressChange_Claim_indexed',
 'NumberOfCars_indexed',
 'BasePolicy_indexed',
 'Month_encoded',
 'DayOfWeek_encoded',
 'Make_encoded',
 'AccidentArea_encoded',
 'DayOfWeekClaimed_encoded',
 'MonthClaimed_encoded',
 'Sex_encoded',
 'MaritalStatus_encoded',
 'Fault_encoded',
 'PolicyType_encoded',
 'VehicleCategory_encoded',
 'VehiclePrice_encoded',
 'Days_Policy_Accident_encoded',
 'Days_Policy_Claim_encoded',
 'PastNumberOfClaims_encoded',
 'AgeOfVehicle_encoded',
 'AgeOfPolicyHolder_encoded',
 'PoliceReportFiled_encoded',
 'WitnessPresent_encoded',
 'AgentType_encoded',
 'NumberOfSuppliments_encoded',
 'AddressChange_Claim_encoded',
 'NumberOfCars_encoded',
 'BasePolicy_encoded']

assembler = VectorAssembler(inputCols=features, outputCol="features", handleInvalid="keep")
train_df = assembler.transform(training_encoded).select("features", "FraudFound_P")

# Define the decision tree classifier
dt = DecisionTreeClassifier(labelCol="FraudFound_P", featuresCol="features")

# Define the parameter grid for cross-validation
paramGrid = ParamGridBuilder() \
    .addGrid(dt.maxDepth, [2, 5, 10]) \
    .addGrid(dt.minInstancesPerNode, [1, 5, 10]) \
    .build()

# Define the evaluator for binary classification
#evaluator = BinaryClassificationEvaluator(labelCol="FraudFound_P")
precision_evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="FraudFound_P", 
                      metricName="weightedPrecision")

# Define the cross-validator with k-fold stratified sampling
cv = CrossValidator(estimator=dt, estimatorParamMaps=paramGrid, evaluator=precision_evaluator, numFolds=5, seed=42)

# Run the cross-validation on the dataset
cvModel = cv.fit(train_df)

# Print the best model's parameters
print("Best maxDepth: ", cvModel.bestModel._java_obj.getMaxDepth())
print("Best minInstancesPerNode: ", cvModel.bestModel._java_obj.getMinInstancesPerNode())


Best maxDepth:  5
Best minInstancesPerNode:  5


In [38]:
#train_df = assembler.transform(training_encoded).select("features", "FraudFound_P")
test_df = assembler.transform(test_encoded).select("features", "FraudFound_P")
print(precision_evaluator.evaluate(cvModel.transform(train_df)))
print(precision_evaluator.evaluate(cvModel.transform(test_df)))
list(zip(features, cvModel.bestModel.featureImportances))

0.9446570694569505
0.9501916389138795


[('WeekOfMonth', 0.0),
 ('WeekOfMonthClaimed', 0.0),
 ('Age', 0.0),
 ('RepNumber', 0.0),
 ('Deductible', 0.0),
 ('DriverRating', 0.0),
 ('Year', 0.0),
 ('AgeOfPolicyHolder_Min', 0.0),
 ('AgeOfPolicyHolder_Max', 0.0),
 ('New_Age', 0.0),
 ('Month_indexed', 0.0),
 ('DayOfWeek_indexed', 0.0),
 ('Make_indexed', 0.0),
 ('AccidentArea_indexed', 0.0),
 ('DayOfWeekClaimed_indexed', 0.0),
 ('MonthClaimed_indexed', 0.0),
 ('Sex_indexed', 0.0),
 ('MaritalStatus_indexed', 0.0),
 ('Fault_indexed', 0.3540980997607375),
 ('PolicyType_indexed', 0.0),
 ('VehicleCategory_indexed', 0.0),
 ('VehiclePrice_indexed', 0.0),
 ('Days_Policy_Accident_indexed', 0.0),
 ('Days_Policy_Claim_indexed', 0.0),
 ('PastNumberOfClaims_indexed', 0.0),
 ('AgeOfVehicle_indexed', 0.0),
 ('AgeOfPolicyHolder_indexed', 0.1762011033110427),
 ('PoliceReportFiled_indexed', 0.0),
 ('WitnessPresent_indexed', 0.0),
 ('AgentType_indexed', 0.0),
 ('NumberOfSuppliments_indexed', 0.0),
 ('AddressChange_Claim_indexed', 0.21993456490337832),


In [33]:
from pyspark.mllib.evaluation import MulticlassMetrics

predictionAndLabels = test_encoded.rdd.map(lambda lp: (float(cvModel.predict(lp.features)), lp.FraudFound_P))
# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)

# Overall statistics
precision = metrics.precision()
recall = metrics.recall()
f1Score = metrics.fMeasure()
print("Summary Stats")
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)

Traceback (most recent call last):
  File "C:\Users\Shefali Upadhyaya\AppData\Local\Programs\Python\Python310\lib\site-packages\pyspark\serializers.py", line 458, in dumps
    return cloudpickle.dumps(obj, pickle_protocol)
  File "C:\Users\Shefali Upadhyaya\AppData\Local\Programs\Python\Python310\lib\site-packages\pyspark\cloudpickle\cloudpickle_fast.py", line 73, in dumps
    cp.dump(obj)
  File "C:\Users\Shefali Upadhyaya\AppData\Local\Programs\Python\Python310\lib\site-packages\pyspark\cloudpickle\cloudpickle_fast.py", line 602, in dump
    return Pickler.dump(self, obj)
TypeError: cannot pickle '_thread.RLock' object


PicklingError: Could not serialize object: TypeError: cannot pickle '_thread.RLock' object