In [80]:
# Importing required modules
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.types import DoubleType
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorAssembler

print('Modules are imported.')

Modules are imported.


In [81]:
# Initializing Spark session
spark = SparkSession\
        .builder\
        .appName("Vehicle Insurance Claim Fraud Detection")\
        .getOrCreate()
print('Spark session is initialized.')

Spark session is initialized.


In [82]:
# Load the vehicle insurance claim fraud dataset from the csv file and create a dataframe
df = spark.read.format("csv").option("header", "true").load("../dataset/vehicle_insurance_claim_fraud_data.csv")
print('Spark dataframe is created.')

Spark dataframe is created.


In [83]:
# Adds a new column to the dataframe called 'Imputed_Age' that copies the 'Age' column value when it is not 0, 
# or the average age from the range given in 'AgeOfPolicyHolder' column value
df = df.withColumn("Imputed_Age", when(col("Age") == 0, (split(col("AgeOfPolicyHolder"), " ")[0].cast("int") + 
    split(col("AgeOfPolicyHolder"), " ")[0].cast("int")) / 2).otherwise(col("Age")))
print("'Imputed_Age' column is added to the Spark dataframe.")

'Imputed_Age' column is added to the Spark dataframe.


In [84]:
# Convert the columns having numerical values to Double type
numerical_cols = ['WeekOfMonth', 'WeekOfMonthClaimed', 'Age', 'Imputed_Age', 'FraudFound_P', 'RepNumber', 'Deductible', 
                  'DriverRating', 'Year']
for col in numerical_cols:
    df = df.withColumn(col, df[col].cast(DoubleType()))
    
print('Columns having numerical values to Double type.')

Columns having numerical values to Double type.


In [85]:
# Divide the dataset into training and test sets
training, test = df.randomSplit([0.8, 0.2], seed=42)
print('Dataset is divided into training and test sets with random split.')

Dataset is divided into training and test sets with random split.


In [86]:
# Create a list of the columns having categorical values
categorical_cols = ['Month', 'DayOfWeek', 'Make', 'AccidentArea', 'DayOfWeekClaimed', 'MonthClaimed', 'Sex', 'MaritalStatus', 
                    'Fault', 'PolicyType', 'VehicleCategory', 'VehiclePrice', 'Days_Policy_Accident', 'Days_Policy_Claim', 
                    'PastNumberOfClaims', 'AgeOfVehicle', 'AgeOfPolicyHolder', 'PoliceReportFiled', 'WitnessPresent', 
                    'AgentType', 'NumberOfSuppliments', 'AddressChange_Claim', 'NumberOfCars', 'BasePolicy']
print('A list of the columns having categorical values is created.')

A list of the columns having categorical values is created.


In [88]:
# Perform one-hot encoding for categorical features on the training set
indexers = [StringIndexer(inputCol=col, outputCol=col+"_indexed", handleInvalid="keep") for col in categorical_cols]
encoders = [OneHotEncoder(inputCol=col+"_indexed", outputCol=col+"_encoded") for col in categorical_cols]
pipeline = Pipeline(stages=indexers + encoders)
pipelineModel = pipeline.fit(training)
training_encoded = pipelineModel.transform(training).drop(*categorical_cols, *["PolicyNumber"])
test_encoded = pipelineModel.transform(test).drop(*categorical_cols, *["PolicyNumber"])
print('One-hot encoding is performed for the categorical features on the training set.')

One-hot encoding is performed for the categorical features on the training set.


In [89]:
# Extracting the features and removing the 'FraudFound_P' column
features = training_encoded.columns
features.remove('FraudFound_P')
print("'Features' are extracted.")

'Features' are extracted.


In [90]:
# Assembles the columns into a feature vector for the training and test sets
assembler = VectorAssembler(inputCols=features, outputCol="features", handleInvalid="keep")
train_df = assembler.transform(training_encoded).select("features", "FraudFound_P")
test_df = assembler.transform(test_encoded).select("features", "FraudFound_P")
print('Training and test sets are assembled.')

Training and test sets are assembled.


In [91]:
# Define the decision tree classifier
dt = DecisionTreeClassifier(labelCol="FraudFound_P", featuresCol="features")
print('Decision tree is defined.')

Decision tree is defined.


In [92]:
# Define the parameter grid for cross-validation
paramGrid = ParamGridBuilder() \
    .addGrid(dt.maxDepth, [2, 5, 10]) \
    .addGrid(dt.minInstancesPerNode, [1, 5, 10]) \
    .build()
print('Parameter grid for cross-validation is defined.')

Parameter grid for cross-validation is defined.


In [93]:
# Define the evaluator as precision for the classification
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="FraudFound_P", 
                      metricName="weightedPrecision")
print('Precision evaluator is defined.')

Precision evaluator is defined.


In [94]:
# Define the cross-validator with k-fold stratified sampling
cv = CrossValidator(estimator=dt, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5, seed=42)
print('Cross-validator with k-fold stratified sampling is defined.')

Cross-validator with k-fold stratified sampling is defined.


In [95]:
# Run the cross-validation on the training set
cvModel = cv.fit(train_df)
print('Stratified k-fold cross-validation is executed on the training set.')

Stratified k-fold cross-validation is executed on the training set.


In [96]:
# Print the best model's parameters
print("Best maxDepth: ", cvModel.bestModel._java_obj.getMaxDepth())
print("Best minInstancesPerNode: ", cvModel.bestModel._java_obj.getMinInstancesPerNode())
list(zip(features, cvModel.bestModel.featureImportances))

Best maxDepth:  5
Best minInstancesPerNode:  5


[('WeekOfMonth', 0.0),
 ('WeekOfMonthClaimed', 0.0),
 ('Age', 0.0),
 ('RepNumber', 0.0),
 ('Deductible', 0.0),
 ('DriverRating', 0.0),
 ('Year', 0.0),
 ('Imputed_Age', 0.0),
 ('Month_indexed', 0.0),
 ('DayOfWeek_indexed', 0.0),
 ('Make_indexed', 0.0),
 ('AccidentArea_indexed', 0.0),
 ('DayOfWeekClaimed_indexed', 0.0),
 ('MonthClaimed_indexed', 0.0),
 ('Sex_indexed', 0.0),
 ('MaritalStatus_indexed', 0.0),
 ('Fault_indexed', 0.3540980997607374),
 ('PolicyType_indexed', 0.0),
 ('VehicleCategory_indexed', 0.0),
 ('VehiclePrice_indexed', 0.0),
 ('Days_Policy_Accident_indexed', 0.0),
 ('Days_Policy_Claim_indexed', 0.0),
 ('PastNumberOfClaims_indexed', 0.0),
 ('AgeOfVehicle_indexed', 0.0),
 ('AgeOfPolicyHolder_indexed', 0.17620110331104266),
 ('PoliceReportFiled_indexed', 0.0),
 ('WitnessPresent_indexed', 0.0),
 ('AgentType_indexed', 0.0),
 ('NumberOfSuppliments_indexed', 0.0),
 ('AddressChange_Claim_indexed', 0.2199345649033783),
 ('NumberOfCars_indexed', 0.0),
 ('BasePolicy_indexed', 0.2497

In [97]:
# Get the predictions on the test set
predictions = cvModel.transform(test_df)
print('Predictions on the test set are obtained.')

Predictions on the test set are obtained.


In [98]:
# Precision metric
precision = evaluator.evaluate(predictions)
print("Precision = %s" % precision)

Precision = 0.9501916389138795


In [99]:
# Recall metric
evaluator.setMetricName('weightedRecall')
recall = evaluator.evaluate(predictions)
print("Recall = %s" % recall)

Recall = 0.9474206349206349


In [100]:
# F1 score metric
evaluator.setMetricName('f1')
f1_score = evaluator.evaluate(predictions)
print("F1 Score = %s" % f1_score)

F1 Score = 0.9239699406348035


In [101]:
# Accuracy metric
evaluator.setMetricName('accuracy')
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %s" % accuracy)

Accuracy = 0.9474206349206349
