In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from imblearn.over_sampling import SMOTENC
from interpret import set_visualize_provider
from interpret import show
from interpret.glassbox import ExplainableBoostingClassifier
from interpret.provider import InlineProvider
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

from utils2 import *
from stat_utils import *

# Notebook-wide runtime configuration.
# NOTE(review): the blanket 'ignore' filter hides every warning, including
# deprecation and convergence warnings from the modelling libraries.
warnings.filterwarnings('ignore')

# pandas: opt in to the future no-silent-downcasting behaviour and show every
# column when a frame is displayed.
for _option_name, _option_value in (
    ('future.no_silent_downcasting', True),
    ('display.max_columns', None),
):
    pd.set_option(_option_name, _option_value)

# Route `interpret` visualisations through the inline provider.
set_visualize_provider(InlineProvider())


In [2]:
# Raw claims table; every later cell derives its data from this frame.
df = pd.read_csv(filepath_or_buffer='data/fraud_oracle.csv')

In [3]:
# Collapse fine-grained categories into coarser buckets used as model features.

# Any recorded address change, regardless of how long ago, becomes "change".
address_change_mapping = {
    '1 year' : "change",
    'no change' : "no_change",
    '4 to 8 years' : "change",
    '2 to 3 years' : "change",
    'under 6 months' : "change"
}
# Six vehicle price bands are merged pairwise into three.
# Note: despite its name, the derived 'VehiclePrice_num' column holds these
# string labels, not numbers.
vehicle_price_mapping = {
    'less than 20000': "less than 30000",
    '20000 to 29000': "less than 30000",
    '30000 to 39000': "30000 to 59000",
    '40000 to 59000': "30000 to 59000",
    '60000 to 69000': "more than 60000",
    'more than 69000': "more than 60000"
}

# `Series.map` silently yields NaN for any value that is not a key of the dict.
# Fail here, with the offending labels, instead of letting NaN categories reach
# the oversampling / modelling cells.
for _source_col, _derived_col, _mapping in (
    ('AddressChange_Claim', 'AddressChange_Claim_2', address_change_mapping),
    ('VehiclePrice', 'VehiclePrice_num', vehicle_price_mapping),
):
    _mapped = df[_source_col].map(_mapping)
    _unmapped = df.loc[_mapped.isna() & df[_source_col].notna(), _source_col].unique()
    if len(_unmapped) > 0:
        raise ValueError(
            f"Unmapped categories in '{_source_col}': {sorted(map(str, _unmapped))}"
        )
    df[_derived_col] = _mapped

In [4]:
# Feature groups, named by what they describe. The modelling cells concatenate
# the vehicle, personal, policy and accident groups into the feature matrix.
time_vars = [
    "Month",
    "WeekOfMonth",
    "DayOfWeek",
    "DayOfWeekClaimed",
    "MonthClaimed",
    "WeekOfMonthClaimed",
]
vehicle_vars = [
    "Make",
    "VehiclePrice_num",
    "VehicleCategory",
    "AgeOfVehicle",
]
personal_vars = [
    "Sex",
    "MaritalStatus",
    "Age",
    "DriverRating",
    "AgeOfPolicyHolder",
    "NumberOfCars",
    "PastNumberOfClaims",
]
policy_vars = [
    "PolicyType",
    "Deductible",
    "AgentType",
    "NumberOfSuppliments",
]
accident_vars = [
    "Days_Policy_Accident",
    "Days_Policy_Claim",
    "PoliceReportFiled",
    "WitnessPresent",
    "AddressChange_Claim_2",
]

In [5]:
# One sub-frame per base policy; each policy type is handled separately below.
_base_policy = df['BasePolicy']
collision = df.loc[_base_policy.eq("Collision")]
liability = df.loc[_base_policy.eq("Liability")]
all_p = df.loc[_base_policy.eq("All Perils")]

In [6]:
# Shared modelling constants: the global random seed and the label column.
seed = 42
target_col = "FraudFound_P"

# Seed NumPy's legacy global RNG so stochastic steps repeat across runs.
np.random.seed(seed)

### Oversampling

In [7]:
# Redundant re-assignment: `target_col` already holds this value from the setup cell above.
target_col = "FraudFound_P"

In [8]:
# Oversample class 1 of the All Perils subset with SMOTENC (SMOTE for mixed
# numeric / categorical features). Class 0 keeps its original size.
feature_cols = vehicle_vars + personal_vars + policy_vars + accident_vars
X = all_p[feature_cols]
y = all_p[target_col].astype(int)

# SMOTENC needs the positional indices of the categorical (object-dtype) columns.
categorical_features = [X.columns.get_loc(col) for col in X.select_dtypes(include=['object']).columns]

# Target 1500 rows for class 1, but never ask for fewer rows than already
# exist: an over-sampler raises ValueError if a requested class size is below
# the current one.
n_class0 = int((y == 0).sum())
n_class1_target = max(1500, int((y == 1).sum()))

smote_nc = SMOTENC(
    categorical_features=categorical_features,
    random_state=seed,  # reuse the notebook-wide seed instead of a second literal
    sampling_strategy={0: n_class0, 1: n_class1_target},
)

# Apply SMOTENC on the raw value array (the mixed-type frame becomes an object
# array), then rebuild a labelled DataFrame from the result.
X_resampled, y_resampled = smote_nc.fit_resample(X.values, y)

X_resampled = pd.DataFrame(X_resampled, columns=feature_cols)

In [9]:
# Columns that were originally integer-typed come back from SMOTENC as floats.
# This is because SMOTE internally uses interpolation while oversampling the
# data set.
int_columns = all_p.select_dtypes(include=['int64', 'int32']).columns

for i in int_columns:
    if i in X_resampled.columns:
        # Round to the nearest integer before casting. A bare `.astype(int)`
        # truncates toward zero, which shifts every synthetic value downward.
        X_resampled[i] = X_resampled[i].astype(float).round().astype(int)

In [10]:
# Re-attach the resampled labels to the resampled features, side by side.
all_p_resampled = pd.concat(
    [X_resampled, y_resampled],
    axis="columns",
)

In [None]:
# Class balance after oversampling.
all_p_resampled["FraudFound_P"].value_counts()

In [12]:
# Fit and evaluate the All Perils EBM.
#
# The 20% hold-out is taken from the ORIGINAL All Perils claims *before* any
# oversampling, and SMOTENC is then fitted on the training fold only. Splitting
# the already-oversampled `all_p_resampled` frame instead lets synthetic rows
# interpolated from hold-out claims end up in the training set (and synthetic
# rows end up in the hold-out), which inflates the reported metrics. For that
# reason `all_p_resampled` is intentionally not used for fitting here.
feature_cols = vehicle_vars + personal_vars + policy_vars + accident_vars
X = all_p[feature_cols]
y = all_p["FraudFound_P"].astype(int)

seed = 42
np.random.seed(seed)
test_size = 0.20
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=test_size, random_state=seed
)

# Oversample class 1 inside the training fold only. The notebook's target of
# 1500 class-1 rows for the whole subset is scaled by the training share so the
# class ratio seen by the model stays comparable; it is never set below the
# number of rows already present (an over-sampler rejects that).
train_categorical_idx = [
    X.columns.get_loc(col) for col in X.select_dtypes(include=['object']).columns
]
n_class0_train = int((y_train_raw == 0).sum())
n_class1_train = int((y_train_raw == 1).sum())
n_class1_target = max(round(1500 * (1 - test_size)), n_class1_train)

train_smote = SMOTENC(
    categorical_features=train_categorical_idx,
    random_state=seed,
    sampling_strategy={0: n_class0_train, 1: n_class1_target},
)
X_train_values, y_train = train_smote.fit_resample(X_train_raw.values, y_train_raw)
X_train = pd.DataFrame(X_train_values, columns=feature_cols)

# SMOTE interpolates numeric features, so originally-integer columns come back
# as floats; round to the nearest integer and restore the integer dtype.
for _int_col in X_train_raw.select_dtypes(include=['int64', 'int32']).columns:
    X_train[_int_col] = X_train[_int_col].astype(float).round().astype(int)

# Main-effects-only EBM (no pairwise interaction terms).
ebm = ExplainableBoostingClassifier(interactions=False)
ebm.fit(X_train, y_train)

# Score on the untouched hold-out of real claims.
y_pred = ebm.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

Create predictions

In [None]:
# Show the text classification report computed on the All Perils hold-out split.
print(report)

In [None]:
# Render the global explanation of the All Perils EBM.
show(ebm.explain_global())

In [15]:
# Tabulate the All Perils EBM's global explanation and tag it with its policy type.
all_p_global_df = pd.DataFrame(ebm.explain_global().data()).assign(
    policy_type="All Perils"
)

# get predictions for every original (non-resampled) All Perils claim
X = all_p[vehicle_vars + personal_vars + policy_vars + accident_vars]
y = all_p["FraudFound_P"].astype(int)
y_pred = ebm.predict(X)

# 99 is a placeholder meaning "no prediction yet"; rows of policy types that
# are never scored keep it.
df['predictions'] = 99
_is_all_perils = df['BasePolicy'].eq("All Perils")
df.loc[_is_all_perils, 'predictions'] = list(y_pred)

In [None]:
# Shape of the rows that are labelled class 1 and whose prediction matches the label.
_labelled_fraud = df['FraudFound_P'].eq(1)
_prediction_matches = df['FraudFound_P'].eq(df['predictions'])
df.loc[_labelled_fraud & _prediction_matches].shape

In [None]:
# Persist the fitted All Perils EBM as JSON.
ebm.to_json('powerBI/all_perils_ebm.json')

# local explainer: explain the first five hold-out rows.
# The trailing 0 is passed as show()'s second positional argument; presumably
# it selects which of the five rows is displayed first (TODO confirm).
_local_X = X_test[:5]
_local_y = y_test[:5]
show(ebm.explain_local(_local_X, _local_y), 0)


In [19]:
import json

# Read the exported All Perils model back in as plain Python objects.
with open('powerBI/all_perils_ebm.json', 'r') as json_file:
    all_perils_ebm = json.load(json_file)

In [None]:
# Inspect which keys sit under the top-level 'ebm' entry of the exported JSON.
all_perils_ebm['ebm'].keys()

# Collision modeling

In [28]:
# Oversample class 1 of the Collision subset with SMOTENC (SMOTE for mixed
# numeric / categorical features). Class 0 keeps its original size.
feature_cols = vehicle_vars + personal_vars + policy_vars + accident_vars
X = collision[feature_cols]
y = collision[target_col].astype(int)

# SMOTENC needs the positional indices of the categorical (object-dtype) columns.
categorical_features = [X.columns.get_loc(col) for col in X.select_dtypes(include=['object']).columns]

# Target 1500 rows for class 1, but never ask for fewer rows than already
# exist: an over-sampler raises ValueError if a requested class size is below
# the current one.
n_class0 = int((y == 0).sum())
n_class1_target = max(1500, int((y == 1).sum()))

smote_nc = SMOTENC(
    categorical_features=categorical_features,
    random_state=seed,  # reuse the notebook-wide seed instead of a second literal
    sampling_strategy={0: n_class0, 1: n_class1_target},
)

# Apply SMOTENC on the raw value array (the mixed-type frame becomes an object
# array), then rebuild a labelled DataFrame from the result.
X_resampled, y_resampled = smote_nc.fit_resample(X.values, y)

X_resampled = pd.DataFrame(X_resampled, columns=feature_cols)

In [29]:
# SMOTE interpolates numeric features, so columns that are integer-typed in the
# original Collision frame need their integer dtype restored after resampling.
int_columns = collision.select_dtypes(include=['int64', 'int32']).columns

for i in int_columns:
    if i in X_resampled.columns:
        # Round to the nearest integer before casting. A bare `.astype(int)`
        # truncates toward zero, which shifts every synthetic value downward.
        X_resampled[i] = X_resampled[i].astype(float).round().astype(int)

In [30]:
# Re-attach the resampled labels to the resampled features, side by side.
collision_resampled = pd.concat(
    [X_resampled, y_resampled],
    axis="columns",
)

In [None]:
# Class balance after oversampling.
collision_resampled["FraudFound_P"].value_counts()

In [None]:
# Fit and evaluate the Collision EBM.
#
# The 20% hold-out is taken from the ORIGINAL Collision claims *before* any
# oversampling, and SMOTENC is then fitted on the training fold only. Splitting
# the already-oversampled `collision_resampled` frame instead lets synthetic
# rows interpolated from hold-out claims end up in the training set (and
# synthetic rows end up in the hold-out), which inflates the reported metrics.
# For that reason `collision_resampled` is intentionally not used for fitting.
feature_cols = vehicle_vars + personal_vars + policy_vars + accident_vars
X = collision[feature_cols]
y = collision["FraudFound_P"].astype(int)

seed = 42
np.random.seed(seed)
test_size = 0.20
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=test_size, random_state=seed
)

# Oversample class 1 inside the training fold only. The notebook's target of
# 1500 class-1 rows for the whole subset is scaled by the training share so the
# class ratio seen by the model stays comparable; it is never set below the
# number of rows already present (an over-sampler rejects that).
train_categorical_idx = [
    X.columns.get_loc(col) for col in X.select_dtypes(include=['object']).columns
]
n_class0_train = int((y_train_raw == 0).sum())
n_class1_train = int((y_train_raw == 1).sum())
n_class1_target = max(round(1500 * (1 - test_size)), n_class1_train)

train_smote = SMOTENC(
    categorical_features=train_categorical_idx,
    random_state=seed,
    sampling_strategy={0: n_class0_train, 1: n_class1_target},
)
X_train_values, y_train = train_smote.fit_resample(X_train_raw.values, y_train_raw)
X_train = pd.DataFrame(X_train_values, columns=feature_cols)

# SMOTE interpolates numeric features, so originally-integer columns come back
# as floats; round to the nearest integer and restore the integer dtype.
for _int_col in X_train_raw.select_dtypes(include=['int64', 'int32']).columns:
    X_train[_int_col] = X_train[_int_col].astype(float).round().astype(int)

# Main-effects-only EBM (no pairwise interaction terms).
ebm = ExplainableBoostingClassifier(interactions=False)
ebm.fit(X_train, y_train)

# Score on the untouched hold-out of real claims.
y_pred = ebm.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(report)

In [None]:
# Render the global explanation of the Collision EBM (`ebm` was re-fitted on Collision data above).
show(ebm.explain_global())

In [34]:
# Persist the fitted Collision EBM as JSON, next to the All Perils export.
ebm.to_json('powerBI/collision_ebm.json')

In [35]:
# Tabulate the Collision EBM's global explanation and tag it with its policy type.
collision_global_df = pd.DataFrame(ebm.explain_global().data()).assign(
    policy_type="Collision"
)

# get predictions for every original (non-resampled) Collision claim
X = collision[vehicle_vars + personal_vars + policy_vars + accident_vars]
y = collision["FraudFound_P"].astype(int)
y_pred = ebm.predict(X)

# Write into the existing 'predictions' column, which was created (with the
# placeholder 99) when the All Perils predictions were stored.
_is_collision = df['BasePolicy'].eq("Collision")
df.loc[_is_collision, 'predictions'] = list(y_pred)

In [36]:
# Export the full claims table with the 'predictions' column. Only All Perils
# and Collision rows were scored; every other row still carries the placeholder 99.
df.to_csv("powerBI/raw_data_with_predictions.csv", index=False)

In [37]:
# Stack the per-policy global-explanation tables and export them as one CSV.
_global_frames = [all_p_global_df, collision_global_df]
global_df = pd.concat(_global_frames, axis="index")
global_df.to_csv("powerBI/global_df.csv", index=False)

In [None]:
# Rows that are labelled class 1 and whose prediction matches the label.
_labelled_fraud = df['FraudFound_P'].eq(1)
_prediction_matches = df['FraudFound_P'].eq(df['predictions'])
df.loc[_labelled_fraud & _prediction_matches]