In [23]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pprint
from sklearn.model_selection import GridSearchCV
import numpy as np

# scaler 
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

# models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import LinearSVC


# pipeline
from sklearn.pipeline import Pipeline 
from imblearn.pipeline import Pipeline as Pipeline_imb 

# resampling
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler 
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import CondensedNearestNeighbour

# model selection
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# scoring
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score, f1_score, roc_auc_score

import warnings
warnings.filterwarnings('ignore')

In [24]:
# 데이터 불러오기
socar_df = pd.read_csv("insurance_fraud_detect_data.csv")
pd.set_option('display.max_columns', len(socar_df.columns))
socar_df.head()

Unnamed: 0,fraud_YN,car_model,sharing_type,age_group,has_previous_accident,cumulative_use_count,b2b,accident_ratio,pf_type,socarpass,socarsave,start_hour,duration,accident_hour,repair_cost,insure_cost,accident_location,car_part1,car_part2,repair_cnt,acc_type1,insurance_site_aid_YN,police_site_aid_YN,total_prsn_cnt,test_set
0,0,2,1,3,0,2,0,100,2,0,0,4,2,4,130254.7,0,1,0,0,1,0,0,0,-1,0
1,0,1,0,4,0,4,0,0,3,1,0,1,4,4,0.0,0,1,1,1,2,1,1,1,0,1
2,0,1,0,4,1,4,0,100,3,1,0,1,4,6,258130.6,12925040,3,1,0,2,1,1,1,0,1
3,0,3,1,3,0,4,0,0,2,0,0,3,3,5,1341009.0,0,3,1,0,1,0,0,0,-1,0
4,0,1,0,1,0,1,0,100,1,0,0,1,1,1,524030.6,1209410,1,0,0,2,0,0,0,-1,0


In [25]:
# one-hot encoding
socar_df = pd.get_dummies(socar_df, columns=["car_model", "sharing_type", "start_hour", "duration", "accident_hour",\
                                        "accident_location", "acc_type1", "insurance_site_aid_YN"])

In [26]:
X = socar_df.iloc[:, 1:]
y = socar_df.iloc[:, 0]

In [27]:
# train_set, test_set 분리
train_set = socar_df[socar_df["test_set"] == 0]
test_set = socar_df[socar_df["test_set"] == 1]

# "test_set" 컬럼 삭제
train_set = train_set.drop("test_set", axis=1)
test_set = test_set.drop("test_set", axis=1)

# features와 target분리
X_features = train_set.drop("fraud_YN", axis=1)
y_target = train_set["fraud_YN"]
X_test = test_set.drop("fraud_YN", axis=1)
y_test = test_set["fraud_YN"]

In [28]:
# pipeline

lr_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ("clf", LogisticRegression(random_state=13))])

dt_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ("clf", DecisionTreeClassifier(random_state=13))])

rf_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ("clf", RandomForestClassifier(random_state=13))])

lgbm_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ("clf", LGBMClassifier(random_state=13))])

svm_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ("clf", LinearSVC(random_state=13))])



# grid search params
lr_param_grid = [{'clf__penalty': ['l1', 'l2', 'elasticnet', 'none']}]

dt_param_grid = [{'clf__criterion':['gini', 'entropy'], 'clf__max_depth':[None,2,3,4,5,6]}]

rf_param_grid = [{'clf__n_estimators': [1, 5, 10, 20],
                  'clf__max_depth' : [2, 3, 4, 5, 10, 50, 100]}]

lgbm_param_grid = [{'clf__n_estimators': [800, 1000, 1200], 'clf__num_leaves': [32, 64],\
                   'clf__max_depth': [128, 160], 'clf__min_child_samples': [60, 100]}]

svm_param_grid = [{"clf__penalty": ["l1", "l2"],
                    "clf__loss": ["hinge", "squared_hinge"]}]




# grid search CV
LR = GridSearchCV(lr_pipe, lr_param_grid, cv=5, scoring = 'recall')
DT = GridSearchCV(dt_pipe, dt_param_grid, cv=5, scoring = 'recall')
RF = GridSearchCV(rf_pipe, rf_param_grid, cv=5, scoring = 'recall')
LGBM = GridSearchCV(lgbm_pipe, lgbm_param_grid, cv=5, scoring = 'recall')
SVM = GridSearchCV(svm_pipe, svm_param_grid, cv=5, scoring = 'recall')


In [29]:
# 샘플링 RandomOverSampler

over_sampling = RandomOverSampler(random_state=13)
X_train_over, y_train_over = over_sampling.fit_sample(X_features, y_target)

In [30]:
X_features.shape, y_target.shape

((12879, 53), (12879,))

In [31]:
X_train_over.shape, y_train_over.shape

((25690, 53), (25690,))

In [32]:
grids = []
grids.append(('LogisticRe', LR))
grids.append(('DecisionTree', DT))
grids.append(('RandomForest', RF))
grids.append(('LightGBM', LGBM))
grids.append(('SVC', SVM))

              
train_accuracy = []
train_recall = []
train_precision = []
train_f1 = []
train_roc = []
test_accuracy = []
test_recall = []
test_precision = []
test_f1 = []
test_roc = []
names = []
    
for name, grid in grids:
    clf = grid
    clf.fit(X_train_over, y_train_over)
    
    y_pred_train = clf.predict(X_train_over)
    y_pred_test = clf.predict(X_test)

    names.append(name)
    train_accuracy.append(accuracy_score(y_train_over, y_pred_train))
    train_recall.append(recall_score(y_train_over, y_pred_train))
    train_precision.append(precision_score(y_train_over, y_pred_train))
    train_f1.append(f1_score(y_train_over, y_pred_train))
    train_roc.append(roc_auc_score(y_train_over, y_pred_train))
    
    test_accuracy.append(accuracy_score(y_test, y_pred_test))
    test_recall.append(recall_score(y_test, y_pred_test))
    test_precision.append(precision_score(y_test, y_pred_test))
    test_f1.append(f1_score(y_test, y_pred_test))
    test_roc.append(roc_auc_score(y_test, y_pred_test))

result = pd.DataFrame({'model name':names, 
                       'train accuracy':train_accuracy,
                       'train precision':train_precision,
                       'train recall':train_recall,
                       'train f1':train_f1,
                       'train roc':train_roc,
                       'test accuracy':test_accuracy,
                       'test precision':test_precision,
                       'test recall':test_recall,
                       'test f1':test_f1,
                       'test roc':test_roc}) 
result

Unnamed: 0,model name,train accuracy,train precision,train recall,train f1,train roc,test accuracy,test precision,test recall,test f1,test roc
0,LogisticRe,0.820125,0.78671,0.878396,0.830029,0.820125,0.669016,0.003868,0.571429,0.007685,0.620332
1,DecisionTree,1.0,1.0,1.0,1.0,1.0,0.995514,0.0,0.0,0.0,0.498876
2,RandomForest,0.922577,0.865916,1.0,0.92814,0.922577,0.805191,0.0,0.0,0.0,0.4035
3,LightGBM,1.0,1.0,1.0,1.0,1.0,0.997757,0.0,0.0,0.0,0.5
4,SVC,0.84204,0.786726,0.938497,0.855936,0.84204,0.673822,0.00295,0.428571,0.005859,0.551473
