In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pprint
from sklearn.model_selection import GridSearchCV
import numpy as np

# scaler 
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer

# models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import LinearSVC


# pipeline
from sklearn.pipeline import Pipeline 
from imblearn.pipeline import Pipeline as Pipeline_imb 

# resampling
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler 
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import CondensedNearestNeighbour

# model selection
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# scoring
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score, f1_score, roc_auc_score

import warnings
warnings.filterwarnings('ignore')

In [11]:
# 데이터 가지고오기
socar_df = pd.read_csv("../../0. raw_data/insurance_fraud_detect_data.csv")
pd.set_option('display.max_columns', len(socar_df.columns))

In [12]:
socar_df.columns

Index(['fraud_YN', 'car_model', 'sharing_type', 'age_group',
       'has_previous_accident', 'cumulative_use_count', 'b2b',
       'accident_ratio', 'pf_type', 'socarpass', 'socarsave', 'start_hour',
       'duration', 'accident_hour', 'repair_cost', 'insure_cost',
       'accident_location', 'car_part1', 'car_part2', 'repair_cnt',
       'acc_type1', 'insurance_site_aid_YN', 'police_site_aid_YN',
       'total_prsn_cnt', 'test_set'],
      dtype='object')

In [13]:
# one-hot encoding
# socar_df.drop(['sharing_type', 'has_previous_accident', 'socarpass', 'car_part2'], axis =1, inplace=True)
socar_df = pd.get_dummies(socar_df, columns=['car_model', 'age_group','has_previous_accident', 'cumulative_use_count',
                                             'accident_ratio','pf_type', 'start_hour', 'duration','accident_hour',
                                             'accident_location', 'acc_type1', 'insurance_site_aid_YN', 'total_prsn_cnt'])

In [14]:
# train_set, test_set 분리
train_set = socar_df[socar_df["test_set"] == 0]
test_set = socar_df[socar_df["test_set"] == 1]

# "test_set" 컬럼 삭제
train_set = train_set.drop("test_set", axis=1)
test_set = test_set.drop("test_set", axis=1)

# features와 target분리
X = train_set.drop("fraud_YN", axis=1)
y = train_set["fraud_YN"]
X_test = test_set.drop("fraud_YN", axis=1)
y_test = test_set["fraud_YN"]

In [6]:
# pipeline

lr_pipe = Pipeline([
    ("scaler", FunctionTransformer(np.log1p)),
    ("clf", LogisticRegression(random_state=13))])

dt_pipe = Pipeline([
    ("scaler",   FunctionTransformer(np.log1p)),
    ("clf", DecisionTreeClassifier(random_state=13))])

rf_pipe = Pipeline([
    ("scaler",  FunctionTransformer(np.log1p)),
    ("clf", RandomForestClassifier(random_state=13))])

lgbm_pipe = Pipeline([
    ("scaler",  FunctionTransformer(np.log1p)),
    ("clf", LGBMClassifier(random_state=13))])

svm_pipe = Pipeline([
    ("scaler",  FunctionTransformer(np.log1p)),
    ("clf", LinearSVC(random_state=13))])

# grid search params
lr_param_grid = [{'clf__penalty': ['l2', 'elasticnet', 'none']}]

dt_param_grid = [{'clf__criterion':['gini', 'entropy'], 'clf__max_depth':[None,2,3,4,5,6]}]

rf_param_grid = [{'clf__n_estimators': [5, 10, 20],
                  'clf__max_depth' : [2, 3, 4, 5, 10, 50, 100]}]

lgbm_param_grid = [{'clf__n_estimators': [800, 1000, 1200], 'clf__num_leaves': [32, 64],\
                   'clf__max_depth': [128, 160], 'clf__min_child_samples': [60, 100]}]

svm_param_grid = [{"clf__penalty": ["l1", "l2"],
                    "clf__loss": ["hinge", "squared_hinge"]}]

# straitified KFold
skfold = StratifiedKFold(n_splits=3)

# grid search CV
LR = GridSearchCV(lr_pipe, lr_param_grid, cv=skfold, scoring = 'recall')
DT = GridSearchCV(dt_pipe, dt_param_grid, cv=skfold, scoring = 'recall')
RF = GridSearchCV(rf_pipe, rf_param_grid, cv=skfold, scoring = 'recall')
LGBM = GridSearchCV(lgbm_pipe, lgbm_param_grid, cv=skfold, scoring = 'recall')
SVM = GridSearchCV(svm_pipe, svm_param_grid, cv=skfold, scoring = 'recall')

In [24]:
# 샘플링 RandomOverSampler
over_sampling = RandomOverSampler(random_state=13)
X_train_over, y_train_over = over_sampling.fit_sample(X, y)

In [32]:
def performace(X_train_over, y_train_over, X_test, y_test):
    grids = []
    grids.append(('LogisticRe', LR))
    grids.append(('DecisionTree', DT))
    grids.append(('RandomForest', RF))
    grids.append(('LightGBM', LGBM))
    grids.append(('SVC', SVM))


    train_accuracy = []
    train_recall = []
#     train_precision = []
#     train_f1 = []
#     train_roc = []
    test_accuracy = []
    test_recall = []
#     test_precision = []
#     test_f1 = []
#     test_roc = []
    names = []

    for name, grid in grids:
        clf = grid
        clf.fit(X_train_over, y_train_over)

        y_pred_train = clf.predict(X_train_over)
        y_pred_test = clf.predict(X_test)

        names.append(name)
        train_accuracy.append(accuracy_score(y_train_over, y_pred_train))
        train_recall.append(recall_score(y_train_over, y_pred_train))
#         train_precision.append(precision_score(y_train_over, y_pred_train))
#         train_f1.append(f1_score(y_train_over, y_pred_train))
#         train_roc.append(roc_auc_score(y_train_over, y_pred_train))

        test_accuracy.append(accuracy_score(y_test, y_pred_test))
        test_recall.append(recall_score(y_test, y_pred_test))
#         test_precision.append(precision_score(y_test, y_pred_test))
#         test_f1.append(f1_score(y_test, y_pred_test))
#         test_roc.append(roc_auc_score(y_test, y_pred_test))

    result = pd.DataFrame({'model name':names, 
                           'train accuracy':train_accuracy,
                           'train recall':train_recall,
#                            'train f1':train_f1,
#                            'train roc':train_roc,
                           'test accuracy':test_accuracy,
#                            'test precision':test_precision,
                           'test recall':test_recall,})
#                            'test f1':test_f1,
#                            'test roc':test_roc}) 
    return result

In [18]:
# 오버샘플링 RandomOverSampler
over_sampling = RandomOverSampler(random_state=13)
X_train_over, y_train_over = over_sampling.fit_sample(X, y)
performace(X_train_over, y_train_over, X_test, y_test)

Unnamed: 0,model name,train accuracy,train precision,train recall,train f1,train roc,test accuracy,test precision,test recall,test f1,test roc
0,LogisticRe,0.89276,0.840648,0.969249,0.90038,0.89276,0.717719,0.004535,0.571429,0.008999,0.644738
1,DecisionTree,1.0,1.0,1.0,1.0,1.0,0.995194,0.0,0.0,0.0,0.498715
2,RandomForest,0.994745,0.989599,1.0,0.994773,0.994745,0.979173,0.0,0.0,0.0,0.490687
3,LightGBM,1.0,1.0,1.0,1.0,1.0,0.997757,0.0,0.0,0.0,0.5
4,SVC,0.892838,0.840762,0.969249,0.900445,0.892838,0.726049,0.003513,0.428571,0.006969,0.577645


In [19]:
# 오버샘플링 SMOTE 
over_sampling = SMOTE(random_state=13)
X_train_over, y_train_over = over_sampling.fit_sample(X, y)
performace(X_train_over, y_train_over, X_test, y_test)

Unnamed: 0,model name,train accuracy,train precision,train recall,train f1,train roc,test accuracy,test precision,test recall,test f1,test roc
0,LogisticRe,0.998677,1.0,0.997353,0.998675,0.998677,0.997757,0.0,0.0,0.0,0.5
1,DecisionTree,0.942624,0.897838,0.99891,0.945681,0.942624,0.816085,0.005236,0.428571,0.010345,0.622764
2,RandomForest,0.992799,0.987298,0.998443,0.992839,0.992799,0.982698,0.0,0.0,0.0,0.492453
3,LightGBM,1.0,1.0,1.0,1.0,1.0,0.997757,0.0,0.0,0.0,0.5
4,SVC,0.998677,1.0,0.997353,0.998675,0.998677,0.997757,0.0,0.0,0.0,0.5


In [20]:
# 복합샘플링 SMOTEENN
combine_sampling = SMOTEENN(random_state=13)
X_train_over, y_train_over = combine_sampling.fit_sample(X, y)
performace(X_train_over, y_train_over, X_test, y_test)

Unnamed: 0,model name,train accuracy,train precision,train recall,train f1,train roc,test accuracy,test precision,test recall,test f1,test roc
0,LogisticRe,0.998736,1.0,0.997545,0.998771,0.998773,0.997116,0.0,0.0,0.0,0.499679
1,DecisionTree,1.0,1.0,1.0,1.0,1.0,0.984941,0.0,0.0,0.0,0.493577
2,RandomForest,0.999747,1.0,0.999509,0.999754,0.999755,0.997757,0.0,0.0,0.0,0.5
3,LightGBM,1.0,1.0,1.0,1.0,1.0,0.997757,0.0,0.0,0.0,0.5
4,SVC,0.998736,1.0,0.997545,0.998771,0.998773,0.997757,0.0,0.0,0.0,0.5


In [17]:
# 언더샘플링 CondensedNearestNeighbour
under_sampling = CondensedNearestNeighbour(random_state=13)
X_train_over, y_train_over = under_sampling.fit_sample(X, y)
performace(X_train_over, y_train_over, X_test, y_test)

Unnamed: 0,model name,train accuracy,train precision,train recall,train f1,train roc,test accuracy,test precision,test recall,test f1,test roc
0,LogisticRe,0.932065,0.736842,0.411765,0.528302,0.698397,0.875681,0.002611,0.142857,0.005128,0.510093
1,DecisionTree,1.0,1.0,1.0,1.0,1.0,0.89875,0.003215,0.142857,0.006289,0.521653
2,RandomForest,0.953804,1.0,0.5,0.666667,0.75,0.982057,0.019608,0.142857,0.034483,0.5634
3,LightGBM,0.991848,1.0,0.911765,0.953846,0.955882,0.9529,0.0,0.0,0.0,0.477521
4,SVC,0.915761,0.636364,0.205882,0.311111,0.596953,0.945851,0.0,0.0,0.0,0.473988


In [16]:
# 언더샘플링 Random Under Sampling
under_sampling = RandomUnderSampler(random_state=13)
X_train_over, y_train_over = under_sampling.fit_sample(X, y)
performace(X_train_over, y_train_over, X_test, y_test)

Unnamed: 0,model name,train accuracy,train precision,train recall,train f1,train roc,test accuracy,test precision,test recall,test f1,test roc
0,LogisticRe,1.0,1.0,1.0,1.0,1.0,0.473566,0.003038,0.714286,0.00605,0.593655
1,DecisionTree,0.75,0.757576,0.735294,0.746269,0.75,0.509452,0.002611,0.571429,0.005198,0.540371
2,RandomForest,0.691176,0.740741,0.588235,0.655738,0.691176,0.617751,0.004181,0.714286,0.008313,0.66591
3,LightGBM,0.5,0.0,0.0,0.0,0.5,0.997757,0.0,0.0,0.0,0.5
4,SVC,0.897059,0.909091,0.882353,0.895522,0.897059,0.521307,0.00334,0.714286,0.006649,0.61758


In [21]:
### 변수삭제

In [25]:
# 데이터 가지고오기
socar_df = pd.read_csv("../../0. raw_data/insurance_fraud_detect_data.csv")
pd.set_option('display.max_columns', len(socar_df.columns))

In [26]:
# 동일분표 변수삭제, one-hot encoding
socar_df.drop(['sharing_type', 'has_previous_accident', 'socarpass', 'car_part2'], axis =1, inplace=True)
socar_df = pd.get_dummies(socar_df, columns=['car_model', 'age_group', 'cumulative_use_count',
                                             'accident_ratio','pf_type', 'start_hour', 'duration','accident_hour',
                                             'accident_location', 'acc_type1', 'insurance_site_aid_YN', 'total_prsn_cnt'])

In [27]:
# train_set, test_set 분리
train_set = socar_df[socar_df["test_set"] == 0]
test_set = socar_df[socar_df["test_set"] == 1]

# "test_set" 컬럼 삭제
train_set = train_set.drop("test_set", axis=1)
test_set = test_set.drop("test_set", axis=1)

# features와 target분리
X = train_set.drop("fraud_YN", axis=1)
y = train_set["fraud_YN"]
X_test = test_set.drop("fraud_YN", axis=1)
y_test = test_set["fraud_YN"]

In [28]:
# 오버샘플링 RandomOverSampler
over_sampling = RandomOverSampler(random_state=13)
X_train_over, y_train_over = over_sampling.fit_sample(X, y)
performace(X_train_over, y_train_over, X_test, y_test)

Unnamed: 0,model name,train accuracy,train precision,train recall,train f1,train roc,test accuracy,test precision,test recall,test f1,test roc
0,LogisticRe,0.892448,0.840194,0.969249,0.900119,0.892448,0.718039,0.00454,0.571429,0.009009,0.644899
1,DecisionTree,1.0,1.0,1.0,1.0,1.0,0.994873,0.0,0.0,0.0,0.498555
2,RandomForest,0.987116,0.974879,1.0,0.98728,0.987116,0.950657,0.013245,0.285714,0.025316,0.618933
3,LightGBM,1.0,1.0,1.0,1.0,1.0,0.997757,0.0,0.0,0.0,0.5
4,SVC,0.890619,0.837538,0.969249,0.898593,0.890619,0.712272,0.003344,0.428571,0.006637,0.57074


In [29]:
# 오버샘플링 SMOTE 
over_sampling = SMOTE(random_state=13)
X_train_over, y_train_over = over_sampling.fit_sample(X, y)
performace(X_train_over, y_train_over, X_test, y_test)

Unnamed: 0,model name,train accuracy,train precision,train recall,train f1,train roc,test accuracy,test precision,test recall,test f1,test roc
0,LogisticRe,0.998677,1.0,0.997353,0.998675,0.998677,0.997757,0.0,0.0,0.0,0.5
1,DecisionTree,0.942624,0.897838,0.99891,0.945681,0.942624,0.816085,0.005236,0.428571,0.010345,0.622764
2,RandomForest,0.994862,0.991115,0.998677,0.994881,0.994862,0.986222,0.0,0.0,0.0,0.49422
3,LightGBM,1.0,1.0,1.0,1.0,1.0,0.997757,0.0,0.0,0.0,0.5
4,SVC,0.998677,1.0,0.997353,0.998675,0.998677,0.997757,0.0,0.0,0.0,0.5


In [30]:
# 복합샘플링 SMOTEENN
combine_sampling = SMOTEENN(random_state=13)
X_train_over, y_train_over = combine_sampling.fit_sample(X, y)
performace(X_train_over, y_train_over, X_test, y_test)

Unnamed: 0,model name,train accuracy,train precision,train recall,train f1,train roc,test accuracy,test precision,test recall,test f1,test roc
0,LogisticRe,0.998738,1.0,0.997545,0.998771,0.998773,0.997437,0.0,0.0,0.0,0.499839
1,DecisionTree,1.0,1.0,1.0,1.0,1.0,0.987824,0.030303,0.142857,0.05,0.56629
2,RandomForest,0.999811,1.0,0.999632,0.999816,0.999816,0.997757,0.0,0.0,0.0,0.5
3,LightGBM,1.0,1.0,1.0,1.0,1.0,0.997757,0.0,0.0,0.0,0.5
4,SVC,0.998738,1.0,0.997545,0.998771,0.998773,0.997757,0.0,0.0,0.0,0.5


In [31]:
# 언더샘플링 CondensedNearestNeighbour
under_sampling = CondensedNearestNeighbour(random_state=13)
X_train_over, y_train_over = under_sampling.fit_sample(X, y)
performace(X_train_over, y_train_over, X_test, y_test)

Unnamed: 0,model name,train accuracy,train precision,train recall,train f1,train roc,test accuracy,test precision,test recall,test f1,test roc
0,LogisticRe,0.923288,0.6875,0.323529,0.44,0.654212,0.880167,0.00271,0.142857,0.005319,0.512341
1,DecisionTree,1.0,1.0,1.0,1.0,1.0,0.900673,0.006515,0.285714,0.012739,0.593885
2,RandomForest,0.906849,0.0,0.0,0.0,0.5,0.997757,0.0,0.0,0.0,0.5
3,LightGBM,0.986301,1.0,0.852941,0.920635,0.926471,0.963153,0.0,0.0,0.0,0.482659
4,SVC,0.912329,0.625,0.147059,0.238095,0.568998,0.950336,0.0,0.0,0.0,0.476236


In [33]:
# 언더샘플링 Random Under Sampling
under_sampling = RandomUnderSampler(random_state=13)
X_train_over, y_train_over = under_sampling.fit_sample(X, y)
performace(X_train_over, y_train_over, X_test, y_test)

Unnamed: 0,model name,train accuracy,train recall,test accuracy,test recall
0,LogisticRe,1.0,1.0,0.490227,0.714286
1,DecisionTree,0.75,0.735294,0.509773,0.571429
2,RandomForest,0.897059,0.970588,0.546299,0.428571
3,LightGBM,0.5,0.0,0.997757,0.0
4,SVC,1.0,1.0,0.544377,0.571429
