In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# scaler 
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import LinearSVC

# pipeline
from sklearn.pipeline import Pipeline 

# resampling
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler, CondensedNearestNeighbour, NearMiss
from imblearn.combine import SMOTETomek, SMOTEENN

# model selection
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict, StratifiedKFold, KFold, cross_val_score

# scoring
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

# pca
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings('ignore')

## 데이터 불러오기

In [44]:
socar_df = pd.read_csv("insurance_fraud_detect_data.csv")
pd.set_option('display.max_columns', len(socar_df.columns))
socar = socar_df.copy()
socar.tail()

Unnamed: 0,fraud_YN,car_model,sharing_type,age_group,has_previous_accident,cumulative_use_count,b2b,accident_ratio,pf_type,socarpass,socarsave,start_hour,duration,accident_hour,repair_cost,insure_cost,accident_location,car_part1,car_part2,repair_cnt,acc_type1,insurance_site_aid_YN,police_site_aid_YN,total_prsn_cnt,test_set
15995,0,2,0,2,0,2,0,100,2,0,0,1,1,1,0.0,0,1,1,0,1,0,0,0,-1,0
15996,0,2,0,2,1,4,0,100,1,1,0,5,2,6,618439.4418,0,2,1,0,1,0,0,0,-1,0
15997,0,2,1,2,0,4,0,100,1,0,0,1,3,5,0.0,0,0,0,0,1,0,0,0,-1,1
15998,0,2,0,2,0,2,0,100,2,0,0,6,1,2,0.0,0,4,1,0,1,0,0,0,-1,0
15999,0,2,0,2,0,1,0,100,1,0,0,6,5,6,391560.2156,1188750,2,0,0,1,0,0,0,-1,0


# 데이터 전처리 & 모델링

## 원핫인코딩

In [45]:
socar_df.drop(['sharing_type', 'has_previous_accident', 'b2b', 'socarpass',\
                  'car_part2', 'police_site_aid_YN'], axis=1, inplace=True)

In [46]:
# accident_ratio 재 분류
for i, j in socar_df.iterrows():
    if j['accident_ratio'] < 40:
        socar_df.loc[i, 'accident_ratio'] = 0
    else:
        socar_df.loc[i, 'accident_ratio'] = 1

socar_df['accident_ratio'].value_counts()

1    14345
0     1655
Name: accident_ratio, dtype: int64

In [47]:
# one-hot encoding
socar_df = pd.get_dummies(socar_df, columns=["car_model", "age_group", "cumulative_use_count",\
                                                   "pf_type", "start_hour", "duration", "accident_hour",\
                                                   "accident_location", "acc_type1", "insurance_site_aid_YN",\
                                                  "total_prsn_cnt"])

## 데이터 분리

In [48]:
# train_set, test_set 분리
train_set = socar_df[socar_df["test_set"] == 0]
test_set = socar_df[socar_df["test_set"] == 1]

# "test_set" 컬럼 삭제
train_set = train_set.drop("test_set", axis=1)
test_set = test_set.drop("test_set", axis=1)

# features와 target분리
X = train_set.drop("fraud_YN", axis=1)
y = train_set["fraud_YN"]

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=13, stratify=y)
X_test, y_test = test_set.drop("fraud_YN", axis=1), test_set["fraud_YN"]

## 샘플링

In [49]:
over_sampling = RandomOverSampler(random_state=13)
X_train_over, y_train_over = over_sampling.fit_sample(X_train, y_train)

## 성능 평가 함수

In [50]:
def get_score(X_train, y_train, X_valid, y_valid, X_test, y_test):    
    grids = [('LogisticRegression', LR), ('DecisionTree', DT), ('RandomForest', RF), ('LightGBM', LGBM), ('LinearSVC', SVM)]
                  
    df = []
    
    for name, grid in grids:
        clf = grid
        clf.fit(X_train, y_train)
        
        y_pred_valid = clf.predict(X_valid)
        y_pred_test = clf.predict(X_test)
        
        print('{} 교차검증 recall:{}, 최적의 파라미터:{}'.format(name, round(clf.best_score_,3), clf.best_params_))
    
        df.append({
            'model_name' : name,
            'valid_accuracy' : accuracy_score(y_valid, y_pred_valid),
            'valid_recall' : recall_score(y_valid, y_pred_valid),
            'valid_precision' : precision_score(y_valid, y_pred_valid),
            'valid_f1' : f1_score(y_valid, y_pred_valid),
            'valid_roc' : roc_auc_score(y_valid, y_pred_valid),
            'test_accuracy' : accuracy_score(y_test, y_pred_test),
            'test_recall' : recall_score(y_test, y_pred_test),
            'test_precision' : precision_score(y_test, y_pred_test),
            'test_f1' : f1_score(y_test, y_pred_test),
            'test_roc' : roc_auc_score(y_test, y_pred_test)
        })
    
    result = pd.DataFrame(df) 
    
    return result


## 파이프라인

In [51]:
# pipeline
scaler = ("scaler", StandardScaler())
pca = ('pca', PCA(n_components=5))

lr_pipe = Pipeline([scaler, pca, ("clf", LogisticRegression(random_state=13))])
dt_pipe = Pipeline([scaler, pca, ("clf", DecisionTreeClassifier(random_state=13))])
rf_pipe = Pipeline([scaler, pca, ("clf", RandomForestClassifier(random_state=13))])
lgbm_pipe = Pipeline([scaler, pca, ("clf", LGBMClassifier(random_state=13))])
svm_pipe = Pipeline([scaler, pca, ("clf", LinearSVC(random_state=13))])


# grid search params
lr_param_grid = [{'clf__penalty': ['l1', 'l2', 'elasticnet', 'none']}]
dt_param_grid = [{'clf__criterion':['gini'], 'clf__max_depth':[None,2,3,4,5,6]}]
rf_param_grid = [{'clf__n_estimators': [1, 5, 10, 20], 'clf__max_depth' : [2, 3, 4, 5, 10, 50]}]
lgbm_param_grid = [{'clf__n_estimators': [500, 1000], 'clf__num_leaves': [16, 32],\
                   'clf__max_depth': [128, 160], 'clf__min_child_samples': [60, 100]}]
svm_param_grid = [{"clf__penalty": ["l1", "l2"],
                    "clf__loss": ["hinge", "squared_hinge"]}]


# grid search CV
LR = GridSearchCV(lr_pipe, lr_param_grid, cv=5, scoring = 'recall')
DT = GridSearchCV(dt_pipe, dt_param_grid, cv=5, scoring = 'recall')
RF = GridSearchCV(rf_pipe, rf_param_grid, cv=5, scoring = 'recall')
LGBM = GridSearchCV(lgbm_pipe, lgbm_param_grid, cv=5, scoring = 'recall')
SVM = GridSearchCV(svm_pipe, svm_param_grid, cv=5, scoring = 'recall')

In [52]:
get_score(X_train_over, y_train_over, X_valid, y_valid, X_test, y_test)

LogisticRegression 교차검증 recall:0.515, 최적의 파라미터:{'clf__penalty': 'l2'}
DecisionTree 교차검증 recall:0.909, 최적의 파라미터:{'clf__criterion': 'gini', 'clf__max_depth': None}
RandomForest 교차검증 recall:0.955, 최적의 파라미터:{'clf__max_depth': 50, 'clf__n_estimators': 5}
LightGBM 교차검증 recall:0.986, 최적의 파라미터:{'clf__max_depth': 128, 'clf__min_child_samples': 60, 'clf__n_estimators': 500, 'clf__num_leaves': 16}
LinearSVC 교차검증 recall:0.502, 최적의 파라미터:{'clf__loss': 'squared_hinge', 'clf__penalty': 'l2'}


Unnamed: 0,model_name,valid_accuracy,valid_recall,valid_precision,valid_f1,valid_roc,test_accuracy,test_recall,test_precision,test_f1,test_roc
0,LogisticRegression,0.733696,0.714286,0.007257,0.014368,0.724017,0.50817,0.571429,0.002604,0.005185,0.539728
1,DecisionTree,0.993012,0.0,0.0,0.0,0.497859,0.994233,0.0,0.0,0.0,0.498234
2,RandomForest,0.997671,0.142857,1.0,0.25,0.571429,0.996155,0.0,0.0,0.0,0.499197
3,LightGBM,0.997283,0.0,0.0,0.0,0.5,0.997116,0.0,0.0,0.0,0.499679
4,LinearSVC,0.738742,0.571429,0.005935,0.011747,0.655313,0.509452,0.571429,0.002611,0.005198,0.540371
