In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# scaler 
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import LinearSVC

# pipeline
from sklearn.pipeline import Pipeline 

# resampling
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler, CondensedNearestNeighbour, NearMiss
from imblearn.combine import SMOTETomek, SMOTEENN

# model selection
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict, StratifiedKFold, KFold, cross_val_score

# scoring
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

# pca
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings('ignore')

## 데이터 불러오기

In [48]:
# Load the insurance-fraud dataset from the working directory.
socar_df = pd.read_csv(filepath_or_buffer="insurance_fraud_detect_data.csv")

# Let pandas display every column of this frame instead of truncating wide output.
pd.set_option("display.max_columns", socar_df.shape[1])

# Preview the last five rows.
socar_df.tail(5)

Unnamed: 0,fraud_YN,car_model,sharing_type,age_group,has_previous_accident,cumulative_use_count,b2b,accident_ratio,pf_type,socarpass,socarsave,start_hour,duration,accident_hour,repair_cost,insure_cost,accident_location,car_part1,car_part2,repair_cnt,acc_type1,insurance_site_aid_YN,police_site_aid_YN,total_prsn_cnt,test_set
15995,0,2,0,2,0,2,0,100,2,0,0,1,1,1,0.0,0,1,1,0,1,0,0,0,-1,0
15996,0,2,0,2,1,4,0,100,1,1,0,5,2,6,618439.4418,0,2,1,0,1,0,0,0,-1,0
15997,0,2,1,2,0,4,0,100,1,0,0,1,3,5,0.0,0,0,0,0,1,0,0,0,-1,1
15998,0,2,0,2,0,2,0,100,2,0,0,6,1,2,0.0,0,4,1,0,1,0,0,0,-1,0
15999,0,2,0,2,0,1,0,100,1,0,0,6,5,6,391560.2156,1188750,2,0,0,1,0,0,0,-1,0


# 데이터 전처리 & 모델링

## 데이터 분리

In [49]:
# Partition rows by the "test_set" flag (0 -> training pool, 1 -> hold-out test)
# and drop the flag column so it is not used as a feature.
train_set = socar_df[socar_df["test_set"] == 0].drop(columns="test_set")
test_set = socar_df[socar_df["test_set"] == 1].drop(columns="test_set")

# Separate the target ("fraud_YN") from the features in both partitions.
X, y = train_set.drop(columns="fraud_YN"), train_set["fraud_YN"]
X_test, y_test = test_set.drop(columns="fraud_YN"), test_set["fraud_YN"]

# 80/20 train/validation split of the training pool, stratified on the target
# so both parts keep the same class ratio; seeded for reproducibility.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=13,
)

## 샘플링

In [50]:
# Random over-sampling is applied to the training split only, so the validation
# and test sets keep their original class distribution.
over_sampling = RandomOverSampler(random_state=13)

# `fit_sample` is the legacy alias that was deprecated and then removed from
# imbalanced-learn; `fit_resample` is the supported method and returns the same
# (X_resampled, y_resampled) pair.
X_train_over, y_train_over = over_sampling.fit_resample(X_train, y_train)

## 파이프라인

#### Trial 3. 변수 삭제 x >> 결측치 처리 x >> One-Hot Encoding x >> RandomOverSampling >> StandardScaler(SS) >> PCA(n_components=4)

In [51]:
# Pipeline definitions

def _scaled_pca_pipeline(estimator):
    """Wrap `estimator` as the 'clf' step behind a fresh StandardScaler and a 4-component PCA."""
    steps = [
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=4)),
        ('clf', estimator),
    ]
    return Pipeline(steps)


# One pipeline per candidate classifier; every estimator uses the same seed.
lr_pipe = _scaled_pca_pipeline(LogisticRegression(random_state=13))
dt_pipe = _scaled_pca_pipeline(DecisionTreeClassifier(random_state=13))
rf_pipe = _scaled_pca_pipeline(RandomForestClassifier(random_state=13))
lgbm_pipe = _scaled_pca_pipeline(LGBMClassifier(random_state=13))
svm_pipe = _scaled_pca_pipeline(LinearSVC(random_state=13))

In [52]:
# Pipeline hyperparameter settings, addressed through each pipeline's "clf" step.
# `set_params` updates the pipeline in place and returns it, so the final
# expression is what the cell displays.

dt_pipe.set_params(**{"clf__max_depth": 4})
rf_pipe.set_params(**{"clf__n_estimators": 100, "clf__n_jobs": -1})
lgbm_pipe.set_params(
    **{
        "clf__n_estimators": 1000,
        "clf__num_leaves": 64,
        "clf__n_jobs": -1,
        "clf__boost_from_average": False,
    }
)

Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=4)),
                ('clf',
                 LGBMClassifier(boost_from_average=False, n_estimators=1000,
                                num_leaves=64, random_state=13))])

In [53]:
# 성능 평가 함수

def _binary_scores(y_true, y_pred):
    """Return (accuracy, precision, recall, f1, roc_auc) for one set of predictions."""
    return (
        accuracy_score(y_true, y_pred),
        # zero_division=0 keeps the score at 0 without raising
        # UndefinedMetricWarning when a model predicts no positive samples.
        precision_score(y_true, y_pred, zero_division=0),
        recall_score(y_true, y_pred, zero_division=0),
        f1_score(y_true, y_pred, zero_division=0),
        roc_auc_score(y_true, y_pred),
    )


def get_clf_eval(y_valid, y_pred_valid, y_test, y_pred_test):
    """Score validation and test predictions with the same five metrics.

    Parameters
    ----------
    y_valid, y_pred_valid
        True labels and predictions for the validation set.
    y_test, y_pred_test
        True labels and predictions for the test set.

    Returns
    -------
    tuple
        A flat 10-tuple:
        (accuracy, precision, recall, f1, roc_auc) for the validation set,
        followed by the same five metrics for the test set.

    Notes
    -----
    ROC AUC is computed from whatever is passed as the predictions. When the
    caller passes hard labels from ``predict()`` (as the evaluation loop in this
    notebook does), the value reflects a single operating point rather than a
    score-ranking AUC.
    """
    return _binary_scores(y_valid, y_pred_valid) + _binary_scores(y_test, y_pred_test)

In [54]:
# Results table (to be used in the presentation slides)
pipes = [lr_pipe, dt_pipe, rf_pipe, lgbm_pipe, svm_pipe]
index = ['LogiReg', 'DecisionTree', 'RandomFore', 'LGBM', 'SVC']

# Validation metrics first, then the same metrics for the test set ("_t" suffix),
# matching the order of the tuple returned by get_clf_eval.
col_names = ['Accuracy', 'precision', 'recall', 'f1', 'roc']
col_names = col_names + [f"{metric}_t" for metric in col_names]

tmp = []
for pipe in pipes:
    # Fit on the over-sampled training data; Pipeline.fit returns the pipeline itself.
    clf = pipe.fit(X_train_over, y_train_over)

    # Predict on the untouched validation and test sets.
    y_pred_valid, y_pred_test = clf.predict(X_valid), clf.predict(X_test)

    tmp.append(get_clf_eval(y_valid, y_pred_valid, y_test, y_pred_test))

pd.DataFrame(data=tmp, index=index, columns=col_names)

Unnamed: 0,Accuracy,precision,recall,f1,roc,Accuracy_t,precision_t,recall_t,f1_t,roc_t
LogiReg,0.708463,0.006631,0.714286,0.013141,0.711366,0.522589,0.002683,0.571429,0.00534,0.546954
DecisionTree,0.906444,0.004237,0.142857,0.00823,0.525691,0.878244,0.002667,0.142857,0.005236,0.511377
RandomFore,0.996894,0.0,0.0,0.0,0.499805,0.997757,0.0,0.0,0.0,0.5
LGBM,0.996506,0.0,0.0,0.0,0.499611,0.997116,0.0,0.0,0.0,0.499679
SVC,0.717003,0.006831,0.714286,0.013532,0.715648,0.529958,0.002725,0.571429,0.005424,0.550647
