In [1]:
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# scaler 
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler

# models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import LinearSVC

# pipeline
from sklearn.pipeline import Pipeline 

# resampling
from imblearn.over_sampling import RandomOverSampler 
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import NearMiss
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek

# model selection
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# scoring
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# pca
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings('ignore')

In [2]:
# 데이터 불러오기
socar_df = pd.read_csv("insurance_fraud_detect_data.csv")
pd.set_option('display.max_columns', len(socar_df.columns))
socar_df.head()

Unnamed: 0,fraud_YN,car_model,sharing_type,age_group,has_previous_accident,cumulative_use_count,b2b,accident_ratio,pf_type,socarpass,socarsave,start_hour,duration,accident_hour,repair_cost,insure_cost,accident_location,car_part1,car_part2,repair_cnt,acc_type1,insurance_site_aid_YN,police_site_aid_YN,total_prsn_cnt,test_set
0,0,2,1,3,0,2,0,100,2,0,0,4,2,4,130254.7,0,1,0,0,1,0,0,0,-1,0
1,0,1,0,4,0,4,0,0,3,1,0,1,4,4,0.0,0,1,1,1,2,1,1,1,0,1
2,0,1,0,4,1,4,0,100,3,1,0,1,4,6,258130.6,12925040,3,1,0,2,1,1,1,0,1
3,0,3,1,3,0,4,0,0,2,0,0,3,3,5,1341009.0,0,3,1,0,1,0,0,0,-1,0
4,0,1,0,1,0,1,0,100,1,0,0,1,1,1,524030.6,1209410,1,0,0,2,0,0,0,-1,0


In [3]:
socar_copy1 = socar_df.copy()

In [4]:
socar_copy1.drop(['sharing_type', 'has_previous_accident', 'b2b', 'socarpass',\
                  'car_part2', 'police_site_aid_YN'], axis=1, inplace=True)

In [5]:
# accident_ratio 분류하기
for i, j in socar_copy1.iterrows():
    if j['accident_ratio'] < 40:
        socar_copy1.loc[i, 'accident_ratio'] = 0
    else:
        socar_copy1.loc[i, 'accident_ratio'] = 1

socar_copy1['accident_ratio'].value_counts()

1    14345
0     1655
Name: accident_ratio, dtype: int64

In [6]:
# one-hot encoding
socar_copy1 = pd.get_dummies(socar_copy1, columns=["car_model", "age_group", "cumulative_use_count",\
                                                   "pf_type", "start_hour", "duration", "accident_hour",\
                                                   "accident_location", "acc_type1", "insurance_site_aid_YN",\
                                                  "total_prsn_cnt"])

In [7]:
# train_set, test_set 분리
train_set = socar_copy1[socar_copy1["test_set"] == 0]
test_set = socar_copy1[socar_copy1["test_set"] == 1]

# "test_set" 컬럼 삭제
train_set = train_set.drop("test_set", axis=1)
test_set = test_set.drop("test_set", axis=1)

# features와 target분리
X_features = train_set.drop("fraud_YN", axis=1)
y_target = train_set["fraud_YN"]
X_test = test_set.drop("fraud_YN", axis=1)
y_test = test_set["fraud_YN"]

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X_features, y_target, test_size=0.2,\
                                                      random_state=13, stratify=y_target)

In [9]:
# 샘플링 RandomOverSampler

over_sampling = RandomOverSampler(random_state=13)
X_train_over, y_train_over = over_sampling.fit_sample(X_train, y_train)

In [10]:
def get_score(X_train_over, y_train_over):    
    grids = []
    grids.append(('LogisticRe', LR))
    grids.append(('DecisionTree', DT))
    grids.append(('RandomForest', RF))
    grids.append(('LightGBM', LGBM))
    grids.append(('SVC', SVM))
    
                  
    valid_accuracy = []
    valid_recall = []
    valid_precision = []
    valid_f1 = []
    valid_roc = []
    test_accuracy = []
    test_recall = []
    test_precision = []
    test_f1 = []
    test_roc = []
    names = []
    
    for name, grid in grids:
        clf = grid
        clf.fit(X_train_over, y_train_over)
        
        y_pred_valid = clf.predict(X_valid)
        y_pred_test = clf.predict(X_test)
    
        names.append(name)
        valid_accuracy.append(accuracy_score(y_valid, y_pred_valid))
        valid_recall.append(recall_score(y_valid, y_pred_valid))
        valid_precision.append(precision_score(y_valid, y_pred_valid))
        valid_f1.append(f1_score(y_valid, y_pred_valid))
        valid_roc.append(roc_auc_score(y_valid, y_pred_valid))
        
        test_accuracy.append(accuracy_score(y_test, y_pred_test))
        test_recall.append(recall_score(y_test, y_pred_test))
        test_precision.append(precision_score(y_test, y_pred_test))
        test_f1.append(f1_score(y_test, y_pred_test))
        test_roc.append(roc_auc_score(y_test, y_pred_test))
    
    result = pd.DataFrame({'model name':names, 
                           'valid accuracy':valid_accuracy,
                           'valid precision':valid_precision,
                           'valid recall':valid_recall,
                           'valid f1':valid_f1,
                           'valid roc':valid_roc,
                           'test accuracy':test_accuracy,
                           'test precision':test_precision,
                           'test recall':test_recall,
                           'test f1':test_f1,
                           'test roc':test_roc}) 
    return result

In [11]:
# pipeline

lr_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ('pca', PCA(n_components=5)),
    ("clf", LogisticRegression(random_state=13))])

dt_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=5)),
    ("clf", DecisionTreeClassifier(random_state=13))])

rf_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ('pca', PCA(n_components=5)),
    ("clf", RandomForestClassifier(random_state=13))])

lgbm_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=5)),
    ("clf", LGBMClassifier(random_state=13))])

svm_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ('pca', PCA(n_components=5)),
    ("clf", LinearSVC(random_state=13))])



# grid search params
lr_param_grid = [{'clf__penalty': ['l1', 'l2', 'elasticnet', 'none']}]

dt_param_grid = [{'clf__criterion':['gini'], 'clf__max_depth':[None,2,3,4,5,6]}]

rf_param_grid = [{'clf__n_estimators': [1, 5, 10, 20],
                  'clf__max_depth' : [2, 3, 4, 5, 10, 50]}]

lgbm_param_grid = [{'clf__n_estimators': [500, 1000], 'clf__num_leaves': [16, 32],\
                   'clf__max_depth': [128, 160], 'clf__min_child_samples': [60, 100]}]

svm_param_grid = [{"clf__penalty": ["l1", "l2"],
                    "clf__loss": ["hinge", "squared_hinge"]}]




# grid search CV
LR = GridSearchCV(lr_pipe, lr_param_grid, cv=5, scoring = 'recall')
DT = GridSearchCV(dt_pipe, dt_param_grid, cv=5, scoring = 'recall')
RF = GridSearchCV(rf_pipe, rf_param_grid, cv=5, scoring = 'recall')
LGBM = GridSearchCV(lgbm_pipe, lgbm_param_grid, cv=5, scoring = 'recall')
SVM = GridSearchCV(svm_pipe, svm_param_grid, cv=5, scoring = 'recall')

In [12]:
# ppt에 넣었던 성능
get_score(X_train_over, y_train_over)

Unnamed: 0,model name,valid accuracy,valid precision,valid recall,valid f1,valid roc,test accuracy,test precision,test recall,test f1,test roc
0,LogisticRe,0.733696,0.007257,0.714286,0.014368,0.724017,0.50817,0.002604,0.571429,0.005185,0.539728
1,DecisionTree,0.964674,0.0,0.0,0.0,0.483651,0.935598,0.0,0.0,0.0,0.46885
2,RandomForest,0.997671,1.0,0.142857,0.25,0.571429,0.996155,0.0,0.0,0.0,0.499197
3,LightGBM,0.996894,0.0,0.0,0.0,0.499805,0.997116,0.0,0.0,0.0,0.499679
4,SVC,0.737966,0.005917,0.571429,0.011713,0.654924,0.509132,0.002609,0.571429,0.005195,0.54021


In [27]:
# pipeline

lr_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ('pca', PCA(n_components=3)),
    ("clf", LogisticRegression(random_state=13))])

dt_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ("clf", DecisionTreeClassifier(random_state=13))])

rf_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ('pca', PCA(n_components=3)),
    ("clf", RandomForestClassifier(random_state=13))])

lgbm_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ("clf", LGBMClassifier(random_state=13))])

svm_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ('pca', PCA(n_components=3)),
    ("clf", LinearSVC(random_state=13))])



# grid search params
lr_param_grid = [{'clf__penalty': ['l1', 'l2', 'elasticnet', 'none']}]

dt_param_grid = [{'clf__criterion':['gini'], 'clf__max_depth':[None,2,3,4,5,6]}]

rf_param_grid = [{'clf__n_estimators': [1, 5, 10, 20],
                  'clf__max_depth' : [2, 3, 4, 5, 10, 50]}]

lgbm_param_grid = [{'clf__n_estimators': [500, 1000], 'clf__num_leaves': [16, 32],\
                   'clf__max_depth': [128, 160], 'clf__min_child_samples': [60, 100]}]

svm_param_grid = [{"clf__penalty": ["l1", "l2"],
                    "clf__loss": ["hinge", "squared_hinge"]}]




# grid search CV
LR = GridSearchCV(lr_pipe, lr_param_grid, scoring = 'recall')
DT = GridSearchCV(dt_pipe, dt_param_grid, scoring = 'recall')
RF = GridSearchCV(rf_pipe, rf_param_grid, scoring = 'recall')
LGBM = GridSearchCV(lgbm_pipe, lgbm_param_grid, scoring = 'recall')
SVM = GridSearchCV(svm_pipe, svm_param_grid, scoring = 'recall')

In [269]:
get_score(X_train_over, y_train_over)

Unnamed: 0,model name,valid accuracy,valid precision,valid recall,valid f1,valid roc,test accuracy,test precision,test recall,test f1,test roc
0,LogisticRe,0.747283,0.006135,0.571429,0.01214,0.659595,0.516181,0.001988,0.428571,0.003958,0.472475
1,DecisionTree,0.995342,0.0,0.0,0.0,0.499027,0.993912,0.0,0.0,0.0,0.498073
2,RandomForest,0.994953,0.0,0.0,0.0,0.498832,0.995835,0.0,0.0,0.0,0.499037
3,LightGBM,0.996894,0.0,0.0,0.0,0.499805,0.997437,0.0,0.0,0.0,0.499839
4,SVC,0.748447,0.006163,0.571429,0.012195,0.660179,0.516822,0.001991,0.428571,0.003963,0.472796


In [270]:
# pipeline

lr_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ('pca', PCA(n_components=7)),
    ("clf", LogisticRegression(random_state=13))])

dt_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=7)),
    ("clf", DecisionTreeClassifier(random_state=13))])

rf_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ('pca', PCA(n_components=7)),
    ("clf", RandomForestClassifier(random_state=13))])

lgbm_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=7)),
    ("clf", LGBMClassifier(random_state=13))])

svm_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ('pca', PCA(n_components=7)),
    ("clf", LinearSVC(random_state=13))])



# grid search params
lr_param_grid = [{'clf__penalty': ['l1', 'l2', 'elasticnet', 'none']}]

dt_param_grid = [{'clf__criterion':['gini'], 'clf__max_depth':[None,2,3,4,5,6]}]

rf_param_grid = [{'clf__n_estimators': [1, 5, 10, 20],
                  'clf__max_depth' : [2, 3, 4, 5, 10, 50]}]

lgbm_param_grid = [{'clf__n_estimators': [500, 1000], 'clf__num_leaves': [16, 32],\
                   'clf__max_depth': [128, 160], 'clf__min_child_samples': [60, 100]}]

svm_param_grid = [{"clf__penalty": ["l1", "l2"],
                    "clf__loss": ["hinge", "squared_hinge"]}]




# grid search CV
LR = GridSearchCV(lr_pipe, lr_param_grid, scoring = 'recall')
DT = GridSearchCV(dt_pipe, dt_param_grid, scoring = 'recall')
RF = GridSearchCV(rf_pipe, rf_param_grid, scoring = 'recall')
LGBM = GridSearchCV(lgbm_pipe, lgbm_param_grid, scoring = 'recall')
SVM = GridSearchCV(svm_pipe, svm_param_grid, scoring = 'recall')

In [271]:
get_score(X_train_over, y_train_over)

Unnamed: 0,model name,valid accuracy,valid precision,valid recall,valid f1,valid roc,test accuracy,test precision,test recall,test f1,test roc
0,LogisticRe,0.711568,0.005376,0.571429,0.010652,0.641689,0.548863,0.004246,0.857143,0.008451,0.702656
1,DecisionTree,0.958851,0.009901,0.142857,0.018519,0.551966,0.938481,0.0,0.0,0.0,0.470295
2,RandomForest,0.987966,0.0,0.0,0.0,0.495329,0.984941,0.0,0.0,0.0,0.493577
3,LightGBM,0.997283,0.0,0.0,0.0,0.5,0.997116,0.0,0.0,0.0,0.499679
4,SVC,0.710404,0.005355,0.571429,0.01061,0.641105,0.547901,0.004237,0.857143,0.008433,0.702175


In [272]:
# pipeline

lr_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LogisticRegression(random_state=13))])

dt_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", DecisionTreeClassifier(random_state=13))])

rf_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", RandomForestClassifier(random_state=13))])

lgbm_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LGBMClassifier(random_state=13))])

svm_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LinearSVC(random_state=13))])



# grid search params
lr_param_grid = [{'clf__penalty': ['l1', 'l2', 'elasticnet', 'none']}]

dt_param_grid = [{'clf__criterion':['gini'], 'clf__max_depth':[None,2,3,4,5,6]}]

rf_param_grid = [{'clf__n_estimators': [1, 5, 10, 20],
                  'clf__max_depth' : [2, 3, 4, 5, 10, 50]}]

lgbm_param_grid = [{'clf__n_estimators': [500, 1000], 'clf__num_leaves': [16, 32],\
                   'clf__max_depth': [128, 160], 'clf__min_child_samples': [60, 100]}]

svm_param_grid = [{"clf__penalty": ["l1", "l2"],
                    "clf__loss": ["hinge", "squared_hinge"]}]




# grid search CV
LR = GridSearchCV(lr_pipe, lr_param_grid, scoring = 'recall')
DT = GridSearchCV(dt_pipe, dt_param_grid, scoring = 'recall')
RF = GridSearchCV(rf_pipe, rf_param_grid, scoring = 'recall')
LGBM = GridSearchCV(lgbm_pipe, lgbm_param_grid, scoring = 'recall')
SVM = GridSearchCV(svm_pipe, svm_param_grid, scoring = 'recall')

In [273]:
get_score(X_train_over, y_train_over)

Unnamed: 0,model name,valid accuracy,valid precision,valid recall,valid f1,valid roc,test accuracy,test precision,test recall,test f1,test roc
0,LogisticRe,0.691382,0.005025,0.571429,0.009963,0.631569,0.48446,0.002484,0.571429,0.004947,0.527847
1,DecisionTree,0.99573,0.0,0.0,0.0,0.499221,0.995835,0.0,0.0,0.0,0.499037
2,RandomForest,0.997283,0.0,0.0,0.0,0.5,0.997116,0.0,0.0,0.0,0.499679
3,LightGBM,0.997283,0.0,0.0,0.0,0.5,0.997116,0.0,0.0,0.0,0.499679
4,SVC,0.690994,0.005019,0.571429,0.00995,0.631374,0.483499,0.00248,0.571429,0.004938,0.527365


In [274]:
# pipeline

lr_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LogisticRegression(random_state=13))])

dt_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", DecisionTreeClassifier(random_state=13))])

rf_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", RandomForestClassifier(random_state=13))])

lgbm_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LGBMClassifier(random_state=13))])

svm_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LinearSVC(C=1, random_state=13))])



# grid search params
lr_param_grid = [{'clf__penalty': ['l1', 'l2', 'elasticnet', 'none']}]

dt_param_grid = [{'clf__criterion':['gini'], 'clf__max_depth':[None,2,3,4,5,6]}]

rf_param_grid = [{'clf__n_estimators': [1, 5, 10, 20],
                  'clf__max_depth' : [2, 3, 4, 5, 10, 50]}]

lgbm_param_grid = [{'clf__n_estimators': [500, 1000], 'clf__num_leaves': [16, 32],\
                   'clf__max_depth': [128, 160], 'clf__min_child_samples': [60, 100]}]

svm_param_grid = [{"clf__penalty": ["l1", "l2"],
                    "clf__loss": ["hinge", "squared_hinge"]}]




# grid search CV
LR = GridSearchCV(lr_pipe, lr_param_grid, scoring = 'recall')
DT = GridSearchCV(dt_pipe, dt_param_grid, scoring = 'recall')
RF = GridSearchCV(rf_pipe, rf_param_grid, scoring = 'recall')
LGBM = GridSearchCV(lgbm_pipe, lgbm_param_grid, scoring = 'recall')
SVM = GridSearchCV(svm_pipe, svm_param_grid, scoring = 'recall')

In [275]:
get_score(X_train_over, y_train_over)

Unnamed: 0,model name,valid accuracy,valid precision,valid recall,valid f1,valid roc,test accuracy,test precision,test recall,test f1,test roc
0,LogisticRe,0.691382,0.005025,0.571429,0.009963,0.631569,0.486062,0.002492,0.571429,0.004963,0.528649
1,DecisionTree,0.995342,0.0,0.0,0.0,0.499027,0.995514,0.0,0.0,0.0,0.498876
2,RandomForest,0.997283,0.0,0.0,0.0,0.5,0.997757,0.0,0.0,0.0,0.5
3,LightGBM,0.997283,0.0,0.0,0.0,0.5,0.997116,0.0,0.0,0.0,0.499679
4,SVC,0.690606,0.005013,0.571429,0.009938,0.631179,0.483499,0.00248,0.571429,0.004938,0.527365
