In [3]:
# 1-1. Load the data

import pandas as pd
import matplotlib.pyplot as plt


# Path is relative to the notebook's working directory.
raw_csv_path = '../../0. raw_data/insurance_fraud_detect_data.csv'
socar = pd.read_csv(raw_csv_path)

# Bare last expression so the notebook renders the frame as the cell output.
socar

Unnamed: 0,fraud_YN,car_model,sharing_type,age_group,has_previous_accident,cumulative_use_count,b2b,accident_ratio,pf_type,socarpass,...,insure_cost,accident_location,car_part1,car_part2,repair_cnt,acc_type1,insurance_site_aid_YN,police_site_aid_YN,total_prsn_cnt,test_set
0,0,2,1,3,0,2,0,100,2,0,...,0,1,0,0,1,0,0,0,-1,0
1,0,1,0,4,0,4,0,0,3,1,...,0,1,1,1,2,1,1,1,0,1
2,0,1,0,4,1,4,0,100,3,1,...,12925040,3,1,0,2,1,1,1,0,1
3,0,3,1,3,0,4,0,0,2,0,...,0,3,1,0,1,0,0,0,-1,0
4,0,1,0,1,0,1,0,100,1,0,...,1209410,1,0,0,2,0,0,0,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15995,0,2,0,2,0,2,0,100,2,0,...,0,1,1,0,1,0,0,0,-1,0
15996,0,2,0,2,1,4,0,100,1,1,...,0,2,1,0,1,0,0,0,-1,0
15997,0,2,1,2,0,4,0,100,1,0,...,0,0,0,0,1,0,0,0,-1,1
15998,0,2,0,2,0,2,0,100,2,0,...,0,4,1,0,1,0,0,0,-1,0


In [4]:
# Train/test separation, driven by the `test_set` flag column

is_train_row = socar['test_set'].eq(0)
is_test_row = socar['test_set'].eq(1)

df = socar.loc[is_train_row]       # rows flagged 0
df_test = socar.loc[is_test_row]   # rows flagged 1

In [5]:
# Split the test frame into the target column and the remaining columns
y_test = df_test['fraud_YN']
X_test = df_test.drop(columns=['fraud_YN'])

In [6]:
# Train/validation split

from sklearn.model_selection import train_test_split

X_features = df.drop(columns=['fraud_YN'])
y_target = df['fraud_YN']

# Stratify on the target so both partitions keep a similar class ratio;
# 20% of the rows go to the validation partition.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_features,
    y_target,
    test_size=0.2,
    random_state=13,
    stratify=y_target,
)

# Check the class ratio (percentage of rows per label) in each partition
print(y_train.value_counts().div(len(y_train)).mul(100))
print(y_valid.value_counts().div(len(y_valid)).mul(100))

0    99.73794
1     0.26206
Name: fraud_YN, dtype: float64
0    99.728261
1     0.271739
Name: fraud_YN, dtype: float64


# trial 1 

In [7]:
# resampling
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import RandomOverSampler 
from imblearn.combine import SMOTEENN


In [8]:
# Work on a copy so the frame `df` is not touched by this trial.
df_1 = df.copy()

X_features = df_1.drop('fraud_YN', axis=1)

# Target kept as a single-column DataFrame with a fresh 0..n-1 index
# (same result as the former reset_index() followed by drop('index')).
# NOTE(review): X_features keeps its original index labels, so X and y line up
# by position only. That is what train_test_split uses, but do not join the
# two on index later.
y_target = df_1[['fraud_YN']].reset_index(drop=True)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_features, y_target, test_size=0.2, random_state=13, stratify=y_target)

# Oversample the training split only; the validation split is not resampled.
over_sampling = RandomOverSampler(random_state=13)
# `fit_resample` replaces `fit_sample`, a deprecated alias that is no longer
# available in current imbalanced-learn releases.
X_train_over, y_train_over = over_sampling.fit_resample(X_train, y_train)

# Check the class ratio (percentage of rows per label) before oversampling
print(y_train.value_counts()/y_train.shape[0]*100)
print(y_valid.value_counts()/y_valid.shape[0]*100)

fraud_YN
0           99.73794
1            0.26206
dtype: float64
fraud_YN
0           99.728261
1            0.271739
dtype: float64


In [13]:
# 파이프라인 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import LinearSVC

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler


# Build the pipelines: every model uses the same scaler -> classifier layout
def _scaled_pipeline(classifier):
    """Return a two-step Pipeline: RobustScaler ("scaler") then `classifier` ("clf")."""
    return Pipeline([("scaler", RobustScaler()), ("clf", classifier)])


lr_pipe = _scaled_pipeline(LogisticRegression(random_state=13))
# Despite the `df_` prefix, this is the decision-tree pipeline.
df_pipe = _scaled_pipeline(DecisionTreeClassifier(random_state=13))
rf_pipe = _scaled_pipeline(RandomForestClassifier(random_state=13))
lgbm_pipe = _scaled_pipeline(LGBMClassifier(random_state=13))
svm_pipe = _scaled_pipeline(LinearSVC(random_state=13))


# Hyper-parameter grids for each model. Keys use the `clf__` prefix so they
# address the "clf" step of the pipelines.

# LogisticRegression: each penalty must be paired with a solver that supports
# it. The default 'lbfgs' handles only 'l2' / 'none', so 'l1' and 'elasticnet'
# get their own sub-grids; otherwise those candidates fail to fit.
# NOTE(review): 'none' is kept as the string the original used; newer
# scikit-learn releases spell this penalty as None -- adjust to the installed
# version.
lr_param_grid = [
    {'clf__penalty': ['l2', 'none'], 'clf__solver': ['lbfgs']},
    {'clf__penalty': ['l1'], 'clf__solver': ['liblinear']},
    # elasticnet is only available with 'saga' and requires an l1_ratio.
    {'clf__penalty': ['elasticnet'], 'clf__solver': ['saga'],
     'clf__l1_ratio': [0.5]},
]

dt_param_grid = [{'clf__criterion': ['gini', 'entropy'],
                  'clf__max_depth': [None, 2, 3, 4, 5, 6]}]

rf_param_grid = [{'clf__n_estimators': [1, 5, 10, 20],
                  'clf__max_depth': [2, 3, 4, 5, 10, 50, 100]}]

lgbm_param_grid = [{'clf__n_estimators': [200, 500, 800, 1000],
                    'clf__num_leaves': [32, 64],
                    'clf__max_depth': [80, 128, 160],
                    'clf__min_child_samples': [60, 100]}]

# LinearSVC: penalty='l1' is not supported with loss='hinge', and with
# loss='squared_hinge' it needs dual=False. Only valid combinations are listed.
svm_param_grid = [
    {'clf__penalty': ['l2'], 'clf__loss': ['hinge', 'squared_hinge']},
    {'clf__penalty': ['l1'], 'clf__loss': ['squared_hinge'],
     'clf__dual': [False]},
]

# Grid search: one GridSearchCV per pipeline, all sharing the same settings
# (5-fold CV, candidates ranked by recall, best candidate refit afterwards).
search_options = dict(cv=5, refit=True, scoring='recall')

lr_clf = GridSearchCV(lr_pipe, lr_param_grid, **search_options)
dt_clf = GridSearchCV(df_pipe, dt_param_grid, **search_options)
rf_clf = GridSearchCV(rf_pipe, rf_param_grid, **search_options)
lgbm_clf = GridSearchCV(lgbm_pipe, lgbm_param_grid, **search_options)
svm_clf = GridSearchCV(svm_pipe, svm_param_grid, **search_options)


In [10]:
# 학습 / 예측 / 평가 함수 

# 평가 함수 

from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score

def get_clf_eval(y_test, pred, pred_proba=None):
    """Compute classification metrics for one set of predictions.

    Parameters
    ----------
    y_test : true labels.
    pred : predicted class labels, used for accuracy, precision, recall and F1.
    pred_proba : optional continuous scores for the positive class
        (probabilities or decision-function values). When given, ROC AUC is
        computed from these scores. When omitted, ROC AUC falls back to the
        hard labels in `pred`, exactly as before.

    Returns
    -------
    tuple
        (accuracy, precision, recall, f1, roc_auc)
    """
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    # ROC AUC is meant to rank continuous scores; thresholded labels reduce
    # the curve to a single operating point, so prefer scores when supplied.
    roc_input = pred if pred_proba is None else pred_proba
    roc_auc = roc_auc_score(y_test, roc_input)
    return accuracy, precision, recall, f1, roc_auc

# Fit one model and return its metric tuple

def get_result(model, X_train, y_train, X_test, y_test):
    """Fit `model` on the training data and evaluate its predictions.

    `model` is fitted in place. The return value is whatever
    `get_clf_eval(y_test, predictions)` returns for the predictions made on
    `X_test`.
    """
    model.fit(X_train, y_train)
    return get_clf_eval(y_test, model.predict(X_test))

# Compare the performance of several models

def get_result_df(models, model_names, X_train, y_train, X_test, y_test):
    """Fit and evaluate every model, returning one metrics row per model.

    Each entry of `models` is passed to `get_result` with the given data; the
    resulting tuples become the rows of a DataFrame indexed by `model_names`.
    """
    metric_rows = [
        get_result(estimator, X_train, y_train, X_test, y_test)
        for estimator in models
    ]
    return pd.DataFrame(
        metric_rows,
        columns=['Accuracy', 'precision', 'recall', 'f1', 'roc'],
        index=model_names,
    )


# Run training: fit every grid search on the oversampled training split and
# evaluate it on the validation split
model_names = ['LogiReg', 'DecisionTree', 'RandomFore','LGBM','SVC']
models = [lr_clf, dt_clf, rf_clf, lgbm_clf, svm_clf]
results = get_result_df(
    models, model_names,
    X_train_over, y_train_over,
    X_valid, y_valid,
)
results


Unnamed: 0,Accuracy,precision,recall,f1,roc
LogiReg,0.693323,0.001274,0.142857,0.002525,0.41884
DecisionTree,0.993401,0.0,0.0,0.0,0.498054
RandomFore,0.96778,0.012821,0.142857,0.023529,0.556442
LGBM,0.996894,0.0,0.0,0.0,0.499805
SVC,0.695264,0.001282,0.142857,0.002541,0.419813


In [11]:
# Helper: metrics for an already-fitted model on the test data
def get_result_test(model, X_test, y_test):
    """Predict on `X_test` with `model` (no fitting) and return the metric tuple.

    The predictions are scored with `get_clf_eval(y_test, predictions)`.
    """
    return get_clf_eval(y_test, model.predict(X_test))

# Helper: test-set metrics for several models as a DataFrame
def get_result_df_test(models, model_names, X_test, y_test):
    """Evaluate every model on the test data, one metrics row per model.

    Each entry of `models` is passed to `get_result_test`; the resulting
    tuples become the rows of a DataFrame indexed by `model_names`, with
    `_test`-suffixed column names.
    """
    metric_rows = [
        get_result_test(estimator, X_test, y_test)
        for estimator in models
    ]
    return pd.DataFrame(
        metric_rows,
        columns=['Accuracy_test', 'precision_test', 'recall_test', 'f1_Test', 'roc_test'],
        index=model_names,
    )

# Score the same grid-search objects on the test data (no refitting here)
model_names = ['LogiReg', 'DecisionTree', 'RandomFore','LGBM','SVC']
models = [lr_clf, dt_clf, rf_clf, lgbm_clf, svm_clf]
results_test = get_result_df_test(
    models,
    model_names,
    X_test,
    y_test,
)
results_test

Unnamed: 0,Accuracy_test,precision_test,recall_test,f1_Test,roc_test
LogiReg,0.643384,0.001802,0.285714,0.003581,0.464951
DecisionTree,0.993271,0.0,0.0,0.0,0.497752
RandomFore,0.957065,0.015267,0.285714,0.028986,0.622144
LGBM,0.997437,0.0,0.0,0.0,0.499839
SVC,0.658122,0.002814,0.428571,0.005592,0.543605


In [12]:
# Validation and test metrics side by side, aligned on the model-name index
# (how='left' is DataFrame.join's default, spelled out for the reader).
total_results = results.join(results_test, how='left')
total_results

Unnamed: 0,Accuracy,precision,recall,f1,roc,Accuracy_test,precision_test,recall_test,f1_Test,roc_test
LogiReg,0.693323,0.001274,0.142857,0.002525,0.41884,0.643384,0.001802,0.285714,0.003581,0.464951
DecisionTree,0.993401,0.0,0.0,0.0,0.498054,0.993271,0.0,0.0,0.0,0.497752
RandomFore,0.96778,0.012821,0.142857,0.023529,0.556442,0.957065,0.015267,0.285714,0.028986,0.622144
LGBM,0.996894,0.0,0.0,0.0,0.499805,0.997437,0.0,0.0,0.0,0.499839
SVC,0.695264,0.001282,0.142857,0.002541,0.419813,0.658122,0.002814,0.428571,0.005592,0.543605
