In [1]:
# 데이터 가져오기

In [2]:
# pandas was never imported in the visible notebook; import it here so the
# cell runs on a fresh kernel (Restart & Run All).
import pandas as pd

# Load the raw insurance-fraud data set from the working directory.
socar_df = pd.read_csv("insurance_fraud_detect_data.csv")
# Widen the display limit so every column is shown in the preview below.
pd.set_option('display.max_columns', len(socar_df.columns))
socar_df.head()

Unnamed: 0,fraud_YN,car_model,sharing_type,age_group,has_previous_accident,cumulative_use_count,b2b,accident_ratio,pf_type,socarpass,socarsave,start_hour,duration,accident_hour,repair_cost,insure_cost,accident_location,car_part1,car_part2,repair_cnt,acc_type1,insurance_site_aid_YN,police_site_aid_YN,total_prsn_cnt,test_set
0,0,2,1,3,0,2,0,100,2,0,0,4,2,4,130254.7,0,1,0,0,1,0,0,0,-1,0
1,0,1,0,4,0,4,0,0,3,1,0,1,4,4,0.0,0,1,1,1,2,1,1,1,0,1
2,0,1,0,4,1,4,0,100,3,1,0,1,4,6,258130.6,12925040,3,1,0,2,1,1,1,0,1
3,0,3,1,3,0,4,0,0,2,0,0,3,3,5,1341009.0,0,3,1,0,1,0,0,0,-1,0
4,0,1,0,1,0,1,0,100,1,0,0,1,1,1,524030.6,1209410,1,0,0,2,0,0,0,-1,0


In [3]:
# train set, test set 나누기

In [4]:
# Split on the pre-assigned `test_set` flag (0 = train, 1 = test).
# .copy() makes each split an independent frame, so later in-place edits do
# not raise SettingWithCopyWarning or depend on pandas' view/copy semantics.
train_set = socar_df[socar_df["test_set"] == 0].copy()
test_set = socar_df[socar_df["test_set"] == 1].copy()

In [5]:
# Remove the split flag now that it has served its purpose.
# Rebinding (instead of inplace=True) avoids mutating a filtered slice of
# socar_df, which is what triggers SettingWithCopyWarning.
train_set = train_set.drop(columns="test_set")
test_set = test_set.drop(columns="test_set")

In [6]:
# Renumber rows from 0 in each split; the old index labels are discarded.
train_set = train_set.reset_index(drop=True)
test_set = test_set.reset_index(drop=True)

In [7]:
# Separate features from the target
target_col = "fraud_YN"
# "Unnamed: 0" appears to be the row index that was saved into the CSV
# (0, 1, 2, ... in the preview) -- TODO confirm. It carries no signal about
# the accident itself, so keep it out of the model inputs when present.
non_feature_cols = [target_col] + [
    col for col in ("Unnamed: 0",) if col in train_set.columns
]

X_features = train_set.drop(columns=non_feature_cols)
y_target = train_set[target_col]
X_test = test_set.drop(columns=non_feature_cols)
y_test = test_set[target_col]

In [8]:
# train-validation 분리 

from sklearn.model_selection import train_test_split

# Hold out 20% of the training data for validation; stratify on the target
# so both parts keep the same class proportions.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_features,
    y_target,
    test_size=0.2,
    random_state=13,
    stratify=y_target,
)

# Check the class ratio (in percent) of each split
for split_target in (y_train, y_valid):
    print(split_target.value_counts().div(split_target.shape[0]).mul(100))

0    99.73794
1     0.26206
Name: fraud_YN, dtype: float64
0    99.728261
1     0.271739
Name: fraud_YN, dtype: float64


In [9]:
# Sampling: RandomOverSampler

from imblearn.over_sampling import RandomOverSampler 

# Oversample only the training split; the validation and test data are left
# untouched so they keep the original class ratio.
ros = RandomOverSampler(random_state=13)
# fit_resample is the supported API; the old fit_sample alias was deprecated
# and is no longer available in newer imbalanced-learn releases.
X_train_over, y_train_over = ros.fit_resample(X_train, y_train)

In [10]:
X_train.shape, y_train.shape

((10303, 23), (10303,))

In [11]:
X_train_over.shape, y_train_over.shape

((20552, 23), (20552,))

In [12]:
from sklearn.pipeline import Pipeline

In [13]:
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC

# Pipeline definitions: every model shares the same preprocessing front end.

def _make_pipe(estimator):
    """Return a Pipeline of StandardScaler -> PCA(n_components=2) -> `estimator`.

    The final step is registered under the name 'clf' so that its parameters
    can be addressed as ``clf__<param>``.
    """
    return Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=2)),
        ('clf', estimator),
    ])


lr_pipe = _make_pipe(LogisticRegression(random_state=13))
dt_pipe = _make_pipe(DecisionTreeClassifier(random_state=13))
rf_pipe = _make_pipe(RandomForestClassifier(random_state=13))
lgbm_pipe = _make_pipe(LGBMClassifier(random_state=13))
svm_pipe = _make_pipe(SVC(random_state=13, kernel='linear'))

In [14]:
# Pipeline hyper-parameter settings (the 'clf__' prefix targets the final step)

dt_params = {"clf__max_depth": 4}
rf_params = {"clf__n_jobs": -1, "clf__n_estimators": 100}
lgbm_params = {
    "clf__n_estimators": 1000,
    "clf__num_leaves": 64,
    "clf__n_jobs": -1,
    "clf__boost_from_average": False,
}

dt_pipe.set_params(**dt_params)
rf_pipe.set_params(**rf_params)
# Last expression of the cell: the configured LightGBM pipeline is displayed.
lgbm_pipe.set_params(**lgbm_params)

Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=2)),
                ('clf',
                 LGBMClassifier(boost_from_average=False, n_estimators=1000,
                                num_leaves=64, random_state=13))])

In [17]:
# 성능 평가 함수

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def _binary_scores(y_true, y_pred, y_score=None):
    """Return (accuracy, precision, recall, f1, roc_auc) for one data split.

    ROC AUC is computed from `y_score` when given, otherwise from `y_pred`.
    """
    return (
        accuracy_score(y_true, y_pred),
        # zero_division=0 keeps the 0.0 result scikit-learn already returns
        # when a model predicts no positives, without the
        # UndefinedMetricWarning.
        precision_score(y_true, y_pred, zero_division=0),
        recall_score(y_true, y_pred, zero_division=0),
        f1_score(y_true, y_pred, zero_division=0),
        roc_auc_score(y_true, y_pred if y_score is None else y_score),
    )


def get_clf_eval(y_valid, y_pred_valid, y_test, y_pred_test,
                 y_score_valid=None, y_score_test=None):
    """Score one classifier on the validation and the test split.

    Parameters
    ----------
    y_valid, y_test : array-like
        True labels of the validation / test split.
    y_pred_valid, y_pred_test : array-like
        Predicted class labels for the validation / test split.
    y_score_valid, y_score_test : array-like, optional
        Continuous scores (e.g. positive-class probabilities or decision
        function values) for the validation / test split. When provided they
        are used for ROC AUC; when omitted ROC AUC falls back to the hard
        label predictions, which is the previous behaviour.

    Returns
    -------
    tuple of 10 floats
        accuracy, precision, recall, f1, roc_auc for the validation split,
        followed by the same five metrics for the test split.
    """
    return (
        _binary_scores(y_valid, y_pred_valid, y_score_valid)
        + _binary_scores(y_test, y_pred_test, y_score_test)
    )

In [18]:
# Models to compare, with the row label used for each in the summary table.
pipes = [lr_pipe, dt_pipe, rf_pipe, lgbm_pipe, svm_pipe]
index = ['LogiReg', 'DecisionTree', 'RandomFore', 'LGBM', 'SVC']

# Column order matches the tuple returned by get_clf_eval:
# five validation metrics followed by the same five test metrics (_t suffix).
col_names = [
    'Accuracy', 'precision', 'recall', 'f1', 'roc',
    'Accuracy_t', 'precision_t', 'recall_t', 'f1_t', 'roc_t',
]

tmp = []
for pipe in pipes:
    # Pipeline.fit returns the fitted pipeline itself.
    clf = pipe.fit(X_train_over, y_train_over)

    # Predict hard labels for both hold-out splits.
    y_pred_valid, y_pred_test = clf.predict(X_valid), clf.predict(X_test)

    scores = get_clf_eval(y_valid, y_pred_valid, y_test, y_pred_test)
    tmp.append(scores)

# One row per model, one column per metric.
pd.DataFrame(tmp, columns=col_names, index=index)

Unnamed: 0,Accuracy,precision,recall,f1,roc,Accuracy_t,precision_t,recall_t,f1_t,roc_t
LogiReg,0.720109,0.006906,0.714286,0.01368,0.717205,0.518424,0.001997,0.428571,0.003976,0.473598
DecisionTree,0.735248,0.002946,0.285714,0.005831,0.511094,0.755847,0.001321,0.142857,0.002618,0.450041
RandomFore,0.996894,0.0,0.0,0.0,0.499805,0.997116,0.0,0.0,0.0,0.499679
LGBM,0.996506,0.25,0.142857,0.181818,0.570845,0.994873,0.0,0.0,0.0,0.498555
SVC,0.758929,0.006431,0.571429,0.012719,0.665434,0.527715,0.002037,0.428571,0.004054,0.478255
