In [9]:
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# scaler 
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler

# models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import LinearSVC

# pipeline
from sklearn.pipeline import Pipeline 

# resampling
from imblearn.over_sampling import RandomOverSampler 
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import NearMiss
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek

# model selection
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# scoring
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# pca
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings('ignore')

In [10]:
# Road dataset
socar = pd.read_csv("insurance_fraud_detect_data.csv")
pd.set_option('display.max_columns', len(socar.columns))
socar.head()

Unnamed: 0,fraud_YN,car_model,sharing_type,age_group,has_previous_accident,cumulative_use_count,b2b,accident_ratio,pf_type,socarpass,socarsave,start_hour,duration,accident_hour,repair_cost,insure_cost,accident_location,car_part1,car_part2,repair_cnt,acc_type1,insurance_site_aid_YN,police_site_aid_YN,total_prsn_cnt,test_set
0,0,2,1,3,0,2,0,100,2,0,0,4,2,4,130254.7,0,1,0,0,1,0,0,0,-1,0
1,0,1,0,4,0,4,0,0,3,1,0,1,4,4,0.0,0,1,1,1,2,1,1,1,0,1
2,0,1,0,4,1,4,0,100,3,1,0,1,4,6,258130.6,12925040,3,1,0,2,1,1,1,0,1
3,0,3,1,3,0,4,0,0,2,0,0,3,3,5,1341009.0,0,3,1,0,1,0,0,0,-1,0
4,0,1,0,1,0,1,0,100,1,0,0,1,1,1,524030.6,1209410,1,0,0,2,0,0,0,-1,0


In [17]:
socar_copy1 = socar.copy()

In [18]:
# insurance_site_aid_YN, police_site_aid_YN 합치기
for i, j in socar_copy1.iterrows():
    if (j['insurance_site_aid_YN'] == 0) & (j['police_site_aid_YN'] == 0):
        socar_copy1.loc[i, 'insure_police'] = 0
    else :
        socar_copy1.loc[i, 'insure_police'] = 1
        
socar_copy1['insure_police'].value_counts()

0.0    11325
1.0     4675
Name: insure_police, dtype: int64

In [19]:
# Drop columns(b2b, sharing_type, has_previous_accident, socarpass, car_part2, police_site_aid_YN etc)

socar_copy1.drop(["b2b", "sharing_type", "has_previous_accident", "socarpass",\
                  "car_part2", "police_site_aid_YN", "insurance_site_aid_YN"], axis=1, inplace=True)

In [20]:
socar_copy1.shape

(16000, 19)

In [21]:
# Drop datas (car_model : 4, 5)

to_drop = socar_copy1[socar_copy1["car_model"] >= 4].index
socar_copy1.drop(to_drop, axis=0, inplace=True)
socar_copy1.shape

(15544, 19)

In [22]:
# Drop datas (duration : 5)

to_drop = socar_copy1[socar_copy1["duration"] == 5].index
socar_copy1.drop(to_drop, axis=0, inplace=True)
socar_copy1.shape

(14977, 19)

In [23]:
# Drop datas (accident_location : 3, 5)

to_drop = socar_copy1[(socar_copy1["accident_location"] == 3) | (socar_copy1["accident_location"] == 5)].index
socar_copy1.drop(to_drop, axis=0, inplace=True)
socar_copy1.shape

(14152, 19)

In [24]:
# Drop datas (total_prsn_cnt : 4, 5)

to_drop = socar_copy1[socar_copy1["total_prsn_cnt"] >= 4].index
socar_copy1.drop(to_drop, axis=0, inplace=True)
socar_copy1.shape

(14048, 19)

In [25]:
# Reclassify categorical feature (accident_ratio)

for i, j in socar_copy1.iterrows():
    if j['accident_ratio'] < 40:
        socar_copy1.loc[i, 'accident_ratio'] = 0
    else:
        socar_copy1.loc[i, 'accident_ratio'] = 1

socar_copy1['accident_ratio'].value_counts()

1    12668
0     1380
Name: accident_ratio, dtype: int64

In [26]:
# Reclassify categorical feature (repair_cnt)

for i, j in socar_copy1.iterrows():
    if j['repair_cnt'] == 0:
        socar_copy1.loc[i, 'repair_cnt'] = 0
    elif j['repair_cnt'] == 1:
        socar_copy1.loc[i, 'repair_cnt'] = 1
    elif j['repair_cnt'] == 2:
        socar_copy1.loc[i, 'repair_cnt'] = 2
    elif j['repair_cnt'] == 3:
        socar_copy1.loc[i, 'repair_cnt'] = 3
    else:
        socar_copy1.loc[i, 'repair_cnt'] = 4
        

socar_copy1['repair_cnt'].value_counts()

1    10306
2     2628
3      543
4      363
0      208
Name: repair_cnt, dtype: int64

In [27]:
socar_copy1.columns

Index(['fraud_YN', 'car_model', 'age_group', 'cumulative_use_count',
       'accident_ratio', 'pf_type', 'socarsave', 'start_hour', 'duration',
       'accident_hour', 'repair_cost', 'insure_cost', 'accident_location',
       'car_part1', 'repair_cnt', 'acc_type1', 'total_prsn_cnt', 'test_set',
       'insure_police'],
      dtype='object')

In [28]:
# OneHotEncoding

socar_copy1 = pd.get_dummies(socar_copy1, columns=['car_model', 'age_group', 'cumulative_use_count',
       'pf_type', 'start_hour', 'duration',
       'accident_hour', 'accident_location',
       'repair_cnt', 'acc_type1', 'total_prsn_cnt'])

In [29]:
# train_set, test_set 분리
train_set = socar_copy1[socar_copy1["test_set"] == 0]
test_set = socar_copy1[socar_copy1["test_set"] == 1]

# "test_set" 컬럼 삭제
train_set = train_set.drop("test_set", axis=1)
test_set = test_set.drop("test_set", axis=1)

# features와 target분리
X_features = train_set.drop("fraud_YN", axis=1)
y_target = train_set["fraud_YN"]
X_test = test_set.drop("fraud_YN", axis=1)
y_test = test_set["fraud_YN"]

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X_features, y_target, test_size=0.2,\
                                                      random_state=13, stratify=y_target)

In [31]:
# RandomOverSampler

over_sampling = RandomOverSampler(random_state=13)
X_train_over, y_train_over = over_sampling.fit_sample(X_train, y_train)

In [32]:
def get_score(X_train_over, y_train_over):    
    grids = []
    grids.append(('LogisticRe', LR))
    grids.append(('DecisionTree', DT))
    grids.append(('RandomForest', RF))
    grids.append(('LightGBM', LGBM))
    grids.append(('SVC', SVM))
    
                  
    valid_accuracy = []
    valid_recall = []
    valid_precision = []
    valid_f1 = []
    valid_roc = []
    test_accuracy = []
    test_recall = []
    test_precision = []
    test_f1 = []
    test_roc = []
    names = []
    
    for name, grid in grids:
        clf = grid
        clf.fit(X_train_over, y_train_over)
        
        y_pred_valid = clf.predict(X_valid)
        y_pred_test = clf.predict(X_test)
        print('교차검증점수: ', clf.best_score_)
        print('최적의 파라미터: ', clf.best_params_)
        print('검증세트평가: ', clf.score(X_valid, y_valid))
        print('테스트평가: ', clf.score(X_test, y_test))
    
        names.append(name)
        valid_accuracy.append(accuracy_score(y_valid, y_pred_valid))
        valid_recall.append(recall_score(y_valid, y_pred_valid))
        valid_precision.append(precision_score(y_valid, y_pred_valid))
        valid_f1.append(f1_score(y_valid, y_pred_valid))
        valid_roc.append(roc_auc_score(y_valid, y_pred_valid))
        
        test_accuracy.append(accuracy_score(y_test, y_pred_test))
        test_recall.append(recall_score(y_test, y_pred_test))
        test_precision.append(precision_score(y_test, y_pred_test))
        test_f1.append(f1_score(y_test, y_pred_test))
        test_roc.append(roc_auc_score(y_test, y_pred_test))
    
    result = pd.DataFrame({'model name':names, 
                           'valid accuracy':valid_accuracy,
                           'valid precision':valid_precision,
                           'valid recall':valid_recall,
                           'valid f1':valid_f1,
                           'valid roc':valid_roc,
                           'test accuracy':test_accuracy,
                           'test precision':test_precision,
                           'test recall':test_recall,
                           'test f1':test_f1,
                           'test roc':test_roc}) 
    return result

In [33]:
# pipeline

lr_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ("clf", LogisticRegression(random_state=13))])

dt_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ("clf", DecisionTreeClassifier(random_state=13))])

rf_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ("clf", RandomForestClassifier(random_state=13))])

lgbm_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ("clf", LGBMClassifier(random_state=13))])

svm_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ("clf", LinearSVC(random_state=13))])


# grid search params
lr_param_grid = [{'clf__C': [0.01, 0.1, 1]}]

dt_param_grid = [{'clf__criterion':['gini', 'entropy'], 'clf__max_depth':[None,2,3,4]}]

rf_param_grid = [{'clf__n_estimators': [10, 50, 100, 200]}]

lgbm_param_grid = [{'clf__n_estimators': [10, 30, 50, 100], 'clf__num_leaves': [4, 8, 16]}]

svm_param_grid = [{"clf__C": [0.01, 0.1, 1]}]


kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=13)

# grid search CV
LR = GridSearchCV(lr_pipe, lr_param_grid, cv=kfold, scoring = 'recall')
DT = GridSearchCV(dt_pipe, dt_param_grid, cv=kfold, scoring = 'recall')
RF = GridSearchCV(rf_pipe, rf_param_grid, cv=kfold, scoring = 'recall')
LGBM = GridSearchCV(lgbm_pipe, lgbm_param_grid, cv=kfold, scoring = 'recall')
SVM = GridSearchCV(svm_pipe, svm_param_grid, cv=kfold, scoring = 'recall')

In [34]:
# pca 적용x
get_score(X_train_over, y_train_over)

교차검증점수:  0.9625543296556337
최적의 파라미터:  {'clf__C': 0.1}
검증세트평가:  0.42857142857142855
테스트평가:  0.5714285714285714
교차검증점수:  1.0
최적의 파라미터:  {'clf__criterion': 'gini', 'clf__max_depth': None}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  1.0
최적의 파라미터:  {'clf__n_estimators': 10}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  1.0
최적의 파라미터:  {'clf__n_estimators': 10, 'clf__num_leaves': 16}
검증세트평가:  0.2857142857142857
테스트평가:  0.2857142857142857
교차검증점수:  0.9625543296556337
최적의 파라미터:  {'clf__C': 0.01}
검증세트평가:  0.42857142857142855
테스트평가:  0.5714285714285714


Unnamed: 0,model name,valid accuracy,valid precision,valid recall,valid f1,valid roc,test accuracy,test precision,test recall,test f1,test roc
0,LogisticRe,0.790667,0.006383,0.428571,0.012579,0.610184,0.694782,0.004678,0.571429,0.009281,0.63326
1,DecisionTree,0.991556,0.0,0.0,0.0,0.497325,0.993567,0.0,0.0,0.0,0.498029
2,RandomForest,0.996889,0.0,0.0,0.0,0.5,0.997498,0.0,0.0,0.0,0.5
3,LightGBM,0.877778,0.007353,0.285714,0.014337,0.58267,0.857756,0.005063,0.285714,0.00995,0.572452
4,SVC,0.783111,0.00616,0.428571,0.012146,0.606394,0.687634,0.004571,0.571429,0.00907,0.629677


In [35]:
# pipeline

lr_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=7)),
    ("clf", LogisticRegression(random_state=13))])

dt_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=7)),
    ("clf", DecisionTreeClassifier(random_state=13))])

rf_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=7)),
    ("clf", RandomForestClassifier(random_state=13))])

lgbm_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=7)),
    ("clf", LGBMClassifier(random_state=13))])

svm_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=7)),
    ("clf", LinearSVC(random_state=13))])


# grid search params
lr_param_grid = [{'clf__C': [0.01, 0.1, 1]}]

dt_param_grid = [{'clf__criterion':['gini', 'entropy'], 'clf__max_depth':[None,2,3,4]}]

rf_param_grid = [{'clf__n_estimators': [10, 50, 100, 200]}]

lgbm_param_grid = [{'clf__n_estimators': [10, 30, 50, 100], 'clf__num_leaves': [4, 8, 16]}]

svm_param_grid = [{"clf__C": [0.01, 0.1, 1]}]


kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=13)

# grid search CV
LR = GridSearchCV(lr_pipe, lr_param_grid, cv=kfold, scoring = 'recall')
DT = GridSearchCV(dt_pipe, dt_param_grid, cv=kfold, scoring = 'recall')
RF = GridSearchCV(rf_pipe, rf_param_grid, cv=kfold, scoring = 'recall')
LGBM = GridSearchCV(lgbm_pipe, lgbm_param_grid, cv=kfold, scoring = 'recall')
SVM = GridSearchCV(svm_pipe, svm_param_grid, cv=kfold, scoring = 'recall')

In [36]:
# pca=7
get_score(X_train_over, y_train_over)

교차검증점수:  0.5787362086258776
최적의 파라미터:  {'clf__C': 0.01}
검증세트평가:  0.5714285714285714
테스트평가:  0.8571428571428571
교차검증점수:  0.9032653516103867
최적의 파라미터:  {'clf__criterion': 'gini', 'clf__max_depth': None}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  0.9891897915970133
최적의 파라미터:  {'clf__n_estimators': 100}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  0.9672350384486794
최적의 파라미터:  {'clf__n_estimators': 50, 'clf__num_leaves': 8}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  0.5787362086258776
최적의 파라미터:  {'clf__C': 1}
검증세트평가:  0.5714285714285714
테스트평가:  0.8571428571428571


Unnamed: 0,model name,valid accuracy,valid precision,valid recall,valid f1,valid roc,test accuracy,test precision,test recall,test f1,test roc
0,LogisticRe,0.688444,0.005698,0.571429,0.011283,0.630119,0.475697,0.004076,0.857143,0.008114,0.665942
1,DecisionTree,0.994222,0.0,0.0,0.0,0.498663,0.994996,0.0,0.0,0.0,0.498746
2,RandomForest,0.996889,0.0,0.0,0.0,0.5,0.997498,0.0,0.0,0.0,0.5
3,LightGBM,0.973333,0.0,0.0,0.0,0.488185,0.974982,0.0,0.0,0.0,0.488714
4,SVC,0.691111,0.005747,0.571429,0.01138,0.631457,0.480343,0.004112,0.857143,0.008186,0.66827


In [37]:
# pipeline

lr_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=5)),
    ("clf", LogisticRegression(random_state=13))])

dt_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=5)),
    ("clf", DecisionTreeClassifier(random_state=13))])

rf_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=5)),
    ("clf", RandomForestClassifier(random_state=13))])

lgbm_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=5)),
    ("clf", LGBMClassifier(random_state=13))])

svm_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=5)),
    ("clf", LinearSVC(random_state=13))])


# grid search params
lr_param_grid = [{'clf__C': [0.01, 0.1, 1]}]

dt_param_grid = [{'clf__criterion':['gini', 'entropy'], 'clf__max_depth':[None,2,3,4]}]

rf_param_grid = [{'clf__n_estimators': [10, 50, 100, 200]}]

lgbm_param_grid = [{'clf__n_estimators': [10, 30, 50, 100], 'clf__num_leaves': [4, 8, 16]}]

svm_param_grid = [{"clf__C": [0.01, 0.1, 1]}]


kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=13)

# grid search CV
LR = GridSearchCV(lr_pipe, lr_param_grid, cv=kfold, scoring = 'recall')
DT = GridSearchCV(dt_pipe, dt_param_grid, cv=kfold, scoring = 'recall')
RF = GridSearchCV(rf_pipe, rf_param_grid, cv=kfold, scoring = 'recall')
LGBM = GridSearchCV(lgbm_pipe, lgbm_param_grid, cv=kfold, scoring = 'recall')
SVM = GridSearchCV(svm_pipe, svm_param_grid, cv=kfold, scoring = 'recall')

In [38]:
# pca=5
get_score(X_train_over, y_train_over)

교차검증점수:  0.6043686615401761
최적의 파라미터:  {'clf__C': 1}
검증세트평가:  0.42857142857142855
테스트평가:  0.8571428571428571
교차검증점수:  0.988409673464839
최적의 파라미터:  {'clf__criterion': 'gini', 'clf__max_depth': None}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  1.0
최적의 파라미터:  {'clf__n_estimators': 10}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  1.0
최적의 파라미터:  {'clf__n_estimators': 50, 'clf__num_leaves': 8}
검증세트평가:  0.0
테스트평가:  0.2857142857142857
교차검증점수:  0.5935584531371894
최적의 파라미터:  {'clf__C': 1}
검증세트평가:  0.42857142857142855
테스트평가:  0.8571428571428571


Unnamed: 0,model name,valid accuracy,valid precision,valid recall,valid f1,valid roc,test accuracy,test precision,test recall,test f1,test roc
0,LogisticRe,0.697778,0.004418,0.428571,0.008746,0.563595,0.512152,0.00438,0.857143,0.008715,0.684215
1,DecisionTree,0.996444,0.0,0.0,0.0,0.499777,0.996426,0.0,0.0,0.0,0.499463
2,RandomForest,0.996889,0.0,0.0,0.0,0.5,0.997141,0.0,0.0,0.0,0.499821
3,LightGBM,0.976889,0.0,0.0,0.0,0.489969,0.97248,0.027027,0.285714,0.049383,0.629959
4,SVC,0.703556,0.004505,0.428571,0.008915,0.566493,0.518227,0.004435,0.857143,0.008824,0.68726


In [41]:
# pipeline

lr_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LogisticRegression(random_state=13))])

dt_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", DecisionTreeClassifier(random_state=13))])

rf_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", RandomForestClassifier(random_state=13))])

lgbm_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LGBMClassifier(random_state=13))])

svm_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LinearSVC(random_state=13))])


# grid search params
lr_param_grid = [{'clf__C': [0.01, 0.1, 1]}]

dt_param_grid = [{'clf__criterion':['gini', 'entropy'], 'clf__max_depth':[None,2,3,4]}]

rf_param_grid = [{'clf__n_estimators': [10, 50, 100, 200]}]

lgbm_param_grid = [{'clf__n_estimators': [10, 30, 50, 100], 'clf__num_leaves': [4, 8, 16]}]

svm_param_grid = [{"clf__C": [0.1, 1, 10]}]


kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=13)

# grid search CV
LR = GridSearchCV(lr_pipe, lr_param_grid, cv=kfold, scoring = 'recall')
DT = GridSearchCV(dt_pipe, dt_param_grid, cv=kfold, scoring = 'recall')
RF = GridSearchCV(rf_pipe, rf_param_grid, cv=kfold, scoring = 'recall')
LGBM = GridSearchCV(lgbm_pipe, lgbm_param_grid, cv=kfold, scoring = 'recall')
SVM = GridSearchCV(svm_pipe, svm_param_grid, cv=kfold, scoring = 'recall')

In [42]:
# pca=6
get_score(X_train_over, y_train_over)

교차검증점수:  0.5528808648166722
최적의 파라미터:  {'clf__C': 0.01}
검증세트평가:  0.42857142857142855
테스트평가:  0.8571428571428571
교차검증점수:  0.8901147888108771
최적의 파라미터:  {'clf__criterion': 'gini', 'clf__max_depth': None}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  0.9893012370444668
최적의 파라미터:  {'clf__n_estimators': 100}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  0.9783795831940264
최적의 파라미터:  {'clf__n_estimators': 10, 'clf__num_leaves': 8}
검증세트평가:  0.14285714285714285
테스트평가:  0.42857142857142855
교차검증점수:  0.5645826367992868
최적의 파라미터:  {'clf__C': 10}
검증세트평가:  0.42857142857142855
테스트평가:  0.8571428571428571


Unnamed: 0,model name,valid accuracy,valid precision,valid recall,valid f1,valid roc,test accuracy,test precision,test recall,test f1,test roc
0,LogisticRe,0.687556,0.004274,0.428571,0.008463,0.558468,0.5,0.004274,0.857143,0.008505,0.678124
1,DecisionTree,0.993333,0.0,0.0,0.0,0.498217,0.995711,0.0,0.0,0.0,0.499104
2,RandomForest,0.996889,0.0,0.0,0.0,0.5,0.997498,0.0,0.0,0.0,0.5
3,LightGBM,0.880444,0.003788,0.142857,0.00738,0.512802,0.871694,0.00838,0.428571,0.016438,0.650688
4,SVC,0.685333,0.004243,0.428571,0.008403,0.557353,0.49178,0.004205,0.857143,0.008368,0.674003


In [43]:
# pipeline

lr_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LogisticRegression(random_state=13))])

dt_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", DecisionTreeClassifier(random_state=13))])

rf_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", RandomForestClassifier(random_state=13))])

lgbm_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LGBMClassifier(random_state=13))])

svm_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LinearSVC(random_state=13))])


# grid search params
lr_param_grid = [{'clf__C': [0.001, 0.01, 0.1, 1]}]

dt_param_grid = [{'clf__criterion':['gini', 'entropy'], 'clf__max_depth':[None,2,3,4]}]

rf_param_grid = [{'clf__n_estimators': [100, 200, 500, 1000]}]

lgbm_param_grid = [{'clf__n_estimators': [10, 30, 50, 100], 'clf__num_leaves': [4, 8, 16]}]

svm_param_grid = [{"clf__C": [1, 10, 100]}]


kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=13)

# grid search CV
LR = GridSearchCV(lr_pipe, lr_param_grid, cv=kfold, scoring = 'recall')
DT = GridSearchCV(dt_pipe, dt_param_grid, cv=kfold, scoring = 'recall')
RF = GridSearchCV(rf_pipe, rf_param_grid, cv=kfold, scoring = 'recall')
LGBM = GridSearchCV(lgbm_pipe, lgbm_param_grid, cv=kfold, scoring = 'recall')
SVM = GridSearchCV(svm_pipe, svm_param_grid, cv=kfold, scoring = 'recall')

In [44]:
# 하이퍼 파라미터 변경
get_score(X_train_over, y_train_over)

교차검증점수:  0.5528808648166722
최적의 파라미터:  {'clf__C': 0.001}
검증세트평가:  0.42857142857142855
테스트평가:  0.7142857142857143
교차검증점수:  0.8975816337902597
최적의 파라미터:  {'clf__criterion': 'gini', 'clf__max_depth': None}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  0.9630001114454475
최적의 파라미터:  {'clf__n_estimators': 500}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  0.9880753371224786
최적의 파라미터:  {'clf__n_estimators': 100, 'clf__num_leaves': 8}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  0.5528808648166722
최적의 파라미터:  {'clf__C': 1}
검증세트평가:  0.42857142857142855
테스트평가:  0.8571428571428571


Unnamed: 0,model name,valid accuracy,valid precision,valid recall,valid f1,valid roc,test accuracy,test precision,test recall,test f1,test roc
0,LogisticRe,0.706222,0.004545,0.428571,0.008996,0.56783,0.510722,0.003644,0.714286,0.007252,0.612249
1,DecisionTree,0.996,0.0,0.0,0.0,0.499554,0.995354,0.0,0.0,0.0,0.498925
2,RandomForest,0.996889,0.0,0.0,0.0,0.5,0.997498,0.0,0.0,0.0,0.5
3,LightGBM,0.988444,0.0,0.0,0.0,0.495765,0.988563,0.0,0.0,0.0,0.495521
4,SVC,0.689778,0.004304,0.428571,0.008523,0.559582,0.503574,0.004304,0.857143,0.008565,0.679915


In [45]:
# pipeline

lr_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LogisticRegression(random_state=13))])

dt_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", DecisionTreeClassifier(random_state=13))])

rf_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", RandomForestClassifier(random_state=13))])

lgbm_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LGBMClassifier(random_state=13))])

svm_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LinearSVC(random_state=13))])


# grid search params
lr_param_grid = [{'clf__C': [0.01, 0.05, 0.1, 1]}]

dt_param_grid = [{'clf__criterion':['gini', 'entropy'], 'clf__max_depth':[None,2,3,4]}]

rf_param_grid = [{'clf__n_estimators': [100, 200, 500]}]

lgbm_param_grid = [{'clf__n_estimators': [10, 30, 50, 100], 'clf__num_leaves': [4, 8, 16]}]

svm_param_grid = [{"clf__C": [1, 10, 100], "clf__penalty": ["l1", "l2"]}]


kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=13)

# grid search CV
LR = GridSearchCV(lr_pipe, lr_param_grid, cv=kfold, scoring = 'recall')
DT = GridSearchCV(dt_pipe, dt_param_grid, cv=kfold, scoring = 'recall')
RF = GridSearchCV(rf_pipe, rf_param_grid, cv=kfold, scoring = 'recall')
LGBM = GridSearchCV(lgbm_pipe, lgbm_param_grid, cv=kfold, scoring = 'recall')
SVM = GridSearchCV(svm_pipe, svm_param_grid, cv=kfold, scoring = 'recall')

In [46]:
# 하이퍼 파라미터 변경
get_score(X_train_over, y_train_over)

교차검증점수:  0.5528808648166722
최적의 파라미터:  {'clf__C': 0.01}
검증세트평가:  0.42857142857142855
테스트평가:  0.8571428571428571
교차검증점수:  0.8775214532486348
최적의 파라미터:  {'clf__criterion': 'entropy', 'clf__max_depth': None}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  1.0
최적의 파라미터:  {'clf__n_estimators': 200}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  0.9882982280173854
최적의 파라미터:  {'clf__n_estimators': 50, 'clf__num_leaves': 8}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  0.5539953192912069
최적의 파라미터:  {'clf__C': 10, 'clf__penalty': 'l2'}
검증세트평가:  0.42857142857142855
테스트평가:  0.8571428571428571


Unnamed: 0,model name,valid accuracy,valid precision,valid recall,valid f1,valid roc,test accuracy,test precision,test recall,test f1,test roc
0,LogisticRe,0.688444,0.004286,0.428571,0.008487,0.558913,0.5,0.004274,0.857143,0.008505,0.678124
1,DecisionTree,0.991556,0.0,0.0,0.0,0.497325,0.995354,0.0,0.0,0.0,0.498925
2,RandomForest,0.996889,0.0,0.0,0.0,0.5,0.997498,0.0,0.0,0.0,0.5
3,LightGBM,0.974222,0.0,0.0,0.0,0.488631,0.976054,0.0,0.0,0.0,0.489251
4,SVC,0.684889,0.004237,0.428571,0.008392,0.55713,0.493567,0.004219,0.857143,0.008397,0.674899


In [47]:
# pipeline

lr_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LogisticRegression(random_state=13))])

dt_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", DecisionTreeClassifier(random_state=13))])

rf_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", RandomForestClassifier(random_state=13))])

lgbm_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LGBMClassifier(random_state=13))])

svm_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LinearSVC(random_state=13))])


# grid search params
lr_param_grid = [{'clf__C': [0.01, 0.1, 1]}]

dt_param_grid = [{'clf__criterion':['gini', 'entropy'], 'clf__max_depth':[None,2,3,4]}]

rf_param_grid = [{'clf__n_estimators': [100, 200, 500]}]

lgbm_param_grid = [{'clf__n_estimators': [10, 30, 50, 100], 'clf__num_leaves': [4, 8, 16]}]

svm_param_grid = [{"clf__C": [1, 10, 100], "clf__penalty": ["l1", "l2"],\
                  "clf__loss": ["hinge", "squared_hinge"]}]


kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=13)

# grid search CV
LR = GridSearchCV(lr_pipe, lr_param_grid, cv=kfold, scoring = 'recall')
DT = GridSearchCV(dt_pipe, dt_param_grid, cv=kfold, scoring = 'recall')
RF = GridSearchCV(rf_pipe, rf_param_grid, cv=kfold, scoring = 'recall')
LGBM = GridSearchCV(lgbm_pipe, lgbm_param_grid, cv=kfold, scoring = 'recall')
SVM = GridSearchCV(svm_pipe, svm_param_grid, cv=kfold, scoring = 'recall')

In [48]:
# 하이퍼 파라미터 변경 (svm loss는 안 넣는 것이 낫다)
get_score(X_train_over, y_train_over)

교차검증점수:  0.5528808648166722
최적의 파라미터:  {'clf__C': 0.01}
검증세트평가:  0.42857142857142855
테스트평가:  0.8571428571428571
교차검증점수:  0.9391507856904046
최적의 파라미터:  {'clf__criterion': 'gini', 'clf__max_depth': None}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  0.9893012370444668
최적의 파라미터:  {'clf__n_estimators': 500}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  1.0
최적의 파라미터:  {'clf__n_estimators': 50, 'clf__num_leaves': 8}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  0.5808536721274936
최적의 파라미터:  {'clf__C': 100, 'clf__loss': 'squared_hinge', 'clf__penalty': 'l2'}
검증세트평가:  0.42857142857142855
테스트평가:  1.0


Unnamed: 0,model name,valid accuracy,valid precision,valid recall,valid f1,valid roc,test accuracy,test precision,test recall,test f1,test roc
0,LogisticRe,0.686667,0.004261,0.428571,0.008439,0.558022,0.499643,0.00427,0.857143,0.008499,0.677944
1,DecisionTree,0.991556,0.0,0.0,0.0,0.497325,0.994639,0.0,0.0,0.0,0.498567
2,RandomForest,0.996889,0.0,0.0,0.0,0.5,0.997498,0.0,0.0,0.0,0.5
3,LightGBM,0.977778,0.0,0.0,0.0,0.490415,0.976412,0.0,0.0,0.0,0.48943
4,SVC,0.556444,0.003009,0.428571,0.005976,0.492707,0.454253,0.004563,1.0,0.009085,0.726442


In [49]:
# pipeline

lr_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LogisticRegression(random_state=13))])

dt_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", DecisionTreeClassifier(random_state=13))])

rf_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", RandomForestClassifier(random_state=13))])

lgbm_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LGBMClassifier(random_state=13))])

svm_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LinearSVC(random_state=13))])


# grid search params
lr_param_grid = [{'clf__C': [0.01, 0.1, 1]}]

dt_param_grid = [{'clf__criterion':['gini', 'entropy'], 'clf__max_depth':[None,2,3,4]}]

rf_param_grid = [{'clf__n_estimators': [100, 200, 500]}]

lgbm_param_grid = [{'clf__n_estimators': [10, 30, 50, 100], 'clf__num_leaves': [4, 8, 16]}]

svm_param_grid = [{"clf__C": [0.1, 1, 10, 100, 200], "clf__penalty": ["l1", "l2"],\
                  "clf__loss": ["hinge", "squared_hinge"]}]


kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=13)

# grid search CV
LR = GridSearchCV(lr_pipe, lr_param_grid, cv=kfold, scoring = 'recall')
DT = GridSearchCV(dt_pipe, dt_param_grid, cv=kfold, scoring = 'recall')
RF = GridSearchCV(rf_pipe, rf_param_grid, cv=kfold, scoring = 'recall')
LGBM = GridSearchCV(lgbm_pipe, lgbm_param_grid, cv=kfold, scoring = 'recall')
SVM = GridSearchCV(svm_pipe, svm_param_grid, cv=kfold, scoring = 'recall')

In [50]:
# 하이퍼 파라미터 변경
get_score(X_train_over, y_train_over)

교차검증점수:  0.5528808648166722
최적의 파라미터:  {'clf__C': 0.01}
검증세트평가:  0.42857142857142855
테스트평가:  0.8571428571428571
교차검증점수:  0.9010364426613172
최적의 파라미터:  {'clf__criterion': 'gini', 'clf__max_depth': None}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  0.9745904379806086
최적의 파라미터:  {'clf__n_estimators': 100}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  0.9760392287975036
최적의 파라미터:  {'clf__n_estimators': 50, 'clf__num_leaves': 8}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  0.6439317953861584
최적의 파라미터:  {'clf__C': 200, 'clf__loss': 'squared_hinge', 'clf__penalty': 'l2'}
검증세트평가:  0.5714285714285714
테스트평가:  1.0


Unnamed: 0,model name,valid accuracy,valid precision,valid recall,valid f1,valid roc,test accuracy,test precision,test recall,test f1,test roc
0,LogisticRe,0.687556,0.004274,0.428571,0.008463,0.558468,0.499285,0.004267,0.857143,0.008493,0.677765
1,DecisionTree,0.991556,0.0,0.0,0.0,0.497325,0.995354,0.0,0.0,0.0,0.498925
2,RandomForest,0.996889,0.0,0.0,0.0,0.5,0.997498,0.0,0.0,0.0,0.5
3,LightGBM,0.974222,0.0,0.0,0.0,0.488631,0.974625,0.0,0.0,0.0,0.488535
4,SVC,0.134222,0.002052,0.571429,0.00409,0.352143,0.113653,0.002815,1.0,0.005613,0.555715


In [51]:
# pipeline

lr_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LogisticRegression(random_state=13))])

dt_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", DecisionTreeClassifier(random_state=13))])

rf_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", RandomForestClassifier(random_state=13))])

lgbm_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LGBMClassifier(random_state=13))])

svm_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LinearSVC(random_state=13))])


# grid search params
lr_param_grid = [{'clf__C': [0.001, 0.01, 0.1, 1]}]

dt_param_grid = [{'clf__criterion':['gini', 'entropy'], 'clf__max_depth':[None,2,3,4]}]

rf_param_grid = [{'clf__n_estimators': [100, 200, 500]}]

lgbm_param_grid = [{'clf__n_estimators': [10, 30, 50, 100], 'clf__num_leaves': [4, 8, 16]}]

svm_param_grid = [{"clf__C": [0.1, 1, 10, 50], "clf__penalty": ["l1", "l2"],\
                  "clf__loss": ["hinge", "squared_hinge"]}]


kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=13)

# grid search CV
LR = GridSearchCV(lr_pipe, lr_param_grid, cv=kfold, scoring = 'recall')
DT = GridSearchCV(dt_pipe, dt_param_grid, cv=kfold, scoring = 'recall')
RF = GridSearchCV(rf_pipe, rf_param_grid, cv=kfold, scoring = 'recall')
LGBM = GridSearchCV(lgbm_pipe, lgbm_param_grid, cv=kfold, scoring = 'recall')
SVM = GridSearchCV(svm_pipe, svm_param_grid, cv=kfold, scoring = 'recall')

In [52]:
# 하이퍼 파라미터 변경
get_score(X_train_over, y_train_over)

교차검증점수:  0.5528808648166722
최적의 파라미터:  {'clf__C': 0.001}
검증세트평가:  0.42857142857142855
테스트평가:  0.7142857142857143
교차검증점수:  0.8679371447676362
최적의 파라미터:  {'clf__criterion': 'entropy', 'clf__max_depth': None}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  0.9893012370444668
최적의 파라미터:  {'clf__n_estimators': 500}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  0.9767079014822245
최적의 파라미터:  {'clf__n_estimators': 30, 'clf__num_leaves': 8}
검증세트평가:  0.0
테스트평가:  0.2857142857142857
교차검증점수:  0.5528808648166722
최적의 파라미터:  {'clf__C': 0.1, 'clf__loss': 'squared_hinge', 'clf__penalty': 'l2'}
검증세트평가:  0.42857142857142855
테스트평가:  0.8571428571428571


Unnamed: 0,model name,valid accuracy,valid precision,valid recall,valid f1,valid roc,test accuracy,test precision,test recall,test f1,test roc
0,LogisticRe,0.705333,0.004532,0.428571,0.008969,0.567384,0.510365,0.003642,0.714286,0.007246,0.612069
1,DecisionTree,0.993333,0.0,0.0,0.0,0.498217,0.995354,0.0,0.0,0.0,0.498925
2,RandomForest,0.996889,0.0,0.0,0.0,0.5,0.997498,0.0,0.0,0.0,0.5
3,LightGBM,0.936444,0.0,0.0,0.0,0.469683,0.9396,0.012048,0.285714,0.023121,0.613477
4,SVC,0.691111,0.004323,0.428571,0.008559,0.560251,0.495354,0.004234,0.857143,0.008427,0.675795


In [53]:
# pipeline

lr_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LogisticRegression(random_state=13))])

dt_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", DecisionTreeClassifier(random_state=13))])

rf_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", RandomForestClassifier(random_state=13))])

lgbm_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LGBMClassifier(random_state=13))])

svm_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LinearSVC(random_state=13))])


# grid search params
lr_param_grid = [{'clf__C': [0.001, 0.01, 0.1, 1]}]

dt_param_grid = [{'clf__criterion':['gini', 'entropy'], 'clf__max_depth':[None,2,3,4]}]

rf_param_grid = [{'clf__n_estimators': [100, 200, 500]}]

lgbm_param_grid = [{'clf__n_estimators': [10, 30, 50, 100], 'clf__num_leaves': [4, 8, 16]}]

svm_param_grid = [{"clf__C": [1, 10, 50], "clf__penalty": ["l1", "l2"],\
                  "clf__loss": ["hinge", "squared_hinge"]}]


kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=13)

# grid search CV
LR = GridSearchCV(lr_pipe, lr_param_grid, cv=kfold, scoring = 'recall')
DT = GridSearchCV(dt_pipe, dt_param_grid, cv=kfold, scoring = 'recall')
RF = GridSearchCV(rf_pipe, rf_param_grid, cv=kfold, scoring = 'recall')
LGBM = GridSearchCV(lgbm_pipe, lgbm_param_grid, cv=kfold, scoring = 'recall')
SVM = GridSearchCV(svm_pipe, svm_param_grid, cv=kfold, scoring = 'recall')

In [54]:
# 하이퍼 파라미터 변경
get_score(X_train_over, y_train_over)

교차검증점수:  0.5665886548534492
최적의 파라미터:  {'clf__C': 0.001}
검증세트평가:  0.42857142857142855
테스트평가:  0.7142857142857143
교차검증점수:  0.900256324529143
최적의 파라미터:  {'clf__criterion': 'entropy', 'clf__max_depth': None}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  1.0
최적의 파라미터:  {'clf__n_estimators': 200}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  0.9882982280173854
최적의 파라미터:  {'clf__n_estimators': 10, 'clf__num_leaves': 8}
검증세트평가:  0.0
테스트평가:  0.42857142857142855
교차검증점수:  0.559456146216427
최적의 파라미터:  {'clf__C': 50, 'clf__loss': 'squared_hinge', 'clf__penalty': 'l2'}
검증세트평가:  0.42857142857142855
테스트평가:  0.8571428571428571


Unnamed: 0,model name,valid accuracy,valid precision,valid recall,valid f1,valid roc,test accuracy,test precision,test recall,test f1,test roc
0,LogisticRe,0.706667,0.004552,0.428571,0.009009,0.568053,0.510722,0.003644,0.714286,0.007252,0.612249
1,DecisionTree,0.992444,0.0,0.0,0.0,0.497771,0.996426,0.0,0.0,0.0,0.499463
2,RandomForest,0.996889,0.0,0.0,0.0,0.5,0.997498,0.0,0.0,0.0,0.5
3,LightGBM,0.872444,0.0,0.0,0.0,0.437584,0.869192,0.008219,0.428571,0.016129,0.649434
4,SVC,0.667556,0.004016,0.428571,0.007958,0.548436,0.460686,0.003963,0.857143,0.00789,0.658417


In [55]:
# pipeline

lr_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LogisticRegression(random_state=13))])

dt_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", DecisionTreeClassifier(random_state=13))])

rf_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", RandomForestClassifier(random_state=13))])

lgbm_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LGBMClassifier(random_state=13))])

svm_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LinearSVC(random_state=13))])


# grid search params
lr_param_grid = [{'clf__C': [0.001, 0.01, 0.1, 1]}]

dt_param_grid = [{'clf__criterion':['gini', 'entropy'], 'clf__max_depth':[None,2,3,4]}]

rf_param_grid = [{'clf__n_estimators': [100, 200, 500]}]

lgbm_param_grid = [{'clf__n_estimators': [10, 30, 50, 100], 'clf__num_leaves': [4, 8, 16]}]

svm_param_grid = [{"clf__C": [1, 10, 80, 90], "clf__penalty": ["l1", "l2"],\
                  "clf__loss": ["hinge", "squared_hinge"]}]


kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=13)

# grid search CV
LR = GridSearchCV(lr_pipe, lr_param_grid, cv=kfold, scoring = 'recall')
DT = GridSearchCV(dt_pipe, dt_param_grid, cv=kfold, scoring = 'recall')
RF = GridSearchCV(rf_pipe, rf_param_grid, cv=kfold, scoring = 'recall')
LGBM = GridSearchCV(lgbm_pipe, lgbm_param_grid, cv=kfold, scoring = 'recall')
SVM = GridSearchCV(svm_pipe, svm_param_grid, cv=kfold, scoring = 'recall')

In [56]:
# 하이퍼 파라미터 변경
get_score(X_train_over, y_train_over)

교차검증점수:  0.5528808648166722
최적의 파라미터:  {'clf__C': 0.001}
검증세트평가:  0.42857142857142855
테스트평가:  0.7142857142857143
교차검증점수:  0.8728407444555891
최적의 파라미터:  {'clf__criterion': 'entropy', 'clf__max_depth': None}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  0.9866265463055833
최적의 파라미터:  {'clf__n_estimators': 500}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  0.9550874846762509
최적의 파라미터:  {'clf__n_estimators': 50, 'clf__num_leaves': 4}
검증세트평가:  0.14285714285714285
테스트평가:  0.2857142857142857
교차검증점수:  0.554441101081021
최적의 파라미터:  {'clf__C': 10, 'clf__loss': 'squared_hinge', 'clf__penalty': 'l2'}
검증세트평가:  0.42857142857142855
테스트평가:  0.8571428571428571


Unnamed: 0,model name,valid accuracy,valid precision,valid recall,valid f1,valid roc,test accuracy,test precision,test recall,test f1,test roc
0,LogisticRe,0.707111,0.004559,0.428571,0.009023,0.568276,0.510722,0.003644,0.714286,0.007252,0.612249
1,DecisionTree,0.990667,0.0,0.0,0.0,0.496879,0.996069,0.0,0.0,0.0,0.499283
2,RandomForest,0.996889,0.0,0.0,0.0,0.5,0.997498,0.0,0.0,0.0,0.5
3,LightGBM,0.924889,0.006098,0.142857,0.011696,0.535093,0.919585,0.009009,0.285714,0.017467,0.603445
4,SVC,0.686222,0.004255,0.428571,0.008427,0.557799,0.494996,0.004231,0.857143,0.008421,0.675615


In [57]:
# pipeline

lr_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LogisticRegression(random_state=13))])

dt_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", DecisionTreeClassifier(random_state=13))])

rf_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", RandomForestClassifier(random_state=13))])

lgbm_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LGBMClassifier(random_state=13))])

svm_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=6)),
    ("clf", LinearSVC(random_state=13))])


# grid search params
lr_param_grid = [{'clf__C': [0.001, 0.01, 0.1, 1]}]

dt_param_grid = [{'clf__criterion':['gini', 'entropy'], 'clf__max_depth':[None,2,3,4]}]

rf_param_grid = [{'clf__n_estimators': [100, 200, 500]}]

lgbm_param_grid = [{'clf__n_estimators': [10, 30, 50, 100], 'clf__num_leaves': [4, 8, 16]}]

svm_param_grid = [{"clf__C": [80, 90], "clf__penalty": ["l1", "l2"],\
                  "clf__loss": ["hinge", "squared_hinge"]}]


kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=13)

# grid search CV
LR = GridSearchCV(lr_pipe, lr_param_grid, cv=kfold, scoring = 'recall')
DT = GridSearchCV(dt_pipe, dt_param_grid, cv=kfold, scoring = 'recall')
RF = GridSearchCV(rf_pipe, rf_param_grid, cv=kfold, scoring = 'recall')
LGBM = GridSearchCV(lgbm_pipe, lgbm_param_grid, cv=kfold, scoring = 'recall')
SVM = GridSearchCV(svm_pipe, svm_param_grid, cv=kfold, scoring = 'recall')

In [58]:
# 하이퍼 파라미터 변경
get_score(X_train_over, y_train_over)

교차검증점수:  0.5528808648166722
최적의 파라미터:  {'clf__C': 0.001}
검증세트평가:  0.42857142857142855
테스트평가:  0.7142857142857143
교차검증점수:  0.868828708347264
최적의 파라미터:  {'clf__criterion': 'gini', 'clf__max_depth': None}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  1.0
최적의 파라미터:  {'clf__n_estimators': 200}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  1.0
최적의 파라미터:  {'clf__n_estimators': 50, 'clf__num_leaves': 8}
검증세트평가:  0.0
테스트평가:  0.14285714285714285
교차검증점수:  0.5661428730636354
최적의 파라미터:  {'clf__C': 80, 'clf__loss': 'squared_hinge', 'clf__penalty': 'l2'}
검증세트평가:  0.2857142857142857
테스트평가:  0.42857142857142855


Unnamed: 0,model name,valid accuracy,valid precision,valid recall,valid f1,valid roc,test accuracy,test precision,test recall,test f1,test roc
0,LogisticRe,0.706222,0.004545,0.428571,0.008996,0.56783,0.510365,0.003642,0.714286,0.007246,0.612069
1,DecisionTree,0.992889,0.0,0.0,0.0,0.497994,0.995711,0.0,0.0,0.0,0.499104
2,RandomForest,0.996889,0.0,0.0,0.0,0.5,0.997498,0.0,0.0,0.0,0.5
3,LightGBM,0.972444,0.0,0.0,0.0,0.48774,0.976769,0.016667,0.142857,0.029851,0.560859
4,SVC,0.812444,0.004773,0.285714,0.00939,0.549901,0.644746,0.003021,0.428571,0.006,0.53693


In [59]:
# pipeline

lr_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=7)),
    ("clf", LogisticRegression(random_state=13))])

dt_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=7)),
    ("clf", DecisionTreeClassifier(random_state=13))])

rf_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=7)),
    ("clf", RandomForestClassifier(random_state=13))])

lgbm_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=7)),
    ("clf", LGBMClassifier(random_state=13))])

svm_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=7)),
    ("clf", LinearSVC(random_state=13))])


# grid search params
lr_param_grid = [{'clf__C': [0.01, 0.1, 1]}]

dt_param_grid = [{'clf__criterion':['gini', 'entropy'], 'clf__max_depth':[None,2,3,4]}]

rf_param_grid = [{'clf__n_estimators': [100, 200, 500]}]

lgbm_param_grid = [{'clf__n_estimators': [10, 30, 50, 100], 'clf__num_leaves': [4, 8, 16]}]

svm_param_grid = [{"clf__C": [1, 10, 100], "clf__penalty": ["l1", "l2"],\
                  "clf__loss": ["hinge", "squared_hinge"]}]


kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=13)

# grid search CV
LR = GridSearchCV(lr_pipe, lr_param_grid, cv=kfold, scoring = 'recall')
DT = GridSearchCV(dt_pipe, dt_param_grid, cv=kfold, scoring = 'recall')
RF = GridSearchCV(rf_pipe, rf_param_grid, cv=kfold, scoring = 'recall')
LGBM = GridSearchCV(lgbm_pipe, lgbm_param_grid, cv=kfold, scoring = 'recall')
SVM = GridSearchCV(svm_pipe, svm_param_grid, cv=kfold, scoring = 'recall')

In [60]:
# pca=7, 하이퍼 파라미터 변경
get_score(X_train_over, y_train_over)

교차검증점수:  0.5787362086258776
최적의 파라미터:  {'clf__C': 0.1}
검증세트평가:  0.5714285714285714
테스트평가:  0.8571428571428571
교차검증점수:  0.9167502507522567
최적의 파라미터:  {'clf__criterion': 'entropy', 'clf__max_depth': None}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  0.9767079014822245
최적의 파라미터:  {'clf__n_estimators': 200}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  0.9874066644377577
최적의 파라미터:  {'clf__n_estimators': 100, 'clf__num_leaves': 16}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  0.6089379248857684
최적의 파라미터:  {'clf__C': 10, 'clf__loss': 'hinge', 'clf__penalty': 'l2'}
검증세트평가:  0.42857142857142855
테스트평가:  0.5714285714285714


Unnamed: 0,model name,valid accuracy,valid precision,valid recall,valid f1,valid roc,test accuracy,test precision,test recall,test f1,test roc
0,LogisticRe,0.686222,0.005658,0.571429,0.011204,0.629005,0.474267,0.004065,0.857143,0.008092,0.665225
1,DecisionTree,0.994222,0.0,0.0,0.0,0.498663,0.994639,0.0,0.0,0.0,0.498567
2,RandomForest,0.996889,0.0,0.0,0.0,0.5,0.997498,0.0,0.0,0.0,0.5
3,LightGBM,0.996444,0.0,0.0,0.0,0.499777,0.997498,0.0,0.0,0.0,0.5
4,SVC,0.699556,0.004444,0.428571,0.008798,0.564486,0.491065,0.002807,0.571429,0.005587,0.531146


In [61]:
# pipeline

lr_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=7)),
    ("clf", LogisticRegression(random_state=13))])

dt_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=7)),
    ("clf", DecisionTreeClassifier(random_state=13))])

rf_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=7)),
    ("clf", RandomForestClassifier(random_state=13))])

lgbm_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=7)),
    ("clf", LGBMClassifier(random_state=13))])

svm_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=7)),
    ("clf", LinearSVC(random_state=13))])


# grid search params
lr_param_grid = [{'clf__C': [0.01, 0.1, 1]}]

dt_param_grid = [{'clf__criterion':['gini', 'entropy'], 'clf__max_depth':[None,2,3,4]}]

rf_param_grid = [{'clf__n_estimators': [100, 200, 500]}]

lgbm_param_grid = [{'clf__n_estimators': [10, 30, 50, 100], 'clf__num_leaves': [4, 8, 16]}]

svm_param_grid = [{"clf__C": [100], "clf__penalty": ["l2"],\
                  "clf__loss": ["squared_hinge"]}]


kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=13)

# grid search CV
LR = GridSearchCV(lr_pipe, lr_param_grid, cv=kfold, scoring = 'recall')
DT = GridSearchCV(dt_pipe, dt_param_grid, cv=kfold, scoring = 'recall')
RF = GridSearchCV(rf_pipe, rf_param_grid, cv=kfold, scoring = 'recall')
LGBM = GridSearchCV(lgbm_pipe, lgbm_param_grid, cv=kfold, scoring = 'recall')
SVM = GridSearchCV(svm_pipe, svm_param_grid, cv=kfold, scoring = 'recall')

In [62]:
get_score(X_train_over, y_train_over)

교차검증점수:  0.5682603365652513
최적의 파라미터:  {'clf__C': 0.1}
검증세트평가:  0.5714285714285714
테스트평가:  0.8571428571428571
교차검증점수:  0.8560124818901147
최적의 파라미터:  {'clf__criterion': 'entropy', 'clf__max_depth': None}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  1.0
최적의 파라미터:  {'clf__n_estimators': 500}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  0.9891897915970133
최적의 파라미터:  {'clf__n_estimators': 100, 'clf__num_leaves': 8}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  0.4585980162710354
최적의 파라미터:  {'clf__C': 100, 'clf__loss': 'squared_hinge', 'clf__penalty': 'l2'}
검증세트평가:  0.7142857142857143
테스트평가:  1.0


Unnamed: 0,model name,valid accuracy,valid precision,valid recall,valid f1,valid roc,test accuracy,test precision,test recall,test f1,test roc
0,LogisticRe,0.686222,0.005658,0.571429,0.011204,0.629005,0.474982,0.004071,0.857143,0.008103,0.665583
1,DecisionTree,0.990667,0.0,0.0,0.0,0.496879,0.994996,0.0,0.0,0.0,0.498746
2,RandomForest,0.996889,0.0,0.0,0.0,0.5,0.997498,0.0,0.0,0.0,0.5
3,LightGBM,0.988889,0.0,0.0,0.0,0.495988,0.991065,0.0,0.0,0.0,0.496775
4,SVC,0.294222,0.003143,0.714286,0.006258,0.503598,0.175125,0.003024,1.0,0.006029,0.586528


In [63]:
# pipeline

lr_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=8)),
    ("clf", LogisticRegression(random_state=13))])

dt_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=8)),
    ("clf", DecisionTreeClassifier(random_state=13))])

rf_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=8)),
    ("clf", RandomForestClassifier(random_state=13))])

lgbm_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=8)),
    ("clf", LGBMClassifier(random_state=13))])

svm_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=8)),
    ("clf", LinearSVC(random_state=13))])


# grid search params
lr_param_grid = [{'clf__C': [0.01, 0.1, 1]}]

dt_param_grid = [{'clf__criterion':['gini', 'entropy'], 'clf__max_depth':[None,2,3,4]}]

rf_param_grid = [{'clf__n_estimators': [100, 200, 500]}]

lgbm_param_grid = [{'clf__n_estimators': [10, 30, 50, 100], 'clf__num_leaves': [4, 8, 16]}]

svm_param_grid = [{"clf__C": [100], "clf__penalty": ["l2"],\
                  "clf__loss": ["squared_hinge"]}]


kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=13)

# grid search CV
LR = GridSearchCV(lr_pipe, lr_param_grid, cv=kfold, scoring = 'recall')
DT = GridSearchCV(dt_pipe, dt_param_grid, cv=kfold, scoring = 'recall')
RF = GridSearchCV(rf_pipe, rf_param_grid, cv=kfold, scoring = 'recall')
LGBM = GridSearchCV(lgbm_pipe, lgbm_param_grid, cv=kfold, scoring = 'recall')
SVM = GridSearchCV(svm_pipe, svm_param_grid, cv=kfold, scoring = 'recall')

In [64]:
# pca=8
get_score(X_train_over, y_train_over)

교차검증점수:  0.7004346372450686
최적의 파라미터:  {'clf__C': 0.1}
검증세트평가:  0.2857142857142857
테스트평가:  0.7142857142857143
교차검증점수:  0.8914521341803187
최적의 파라미터:  {'clf__criterion': 'gini', 'clf__max_depth': None}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  1.0
최적의 파라미터:  {'clf__n_estimators': 200}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  0.9895241279393737
최적의 파라미터:  {'clf__n_estimators': 50, 'clf__num_leaves': 8}
검증세트평가:  0.0
테스트평가:  0.14285714285714285
교차검증점수:  0.6681154574835618
최적의 파라미터:  {'clf__C': 100, 'clf__loss': 'squared_hinge', 'clf__penalty': 'l2'}
검증세트평가:  0.0
테스트평가:  0.2857142857142857


Unnamed: 0,model name,valid accuracy,valid precision,valid recall,valid f1,valid roc,test accuracy,test precision,test recall,test f1,test roc
0,LogisticRe,0.695111,0.002928,0.285714,0.005797,0.491052,0.537884,0.003858,0.714286,0.007675,0.625864
1,DecisionTree,0.995111,0.0,0.0,0.0,0.499108,0.994639,0.0,0.0,0.0,0.498567
2,RandomForest,0.996889,0.0,0.0,0.0,0.5,0.997498,0.0,0.0,0.0,0.5
3,LightGBM,0.981333,0.0,0.0,0.0,0.492198,0.978556,0.018182,0.142857,0.032258,0.561755
4,SVC,0.823111,0.0,0.0,0.0,0.41284,0.728735,0.002646,0.285714,0.005242,0.50778


In [65]:
# pipeline

lr_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=8)),
    ("clf", LogisticRegression(random_state=13))])

dt_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=8)),
    ("clf", DecisionTreeClassifier(random_state=13))])

rf_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=8)),
    ("clf", RandomForestClassifier(random_state=13))])

lgbm_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=8)),
    ("clf", LGBMClassifier(random_state=13))])

svm_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=8)),
    ("clf", LinearSVC(random_state=13))])


# grid search params
lr_param_grid = [{'clf__C': [0.01, 0.1, 1]}]

dt_param_grid = [{'clf__criterion':['gini', 'entropy'], 'clf__max_depth':[None,2,3,4]}]

rf_param_grid = [{'clf__n_estimators': [100, 200, 500]}]

lgbm_param_grid = [{'clf__n_estimators': [10, 30, 50, 100], 'clf__num_leaves': [4, 8, 16]}]

svm_param_grid = [{"clf__C": [1, 10, 100], "clf__penalty": ["l1", "l2"],\
                  "clf__loss": ["hinge", "squared_hinge"]}]


kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=13)

# grid search CV
LR = GridSearchCV(lr_pipe, lr_param_grid, cv=kfold, scoring = 'recall')
DT = GridSearchCV(dt_pipe, dt_param_grid, cv=kfold, scoring = 'recall')
RF = GridSearchCV(rf_pipe, rf_param_grid, cv=kfold, scoring = 'recall')
LGBM = GridSearchCV(lgbm_pipe, lgbm_param_grid, cv=kfold, scoring = 'recall')
SVM = GridSearchCV(svm_pipe, svm_param_grid, cv=kfold, scoring = 'recall')

In [66]:
# pca=8, 하이퍼 파라미터 변경
get_score(X_train_over, y_train_over)

교차검증점수:  0.700768973587429
최적의 파라미터:  {'clf__C': 0.01}
검증세트평가:  0.2857142857142857
테스트평가:  0.7142857142857143
교차검증점수:  0.9263345592332554
최적의 파라미터:  {'clf__criterion': 'entropy', 'clf__max_depth': None}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  1.0
최적의 파라미터:  {'clf__n_estimators': 200}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  0.9866265463055833
최적의 파라미터:  {'clf__n_estimators': 100, 'clf__num_leaves': 4}
검증세트평가:  0.0
테스트평가:  0.2857142857142857
교차검증점수:  0.7164827816783683
최적의 파라미터:  {'clf__C': 10, 'clf__loss': 'hinge', 'clf__penalty': 'l2'}
검증세트평가:  0.42857142857142855
테스트평가:  0.5714285714285714


Unnamed: 0,model name,valid accuracy,valid precision,valid recall,valid f1,valid roc,test accuracy,test precision,test recall,test f1,test roc
0,LogisticRe,0.692,0.002899,0.285714,0.005739,0.489491,0.536097,0.003843,0.714286,0.007645,0.624968
1,DecisionTree,0.995556,0.0,0.0,0.0,0.499331,0.993209,0.0,0.0,0.0,0.49785
2,RandomForest,0.996889,0.0,0.0,0.0,0.5,0.997498,0.0,0.0,0.0,0.5
3,LightGBM,0.963556,0.0,0.0,0.0,0.483281,0.960686,0.018692,0.285714,0.035088,0.624047
4,SVC,0.698667,0.004431,0.428571,0.008772,0.564041,0.558256,0.003234,0.571429,0.006431,0.564826


In [67]:
# pipeline

lr_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=9)),
    ("clf", LogisticRegression(random_state=13))])

dt_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=9)),
    ("clf", DecisionTreeClassifier(random_state=13))])

rf_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=9)),
    ("clf", RandomForestClassifier(random_state=13))])

lgbm_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=9)),
    ("clf", LGBMClassifier(random_state=13))])

svm_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ('pca', PCA(n_components=9)),
    ("clf", LinearSVC(random_state=13))])


# grid search params
lr_param_grid = [{'clf__C': [0.01, 0.1, 1]}]

dt_param_grid = [{'clf__criterion':['gini', 'entropy'], 'clf__max_depth':[None,2,3,4]}]

rf_param_grid = [{'clf__n_estimators': [100, 200, 500]}]

lgbm_param_grid = [{'clf__n_estimators': [10, 30, 50, 100], 'clf__num_leaves': [4, 8, 16]}]

svm_param_grid = [{"clf__C": [1, 10, 100], "clf__penalty": ["l1", "l2"],\
                  "clf__loss": ["hinge", "squared_hinge"]}]


kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=13)

# grid search CV
LR = GridSearchCV(lr_pipe, lr_param_grid, cv=kfold, scoring = 'recall')
DT = GridSearchCV(dt_pipe, dt_param_grid, cv=kfold, scoring = 'recall')
RF = GridSearchCV(rf_pipe, rf_param_grid, cv=kfold, scoring = 'recall')
LGBM = GridSearchCV(lgbm_pipe, lgbm_param_grid, cv=kfold, scoring = 'recall')
SVM = GridSearchCV(svm_pipe, svm_param_grid, cv=kfold, scoring = 'recall')

In [68]:
# pca=9
get_score(X_train_over, y_train_over)

교차검증점수:  0.7154797726512871
최적의 파라미터:  {'clf__C': 0.01}
검증세트평가:  0.5714285714285714
테스트평가:  0.7142857142857143
교차검증점수:  0.8985846428173409
최적의 파라미터:  {'clf__criterion': 'gini', 'clf__max_depth': None}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  0.9895241279393737
최적의 파라미터:  {'clf__n_estimators': 100}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  1.0
최적의 파라미터:  {'clf__n_estimators': 50, 'clf__num_leaves': 16}
검증세트평가:  0.0
테스트평가:  0.0
교차검증점수:  0.7586091608157807
최적의 파라미터:  {'clf__C': 1, 'clf__loss': 'hinge', 'clf__penalty': 'l2'}
검증세트평가:  0.5714285714285714
테스트평가:  0.8571428571428571


Unnamed: 0,model name,valid accuracy,valid precision,valid recall,valid f1,valid roc,test accuracy,test precision,test recall,test f1,test roc
0,LogisticRe,0.698667,0.005891,0.571429,0.011662,0.635246,0.533953,0.003826,0.714286,0.00761,0.623893
1,DecisionTree,0.995556,0.0,0.0,0.0,0.499331,0.993924,0.0,0.0,0.0,0.498209
2,RandomForest,0.996889,0.0,0.0,0.0,0.5,0.997141,0.0,0.0,0.0,0.499821
3,LightGBM,0.995556,0.0,0.0,0.0,0.499331,0.991065,0.0,0.0,0.0,0.496775
4,SVC,0.680444,0.005556,0.571429,0.011004,0.626107,0.517155,0.004425,0.857143,0.008804,0.686723
