In [35]:
import pandas as pd

# 데이터 불러오기
socar = pd.read_csv("insurance_fraud_detect_data.csv")
pd.set_option('display.max_columns', len(socar.columns))
socar .head()

Unnamed: 0,fraud_YN,car_model,sharing_type,age_group,has_previous_accident,cumulative_use_count,b2b,accident_ratio,pf_type,socarpass,socarsave,start_hour,duration,accident_hour,repair_cost,insure_cost,accident_location,car_part1,car_part2,repair_cnt,acc_type1,insurance_site_aid_YN,police_site_aid_YN,total_prsn_cnt,test_set
0,0,2,1,3,0,2,0,100,2,0,0,4,2,4,130254.7,0,1,0,0,1,0,0,0,-1,0
1,0,1,0,4,0,4,0,0,3,1,0,1,4,4,0.0,0,1,1,1,2,1,1,1,0,1
2,0,1,0,4,1,4,0,100,3,1,0,1,4,6,258130.6,12925040,3,1,0,2,1,1,1,0,1
3,0,3,1,3,0,4,0,0,2,0,0,3,3,5,1341009.0,0,3,1,0,1,0,0,0,-1,0
4,0,1,0,1,0,1,0,100,1,0,0,1,1,1,524030.6,1209410,1,0,0,2,0,0,0,-1,0


In [36]:
# 테스트 셋 분리

train_df = socar[socar['test_set']==0]
test_df = socar[socar['test_set']==1]

In [37]:
# "test_set" 컬럼은 train 세트인지 test세트인지 확인하는 컬럼이므로 세트 분할 후 삭제

train_df.drop("test_set", axis=1, inplace=True)
train_df.reset_index(drop=True, inplace=True)
test_df.drop("test_set", axis=1, inplace=True)
test_df.reset_index(drop=True, inplace=True)

In [38]:
# 학습해야 할 train 세트
X_features = train_df.iloc[:, 1:]
y_target = train_df.iloc[:, 0]

# 나중에 test 세트를 예측해야 하니까 test 세트도 변수에 넣어줌.
X_test = test_df.iloc[:, 1:]
y_test = test_df.iloc[:, 0]

In [39]:
# train-valid 분리 

from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X_features, y_target, test_size = 0.3, random_state = 13, stratify = y_target)

# 비율 확인 
print(y_train.value_counts()/y_train.shape[0]*100) 
print(y_valid.value_counts()/y_valid.shape[0]*100)

0    99.733777
1     0.266223
Name: fraud_YN, dtype: float64
0    99.741201
1     0.258799
Name: fraud_YN, dtype: float64


In [40]:
# 학습 / 예측 / 평가 함수 

# 평가 함수 

from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score

def get_clf_eval(y_test, pred):
    # confusion = confusion_matrix(y_test, pred) #오차행렬
    accuracy = accuracy_score(y_test,pred) # 정확도 
    precision = precision_score(y_test,pred) # 정밀도
    recall = recall_score(y_test,pred) # 재현율 
    f1 = f1_score(y_test,pred)
    roc_auc = roc_auc_score(y_test,pred)
    return accuracy, precision, recall, f1, roc_auc

# 성능 DF 반환 

def get_result(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    return get_clf_eval(y_test, pred)

# 다수의 모델 성능 비교 

def get_result_df(models, model_names, X_train, y_train, X_test, y_test):
    col_names = ['Accuracy', 'precision', 'recall', 'f1', 'roc']
    tmp = []
    for model in models:
        tmp.append(get_result(model, X_train, y_train, X_test, y_test))

    return pd.DataFrame(tmp, columns=col_names, index=model_names)


# 성능 확인 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import LinearSVC


# 모델 객체 생성 
lr_clf = LogisticRegression(random_state=13)
dt_clf = DecisionTreeClassifier(random_state=13, max_depth=4)
rf_clf = RandomForestClassifier(random_state=13, n_jobs = -1, n_estimators= 100 )
lgbm_clf = LGBMClassifier(n_estimators=1000, num_leaves=64, n_jobs=-1, boost_from_average = False)
svm_clf = LinearSVC()

# 학습 진행 
models=[lr_clf,dt_clf,rf_clf,lgbm_clf, svm_clf] 
model_names = ['LogiReg', 'DecisionTree', 'RandomFore','LGBM', "SVC"]
results = get_result_df(models, model_names, X_train, y_train, X_valid, y_valid)
results

Unnamed: 0,Accuracy,precision,recall,f1,roc
LogiReg,0.997412,0.0,0.0,0.0,0.5
DecisionTree,0.997412,0.0,0.0,0.0,0.5
RandomFore,0.997412,0.0,0.0,0.0,0.5
LGBM,0.997153,0.0,0.0,0.0,0.49987
SVC,0.975414,0.0,0.0,0.0,0.488972


In [None]:
# Trial 1 (전처리만)

In [41]:
# accident_hour 미확인 행 최빈값으로 채움

X_features_copy = X_features.copy()
no_time = X_features_copy[X_features_copy['accident_hour']==-1]
time = X_features_copy[X_features_copy['accident_hour']!=-1]

time['accident_hour'].idxmax()

X_features_copy.loc[(X_features_copy.accident_hour == -1), 'accident_hour'] = time['accident_hour'].idxmax()

In [42]:
# total_prsn_cnt 미확인 행 최빈값으로 채움

no_person = X_features_copy[X_features_copy['total_prsn_cnt']==-1]
person = X_features_copy[X_features_copy['total_prsn_cnt']!=-1]

person['total_prsn_cnt'].idxmax()

X_features_copy.loc[(X_features_copy.total_prsn_cnt == -1), 'total_prsn_cnt'] = person['total_prsn_cnt'].idxmax()

In [43]:
# accident_location 확인불가 행 최빈값으로 채움

no_location = X_features_copy[X_features_copy['accident_location']==5]
location = X_features_copy[X_features_copy['accident_location']!=5]

location['accident_location'].idxmax()

X_features_copy.loc[(X_features_copy.accident_location == 5), 'accident_location'] = location['accident_location'].idxmax()

In [44]:
# repair_cost 평균값으로 채움

repair_cost_mean = X_features_copy[X_features_copy['repair_cost']!=0]['repair_cost'].mean()
repair_cost_mean

452453.9622485912

In [45]:
X_features_copy.loc[(X_features_copy.repair_cost == 0), 'repair_cost'] = repair_cost_mean

In [46]:
# insure_cost 평균값으로 채움

insure_cost_mean = X_features_copy[X_features_copy['insure_cost']!=0]['insure_cost'].mean()

X_features_copy.loc[(X_features_copy.insure_cost == 0), 'insure_cost'] = insure_cost_mean

In [47]:
# 원핫인코딩
X_features_copy = pd.get_dummies(X_features_copy, columns=["age_group", "cumulative_use_count", "accident_ratio",\
                                                           "pf_type", "start_hour", "accident_hour",\
                                                           "accident_location", "acc_type1"])

In [None]:
# trial 1 (전처리)

In [None]:
# train-valid 분리 

from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X_features_copy, y_target, test_size = 0.3, random_state = 13, stratify = y_target)

# 비율 확인 
print(y_train.value_counts()/y_train.shape[0]*100) 
print(y_valid.value_counts()/y_valid.shape[0]*100)

In [48]:
# 모델 객체 생성 
lr_clf = LogisticRegression(random_state=13)
dt_clf = DecisionTreeClassifier(random_state=13, max_depth=4)
rf_clf = RandomForestClassifier(random_state=13, n_jobs = -1, n_estimators= 100 )
lgbm_clf = LGBMClassifier(n_estimators=1000, num_leaves=64, n_jobs=-1, boost_from_average = False)
svm_clf = LinearSVC()

# 학습 진행 
models=[lr_clf,dt_clf,rf_clf,lgbm_clf, svm_clf] 
model_names = ['LogiReg', 'DecisionTree', 'RandomFore','LGBM', "SVC"]
results = get_result_df(models, model_names, X_train, y_train, X_valid, y_valid)
results

Unnamed: 0,Accuracy,precision,recall,f1,roc
LogiReg,0.997412,0.0,0.0,0.0,0.5
DecisionTree,0.997412,0.0,0.0,0.0,0.5
RandomFore,0.997412,0.0,0.0,0.0,0.5
LGBM,0.997153,0.0,0.0,0.0,0.49987
SVC,0.985248,0.0,0.0,0.0,0.493902


In [None]:
# Trial 2 (전처리 + 오버샘플링)

In [49]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=13)
X_train_over, y_train_over = smote.fit_sample(X_train, y_train)

lr_clf = LogisticRegression(random_state=13)
dt_clf = DecisionTreeClassifier(random_state=13, max_depth=4)
rf_clf = RandomForestClassifier(random_state=13, n_jobs = -1, n_estimators= 100 )
lgbm_clf = LGBMClassifier(n_estimators=1000, num_leaves=64, n_jobs=-1, boost_from_average = False)
svm_clf = LinearSVC()

models=[lr_clf,dt_clf,rf_clf,lgbm_clf,svm_clf] 
model_names = ['LogiReg', 'DecisionTree', 'RandomFore','LGBM','SVC']
results = get_result_df(models, model_names, X_train_over, y_train_over, X_valid, y_valid)

results

Unnamed: 0,Accuracy,precision,recall,f1,roc
LogiReg,0.624482,0.004129,0.6,0.008202,0.612273
DecisionTree,0.76294,0.002198,0.2,0.004348,0.4822
RandomFore,0.994565,0.0,0.0,0.0,0.498573
LGBM,0.99353,0.0,0.0,0.0,0.498054
SVC,0.45264,0.001421,0.3,0.002829,0.376518


In [None]:
# Trial 3 (전처리 + 언더샘플링:RandomUnderSampler)

In [50]:
from imblearn.under_sampling import RandomUnderSampler

under_sampling = RandomUnderSampler(random_state=6)
X_train_under, y_train_under = under_sampling.fit_sample(X_train, y_train)

lr_clf = LogisticRegression(random_state=13)
dt_clf = DecisionTreeClassifier(random_state=13, max_depth=4)
rf_clf = RandomForestClassifier(random_state=13, n_jobs = -1, n_estimators= 100 )
lgbm_clf = LGBMClassifier(n_estimators=1000, num_leaves=64, n_jobs=-1, boost_from_average = False)
svm_clf = LinearSVC()

models=[lr_clf,dt_clf,rf_clf,lgbm_clf,svm_clf] 
model_names = ['LogiReg', 'DecisionTree', 'RandomFore','LGBM','SVC']
results = get_result_df(models, model_names, X_train_over, y_train_over, X_valid, y_valid)

results

Unnamed: 0,Accuracy,precision,recall,f1,roc
LogiReg,0.624482,0.004129,0.6,0.008202,0.612273
DecisionTree,0.76294,0.002198,0.2,0.004348,0.4822
RandomFore,0.994565,0.0,0.0,0.0,0.498573
LGBM,0.99353,0.0,0.0,0.0,0.498054
SVC,0.673137,0.002383,0.3,0.004728,0.487052


In [51]:
# Trial 4 (전처리 + 언더샘플링:NearMiss)

In [52]:
from imblearn.under_sampling import NearMiss

under_sampling = NearMiss(n_neighbors_ver3=3)
X_train_under, y_train_under = under_sampling.fit_sample(X_train, y_train)

lr_clf = LogisticRegression(random_state=13)
dt_clf = DecisionTreeClassifier(random_state=13, max_depth=4)
rf_clf = RandomForestClassifier(random_state=13, n_jobs = -1, n_estimators= 100 )
lgbm_clf = LGBMClassifier(n_estimators=1000, num_leaves=64, n_jobs=-1, boost_from_average = False)
svm_clf = LinearSVC()

models=[lr_clf,dt_clf,rf_clf,lgbm_clf,svm_clf] 
model_names = ['LogiReg', 'DecisionTree', 'RandomFore','LGBM','SVC']
results = get_result_df(models, model_names, X_train_over, y_train_over, X_valid, y_valid)

results

Unnamed: 0,Accuracy,precision,recall,f1,roc
LogiReg,0.624482,0.004129,0.6,0.008202,0.612273
DecisionTree,0.76294,0.002198,0.2,0.004348,0.4822
RandomFore,0.994565,0.0,0.0,0.0,0.498573
LGBM,0.99353,0.0,0.0,0.0,0.498054
SVC,0.464803,0.002418,0.5,0.004812,0.482356


In [None]:
# Trial 5 (전처리 + ss + pca + RandomUnderSampler)

In [65]:
X_features_copy2 = X_features_copy.copy()

In [66]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_scaled = scaler.fit_transform(X_features_copy2)

pca = PCA(n_components=3)
pca.fit(df_scaled)
df_pca = pca.transform(df_scaled)

pca_columns = ['component_1','component_2','component_3']

DF_pca = pd.DataFrame(df_pca, columns=pca_columns)
DF_pca['target'] = y_target 
DF_pca.head()

Unnamed: 0,component_1,component_2,component_3,target
0,-1.494263,-1.739527,1.034499,0
1,-0.261361,-2.292721,4.811336,0
2,-1.579345,2.505007,-0.899672,0
3,4.287617,0.699186,-0.356418,1
4,4.457215,-1.808762,2.16,0


In [67]:
X_features = DF_pca.drop('target', axis= 1)
y_target = DF_pca['target']

X_train, X_valid, y_train, y_valid = train_test_split(X_features, y_target, test_size =0.3, random_state=13, stratify= y_target)

In [68]:
from imblearn.under_sampling import RandomUnderSampler

under_sampling = RandomUnderSampler(random_state=13)
X_train_under, y_train_under = under_sampling.fit_sample(X_train, y_train)

lr_clf = LogisticRegression(random_state=13)
dt_clf = DecisionTreeClassifier(random_state=13, max_depth=4)
rf_clf = RandomForestClassifier(random_state=13, n_jobs = -1, n_estimators= 100 )
lgbm_clf = LGBMClassifier(n_estimators=1000, num_leaves=64, n_jobs=-1, boost_from_average = False)
svm_clf = LinearSVC()

models=[lr_clf,dt_clf,rf_clf,lgbm_clf,svm_clf] 
model_names = ['LogiReg', 'DecisionTree', 'RandomFore','LGBM','SVC']
results = get_result_df(models, model_names, X_train_under, y_train_under, X_valid, y_valid)

results

Unnamed: 0,Accuracy,precision,recall,f1,roc
LogiReg,0.535455,0.002786,0.5,0.00554,0.517774
DecisionTree,0.598344,0.003222,0.5,0.006402,0.549299
RandomFore,0.408903,0.003059,0.7,0.006092,0.554074
LGBM,0.399586,0.002584,0.6,0.005146,0.499533
SVC,0.52795,0.003286,0.6,0.006536,0.563882


In [None]:
# trial 6 (전처리 + ss + pca + NearMiss)

In [69]:
X_features_copy3 = X_features_copy.copy()

In [70]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_scaled = scaler.fit_transform(X_features_copy3)

pca = PCA(n_components=3)
pca.fit(df_scaled)
df_pca = pca.transform(df_scaled)

pca_columns = ['component_1','component_2','component_3']

DF_pca = pd.DataFrame(df_pca, columns=pca_columns)
DF_pca['target'] = y_target 
DF_pca.head()

Unnamed: 0,component_1,component_2,component_3,target
0,-1.494275,-1.747538,1.026111,0
1,-0.261373,-2.280984,4.80776,0
2,-1.579343,2.514411,-0.88148,0
3,4.287621,0.705293,-0.35972,1
4,4.457239,-1.811992,2.168928,0


In [71]:
X_features = DF_pca.drop('target', axis= 1)
y_target = DF_pca['target']

X_train, X_valid, y_train, y_valid = train_test_split(X_features, y_target, test_size =0.3, random_state=13, stratify= y_target)

In [72]:
from imblearn.under_sampling import NearMiss

under_sampling = NearMiss(n_neighbors_ver3=3)
X_train_under, y_train_under = under_sampling.fit_sample(X_train, y_train)

lr_clf = LogisticRegression(random_state=13)
dt_clf = DecisionTreeClassifier(random_state=13, max_depth=4)
rf_clf = RandomForestClassifier(random_state=13, n_jobs = -1, n_estimators= 100 )
lgbm_clf = LGBMClassifier(n_estimators=1000, num_leaves=64, n_jobs=-1, boost_from_average = False)
svm_clf = LinearSVC()

models=[lr_clf,dt_clf,rf_clf,lgbm_clf,svm_clf] 
model_names = ['LogiReg', 'DecisionTree', 'RandomFore','LGBM','SVC']
results = get_result_df(models, model_names, X_train_under, y_train_under, X_valid, y_valid)

results

Unnamed: 0,Accuracy,precision,recall,f1,roc
LogiReg,0.346014,0.003158,0.8,0.006292,0.572418
DecisionTree,0.109472,0.002609,0.9,0.005204,0.50371
RandomFore,0.099638,0.002581,0.9,0.005147,0.49878
LGBM,0.236801,0.003044,0.9,0.006067,0.56754
SVC,0.353261,0.002797,0.7,0.005571,0.526181
