## libraries

In [6]:
import pandas as pd
import math
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import train_test_split

# sampling
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline

# Classifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Boosting
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
import xgboost
import lightgbm
import catboost

# Bagging
from sklearn.ensemble import BaggingClassifier

# Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import IsolationForest

# DBSCAN
from sklearn.cluster import DBSCAN

## Data

In [8]:
df = pd.read_csv('../creditcard.csv')

std_scaler = StandardScaler()
rob_scaler = RobustScaler()

df['scaled_amount'] = rob_scaler.fit_transform(df['Amount'].values.reshape(-1,1))
df['scaled_time'] = rob_scaler.fit_transform(df['Time'].values.reshape(-1,1))

df.drop(['Time','Amount'], axis=1, inplace=True)

scaled_amount = df['scaled_amount']
scaled_time = df['scaled_time']

df.drop(['scaled_amount', 'scaled_time'], axis=1, inplace=True)
df.insert(0, 'scaled_amount', scaled_amount)
df.insert(1, 'scaled_time', scaled_time)

df = df.sample(frac=1)  #shuffle


# undersampling
fraud_df = df.loc[df['Class'] == 1]
non_fraud_df = df.loc[df['Class'] == 0][:492]
normal_distributed_df = pd.concat([fraud_df, non_fraud_df])
df_un = normal_distributed_df.sample(frac=1, random_state=42)

# oversampling with SMOTE
sm = SMOTE(sampling_strategy='minority')
X_sm, y_sm = sm.fit_resample(df.drop('Class', axis=1), df['Class'])

In [9]:
X_sm.shape, y_sm.shape

((568630, 30), (568630,))

In [11]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('Class', axis=1), df['Class'], test_size=0.3, random_state=42)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((199364, 30), (199364,), (85443, 30), (85443,))

In [12]:
X_train_un, X_test_un, y_train_un, y_test_un = train_test_split(df_un.drop('Class', axis=1), df_un['Class'], test_size=0.3, random_state=42)
X_train_un.shape, y_train_un.shape, X_test_un.shape, y_test_un.shape

((688, 30), (688,), (296, 30), (296,))

In [13]:
X_train_ov, X_test_ov, y_train_ov, y_test_ov = train_test_split(X_sm, y_sm, test_size=0.3, random_state=42)
X_train_ov.shape, y_train_ov.shape, X_test_ov.shape, y_test_ov.shape

((398041, 30), (398041,), (170589, 30), (170589,))

# Classifier

In [None]:
classifiers = {
    "LogisiticRegression": LogisticRegression(),
    "KNearest": KNeighborsClassifier(),
    "Support Vector Classifier": SVC(),
    "DecisionTreeClassifier": DecisionTreeClassifier()
}

for key, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    training_score = cross_val_score(classifier, X_train, y_train, cv=5)
    print(classifier.__class__.__name__, "training score:", training_score.mean())

In [None]:
classifiers = {
    "LogisiticRegression": LogisticRegression(),
    "KNearest": KNeighborsClassifier(),
    "Support Vector Classifier": SVC(),
    "DecisionTreeClassifier": DecisionTreeClassifier()
}

for key, classifier in classifiers.items():
    classifier.fit(X_train_un, y_train_un)
    training_score = cross_val_score(classifier, X_train_un, y_train_un, cv=5)
    print(classifier.__class__.__name__, "undersampling training score:", training_score.mean())

In [None]:
classifiers = {
    "LogisiticRegression": LogisticRegression(),
    "KNearest": KNeighborsClassifier(),
    "Support Vector Classifier": SVC(),
    "DecisionTreeClassifier": DecisionTreeClassifier()
}

for key, classifier in classifiers.items():
    classifier.fit(X_train_ov, y_train_ov)
    training_score = cross_val_score(classifier, X_train_ov, y_train_ov, cv=5)
    print(classifier.__class__.__name__, "oversampling training score:", training_score.mean())

# Boosting
여러개의 학습모형을 연결하여 앞의 모형을 보완해 가면서 뒤로 갈수록 더욱 강한 모형을 만듬

## AdaBoost
- 이전 모형을 보완하여 새로운 모형을 만드는 방법의 기본은 이전 모형이 잘 맞추지 못한 데이터에 대해 가중치를 더 높임
- 새로운 모형은 가중치가 더 높은 데이터를 잘 구분하려고 하므로 어려운 데이터를 점점 더 잘 분류할 수 있게 됨
- 단점: Bagging이나 Pating처럼 병렬화가 안됨

In [26]:
ada_clf = AdaBoostClassifier( 
    DecisionTreeClassifier(max_depth=1), 
    n_estimators=200, 
    algorithm="SAMME.R", 
    learning_rate=0.5 
)
ada_clf.fit(X_train, y_train) 

y_pred = ada_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.9993562960102056

In [None]:
ada_clf = AdaBoostClassifier()
ada_clf.fit(X_train, y_train) 

y_pred = ada_clf.predict(X_test)
print('accuracy:', accuracy_score(y_test, y_pred))

In [None]:
ada_clf_un = AdaBoostClassifier()
ada_clf_un.fit(X_train_un, y_train_un) 

y_pred_un = ada_clf.predict(X_test_un)
print('undersampling accuracy:', accuracy_score(y_test_un, y_pred_un))

In [None]:
ada_clf_ov = AdaBoostClassifier()
ada_clf_ov.fit(X_train_ov, y_train_ov) 

y_pred_ov = ada_clf.predict(X_test_ov)
print('undersampling accuracy:', accuracy_score(y_test_ov, y_pred_ov))

## Gradient Boost
이전 모형이 만든 잔여 오차에 대한 새로운 모형을 학습시킴

In [49]:
gb_clf = GradientBoostingClassifier()
gb_clf.fit(X_train, y_train)

y_pred = gb_clf.predict(X_test)
print('accuracy:', accuracy_score(y_test, y_pred))

0.9988179254005595

In [None]:
ada_clf_un = AdaBoostClassifier()
ada_clf_un.fit(X_train_un, y_train_un) 

y_pred_un = ada_clf.predict(X_test_un)
print('undersampling accuracy:', accuracy_score(y_test_un, y_pred_un))

In [None]:
ada_clf_ov = AdaBoostClassifier()
ada_clf_ov.fit(X_train_ov, y_train_ov) 

y_pred_ov = ada_clf.predict(X_test_ov)
print('oversampling accuracy:', accuracy_score(y_test_ov, y_pred_ov))

## 확률적 그래디언트 부스트 (Stochastic Gradient Boost)
- 그래디언트 부스트 방안이 트리를 학습할때 사용할 학습 데이터의 비율 지정
- 데이터를 무작위로 뽑아서 학습

## XGBOOST

In [33]:
xgb_clf = xgboost.XGBClassifier(max_depth=2, eta=1)
xgb_clf.fit(X_train, y_train)

y_pred = xgb_clf.predict(X_test)
print('accuracy:', accuracy_score(y_test, y_pred))



0.9994850368081645


In [None]:
xgb_clf = xgboost.XGBClassifier(max_depth=2, eta=1)
xgb_clf.fit(X_train, y_train)

y_pred = xgb_clf.predict(X_test)
print('undersampling accuracy:', accuracy_score(y_test_un, y_pred_un))

In [None]:
xgb_clf = xgboost.XGBClassifier(max_depth=2, eta=1)
xgb_clf.fit(X_train, y_train)

y_pred = xgb_clf.predict(X_test)
print('oversampling accuracy:', accuracy_score(y_test_ov, y_pred_ov))

## lightGBM

In [None]:
lgb_clf = lightgbm.LGBMClassifier()
lgb_clf.fit(X_train, y_train)

y_pred = lgb_clf.predict(X_test)
print('accuracy:', accuracy_score(y_test, y_pred))

In [42]:
lgb_clf = lightgbm.LGBMClassifier()
lgb_clf.fit(X_train, y_train)

y_pred = lgb_clf.predict(X_test)
print('undersampling accuracy:', accuracy_score(y_test_un, y_pred_un))

0.996477183619489


In [None]:
lgb_clf = lightgbm.LGBMClassifier()
lgb_clf.fit(X_train, y_train)

y_pred = lgb_clf.predict(X_test)
print('oversampling accuracy:', accuracy_score(y_test_ov, y_pred_ov))

## CATBOOST

In [48]:
cat_clf = catboost.CatBoostClassifier()
cat_clf.fit(X_train, y_train)

y_pred = cat_clf.predict(X_test)
print('accuracy:', accuracy_score(y_test, y_pred))

Learning rate set to 0.098829
0:	learn: 0.3769827	total: 72.6ms	remaining: 1m 12s
1:	learn: 0.2031768	total: 91.1ms	remaining: 45.5s
2:	learn: 0.1124142	total: 110ms	remaining: 36.6s
3:	learn: 0.0641434	total: 128ms	remaining: 31.9s
4:	learn: 0.0387952	total: 146ms	remaining: 29s
5:	learn: 0.0243379	total: 163ms	remaining: 27s
6:	learn: 0.0165542	total: 180ms	remaining: 25.6s
7:	learn: 0.0118911	total: 198ms	remaining: 24.5s
8:	learn: 0.0091057	total: 214ms	remaining: 23.6s
9:	learn: 0.0072142	total: 231ms	remaining: 22.8s
10:	learn: 0.0059931	total: 246ms	remaining: 22.1s
11:	learn: 0.0051635	total: 262ms	remaining: 21.6s
12:	learn: 0.0045369	total: 277ms	remaining: 21.1s
13:	learn: 0.0040811	total: 292ms	remaining: 20.6s
14:	learn: 0.0038058	total: 309ms	remaining: 20.3s
15:	learn: 0.0035599	total: 324ms	remaining: 19.9s
16:	learn: 0.0033912	total: 339ms	remaining: 19.6s
17:	learn: 0.0032564	total: 356ms	remaining: 19.4s
18:	learn: 0.0031452	total: 371ms	remaining: 19.2s
19:	learn: 0

0.9994382219725431

In [None]:
cat_clf = catboost.CatBoostClassifier()
cat_clf.fit(X_train, y_train)

y_pred = cat_clf.predict(X_test)
print('undersampling accuracy:', accuracy_score(y_test_un, y_pred_un))

In [None]:
cat_clf = catboost.CatBoostClassifier()
cat_clf.fit(X_train, y_train)

y_pred = cat_clf.predict(X_test)
print('oversampling accuracy:', accuracy_score(y_test_ov, y_pred_ov))

## Bagging

In [53]:
bag_clf = BaggingClassifier(
    LogisticRegression(max_iter=100000), 
    n_estimators = 500, 
    max_samples = 100, 
    bootstrap = True, 
    n_jobs = -1
)

bag_clf.fit(X_train, y_train)

y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.9982444436641972

In [None]:
bag_clf = BaggingClassifier()

bag_clf.fit(X_train, y_train)

y_pred = bag_clf.predict(X_test)
print('accuracy:', accuracy_score(y_test, y_pred))

In [None]:
bag_clf = BaggingClassifier()

bag_clf.fit(X_train_un, y_train_un)

y_pred_un = bag_clf.predict(X_test_un)
print('undersampling accuracy:', accuracy_score(y_test_un, y_pred_un))

In [None]:
bag_clf = BaggingClassifier()

bag_clf.fit(X_train, y_train)

y_pred = bag_clf.predict(X_test)
print('oversampling accuracy:', accuracy_score(y_test_ov, y_pred_ov))

## Random Forest

In [56]:
rnd_clf = RandomForestClassifier(
    n_estimators = 500,  
    max_features = 'auto', 
    max_samples=0.5,    
    bootstrap = True,
    n_jobs=-1
)

rnd_clf.fit(X_train, y_train)

y_pred = rnd_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.9994850368081645

In [None]:
rnd_clf = RandomForestClassifier()
rnd_clf.fit(X_train, y_train)
y_pred = rnd_clf.predict(X_test)
print('accuracy:', accuracy_score(y_test, y_pred))

In [None]:
rnd_clf = RandomForestClassifier()
rnd_clf.fit(X_train, y_train)
y_pred = rnd_clf.predict(X_test)
print('undersampling accuracy:', accuracy_score(y_test_un, y_pred_un))

In [None]:
rnd_clf = RandomForestClassifier()
rnd_clf.fit(X_train, y_train)
y_pred = rnd_clf.predict(X_test)
print('oversampling accuracy:', accuracy_score(y_test_ov, y_pred_ov))

## Isolation Forest

In [63]:
iforest = IsolationForest()#(n_estimators=100, max_samples='auto', 
                         # contamination=0.05, max_features=1.0, 
                         # bootstrap=False, n_jobs=-1, random_state=1)

iforest.fit(X_train, y_train)

y_pred = iforest.predict(X_test)
print('accuracy:', accuracy_score(y_test, y_pred))

0.00029259272263380267

In [None]:
iforest = IsolationForest()#(n_estimators=100, max_samples='auto', 
                         # contamination=0.05, max_features=1.0, 
                         # bootstrap=False, n_jobs=-1, random_state=1)

iforest.fit(X_train, y_train)

y_pred = iforest.predict(X_test)
print('undersampling accuracy:', accuracy_score(y_test_un, y_pred_un))

In [None]:
iforest = IsolationForest()#(n_estimators=100, max_samples='auto', 
                         # contamination=0.05, max_features=1.0, 
                         # bootstrap=False, n_jobs=-1, random_state=1)

iforest.fit(X_train, y_train)

y_pred = iforest.predict(X_test)
print('oversampling accuracy:', accuracy_score(y_test_ov, y_pred_ov))

## Extra tree

In [66]:
et_clf = ExtraTreesClassifier(
    n_estimators = 500, 
    max_leaf_nodes = 16, 
    max_features = 'auto', 
    n_jobs=-1
)

et_clf.fit(X_train, y_train)

y_pred = et_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.9988530365272755

In [None]:
et_clf = ExtraTreesClassifier()
et_clf.fit(X_train, y_train)
y_pred = et_clf.predict(X_test)
print('accuracy:', accuracy_score(y_test, y_pred))

In [None]:
et_clf = ExtraTreesClassifier()
et_clf.fit(X_train, y_train)
y_pred = et_clf.predict(X_test)
print('undersampling accuracy:', accuracy_score(y_test_un, y_pred_un))

In [None]:
et_clf = ExtraTreesClassifier()
et_clf.fit(X_train, y_train)
y_pred = et_clf.predict(X_test)
print('oversampling accuracy:', accuracy_score(y_test_ov, y_pred_ov))

## DBSCAN

In [None]:
dbscan = DBSCAN()
dbscan.fit(X_train, y_train)
y_pred = dbscan.predict(X_test)
print('accuracy:', accuracy_score(y_test, y_pred))

In [None]:
dbscan = DBSCAN()
dbscan.fit(X_train, y_train)
y_pred = dbscan.predict(X_test)
print('undersampling accuracy:', accuracy_score(y_test_un, y_pred_un))

In [None]:
dbscan = DBSCAN()
dbscan.fit(X_train, y_train)
y_pred = dbscan.predict(X_test)
print('oversampling accuracy:', accuracy_score(y_test_ov, y_pred_ov))

# Result
- AdaBoost: 0.9993562960102056
- GradientBoosting: 0.9988179254005595
- XGBoost: 0.9994850368081645
- lightGBM: 0.996477183619489
- CATBOOST: 0.9994382219725431
- Bagging: 0.9982444436641972
- Random Forest: 0.9994850368081645
- Extra tree: 0.9988530365272755