# 1. Base Model

In [1]:
# Suppress warning messages so they do not clutter the notebook output.
# NOTE: the filter is process-wide, so every warning raised by the code that
# follows (whatever library it comes from) is silenced as well.
import warnings
warnings.filterwarnings(action='ignore') 

# Dataset loading
import pandas as pd

# Columns used as model inputs; USD itself is only used to derive the label.
FEATURE_COLUMNS = ['CAD', 'CHF', 'EUR', 'GBP', 'JPY(100)', 'BITCOIN', 'TETHER']


def _label_next_row_direction(frame):
    """Return a labelled copy of *frame* for next-row USD direction.

    Two columns are added:

    * ``USD_1``  - the USD value of the following row (``shift(-1)``).
    * ``Target`` - 'UP' when ``USD_1 - USD`` is strictly positive,
      otherwise 'DOWN'.

    Rows containing any NaN are dropped afterwards. The last row has no
    following row, so its ``USD_1`` is NaN and it is always removed.

    The work is done on an explicit copy so that the columns are never
    assigned onto a slice of the caller's DataFrame.
    """
    labelled = frame.copy()
    labelled['USD_1'] = labelled['USD'].shift(-1)
    change = labelled['USD_1'] - labelled['USD']
    labelled['Target'] = change.apply(lambda x: 'UP' if x > 0 else 'DOWN')
    return labelled.dropna()


dataset = pd.read_csv('../data/er_data.csv')

# Dataset columns (replaces whatever header the CSV carries)
dataset.columns = ['DATE', 'CAD', 'CHF', 'EUR', 'GBP', 'JPY(100)', 'BITCOIN', 'TETHER', 'USD']

# Dataset index
dataset = dataset.set_index('DATE')

# Train / test split by DATE label, then sliding-window labelling.
# Each split is shifted on its own, so no test row leaks into the train labels.
train = _label_next_row_direction(dataset.loc['2018-02-01':'2019-01-31'])
test = _label_next_row_direction(dataset.loc['2019-02-01':'2019-02-28'])

# Target / input split. The inputs are independent copies so that later
# transformations cannot touch `train` / `test`.
train_input = train[FEATURE_COLUMNS].copy()
train_target = train['Target']

test_input = test[FEATURE_COLUMNS].copy()
test_target = test['Target']

# Transform to Numpy Array
# NOTE: only the full `train` / `test` frames are converted here; the model
# inputs and targets built earlier stay pandas objects.
import numpy as np
train = np.array(train)
test = np.array(test)

# Data normalization (z-score). The statistics come from the training inputs
# only and are then re-used for the test inputs, so nothing from the test
# period influences the scaling.
mean = train_input.mean(axis=0)
# Rebind instead of mutating in place: the inputs were produced by a column
# selection, and in-place arithmetic on such frames is what triggers pandas'
# chained-assignment warnings.
train_input = train_input - mean
std = train_input.std(axis=0)
# A constant feature has zero standard deviation; dividing by it would turn
# the whole column into NaN. Scale such columns by 1 instead (they are already
# centred to 0).
std = std.mask(std == 0, 1.0)
train_input = train_input / std
test_input = (test_input - mean) / std

# scikit-learn base models: Logistic Regression, Decision Tree,
# Random Forest, and Gradient Boosting.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier


def _fit_and_report(title, estimator, x_train, y_train, x_test, y_test):
    """Fit *estimator* and print its evaluation report.

    Prints, in order: the title, the test-set confusion matrix, the train
    accuracy, the test accuracy, and a blank separator.

    Returns:
        A ``(fitted_estimator, test_predictions)`` tuple. The test set is
        predicted once and the result is shared by the confusion matrix and
        the accuracy score.
    """
    print(title)
    fitted = estimator.fit(x_train, y_train)
    test_pred = fitted.predict(x_test)
    print(confusion_matrix(y_test, test_pred))
    print('Train Accuracy :', accuracy_score(y_train, fitted.predict(x_train)))
    print('Test Accuracy :', accuracy_score(y_test, test_pred))
    print('\n')
    return fitted, test_pred


# (printed title, unfitted estimator) in the order they are reported.
_BASE_MODELS = [
    ('Logistic Regression', LogisticRegression(random_state=0)),
    ('DecisionTreeClassifier', DecisionTreeClassifier(random_state=0)),
    ('RandomForestClassifier', RandomForestClassifier(max_depth=2, random_state=0)),
    ('GradientBoostingClassifier', GradientBoostingClassifier(max_depth=2, random_state=0)),
]

for _title, _estimator in _BASE_MODELS:
    # `clf` / `y_pred` always refer to the most recently evaluated model.
    clf, y_pred = _fit_and_report(
        _title, _estimator, train_input, train_target, test_input, test_target
    )
y_true = test_target

# XGBoost
print('XGBClassifier')
from xgboost import XGBClassifier

# Hyper-parameters for the gradient-boosted tree model.
# NOTE(review): `silent`, `nthread`, and passing `early_stopping_rounds` to
# fit() look deprecated in recent xgboost releases - confirm against the
# installed version before upgrading.
xgb_params = dict(
    booster='gbtree',
    colsample_bylevel=0.9,
    colsample_bytree=0.8,
    gamma=0,
    max_depth=8,
    min_child_weight=3,
    n_estimators=50,
    nthread=4,
    objective='binary:logistic',
    random_state=2,
    silent=True,
)
model = XGBClassifier(**xgb_params)

# The evaluation set is the training data itself, and the early-stopping
# window (50 rounds) equals n_estimators (50).
model.fit(
    train_input,
    train_target,
    eval_set=[(train_input, train_target)],
    early_stopping_rounds=50,
    verbose=0,
)

# Confusion matrix on the test split.
y_true, y_pred = test_target, model.predict(test_input)
print(confusion_matrix(y_true, y_pred))

from sklearn.metrics import accuracy_score

# Accuracy on the train split, then on the test split.
y_true, y_pred = train_target, model.predict(train_input)
print('Train Accuracy :', accuracy_score(y_true, y_pred))

y_true, y_pred = test_target, model.predict(test_input)
print('Test Accuracy :', accuracy_score(y_true, y_pred))
print('\n\n')

# LightGBM (TODO: not implemented yet)

Logistic Regression
[[7 0]
 [9 0]]
Train Accuracy : 0.6008230452674898
Test Accuracy : 0.4375


DecisionTreeClassifier
[[7 0]
 [7 2]]
Train Accuracy : 1.0
Test Accuracy : 0.5625


RandomForestClassifier
[[4 3]
 [6 3]]
Train Accuracy : 0.6831275720164609
Test Accuracy : 0.4375


GradientBoostingClassifier
[[7 0]
 [7 2]]
Train Accuracy : 0.9382716049382716
Test Accuracy : 0.5625


XGBClassifier
[[6 1]
 [4 5]]
Train Accuracy : 0.9629629629629629
Test Accuracy : 0.6875





# 2. Oversampling with SMOTE

# 3. Model Stacking