In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import auc, accuracy_score
from sklearn.model_selection import GridSearchCV
import pickle
%matplotlib inline

In [2]:
X_train = pd.read_csv('Data/random_split/Train.csv')
y_train = X_train['up_down']
X_train = X_train.drop(columns = ['up_down','Unnamed: 0'])

X_test = pd.read_csv('Data/random_split/Test_features.csv')
X_test = X_test.drop(columns = ['Unnamed: 0'])
y_test = pd.read_csv('Data/random_split/Test_label.csv')
y_test = y_test.drop(columns = ['num'])

In [3]:
# scaling Volume
import sklearn.preprocessing as preprocessing
scaler = preprocessing.StandardScaler()
vol_scale_param = scaler.fit(X_train['Volume'].values.reshape(-1,1))

X_train['Volume_scaled'] = scaler.fit_transform(X_train['Volume'].values.reshape(-1,1), vol_scale_param)
X_train = X_train.drop(columns = ['Volume'])

X_test['Volume_scaled'] = scaler.fit_transform(X_test['Volume'].values.reshape(-1,1), vol_scale_param)
X_test = X_test.drop(columns = ['Volume'])


In [None]:
# feature selection
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

select = RFE(RandomForestClassifier(n_estimators=100, random_state=42), n_features_to_select=40)
#select = RFE(LogisticRegression(penalty="l1"), n_features_to_select=40)

select.fit(X_train, y_train)
# visualize the selected features:
mask = select.get_support()
plt.matshow(mask.reshape(1, -1), cmap='gray_r')


### Prediction using GDBT of lightGBM 

In [8]:
param_test = {
    'max_depth':np.arange(3,10,2),
    'min_child_weight':np.arange(1,6,2)
}

estimator = lgb.LGBMClassifier(boosting_type='gbdt', 
                           objective='binary',
                           metric='auc',
                           num_leaves=9,
                           min_split_gain=0,
                           min_child_weight=5,
                           min_child_samples=10,
                           max_bin=100,
                           subsample= 0.7,  # Subsample ratio of the training instance.
                           subsample_freq=1,  # frequence of subsample, <=0 means no enable\
                           colsample_bytree=0.7,  # Subsample ratio of columns when constructing each tree.
                           reg_alpha=1, 
                           reg_lambda=0,
                           seed=410, 
                           nthread=4, 
                           silent=True)


gsearch = GridSearchCV(estimator, param_grid = param_test, scoring='roc_auc',n_jobs=1,iid=False, cv=5)

lgb_model = gsearch.fit(X_train.values, y_train.values)
lgb_model.best_params_, lgb_model.best_score_

({'max_depth': 5, 'min_child_weight': 5}, 0.746003583989124)

In [9]:
param_test = {
    'max_depth':[4,5,6],
    'min_child_weight':[4,5,6]
}

estimator = lgb.LGBMClassifier(boosting_type='gbdt', 
                           objective='binary',
                           metric='auc',
                           num_leaves=9,
                           min_split_gain=0,
                           min_child_weight=5,
                           min_child_samples=10,
                           max_bin=100,
                           subsample= 0.7,  # Subsample ratio of the training instance.
                           subsample_freq=1,  # frequence of subsample, <=0 means no enable\
                           colsample_bytree=0.7,  # Subsample ratio of columns when constructing each tree.
                           reg_alpha=1, 
                           reg_lambda=0,
                           seed=410, 
                           nthread=4, 
                           silent=True)


gsearch = GridSearchCV(estimator, param_grid = param_test, scoring='roc_auc',n_jobs=1,iid=False, cv=5)

lgb_model = gsearch.fit(X_train.values, y_train.values)
lgb_model.best_params_, lgb_model.best_score_

({'max_depth': 6, 'min_child_weight': 6}, 0.7460723437130179)

In [10]:
param_test = {
    'num_leaves':np.arange(3,100,5)
}

estimator = lgb.LGBMClassifier(boosting_type='gbdt', 
                           objective='binary',
                           metric='auc',
                           max_depth=6,
                           min_split_gain=0,
                           min_child_weight=6,
                           min_child_samples=10,
                           max_bin=100,
                           subsample= 0.7,  # Subsample ratio of the training instance.
                           subsample_freq=1,  # frequence of subsample, <=0 means no enable\
                           colsample_bytree=0.7,  # Subsample ratio of columns when constructing each tree.
                           reg_alpha=1, 
                           reg_lambda=0,
                           seed=410, 
                           nthread=4, 
                           silent=True)


gsearch = GridSearchCV(estimator, param_grid = param_test, scoring='roc_auc',n_jobs=1,iid=False, cv=5)

lgb_model = gsearch.fit(X_train.values, y_train.values)
lgb_model.best_params_, lgb_model.best_score_

({'num_leaves': 23}, 0.746430961779909)

In [12]:
param_test = {
    'max_bin':[i*100 for i in range(1,5)]
}

estimator = lgb.LGBMClassifier(boosting_type='gbdt', 
                           objective='binary',
                           metric='auc',
                           num_leaves=23,
                           max_depth=6,
                           min_split_gain=0,
                           min_child_weight=6,
                           min_child_samples=10,
                           max_bin=100,
                           subsample= 0.7,  # Subsample ratio of the training instance.
                           subsample_freq=1,  # frequence of subsample, <=0 means no enable\
                           colsample_bytree=0.7,  # Subsample ratio of columns when constructing each tree.
                           reg_alpha=1, 
                           reg_lambda=0,
                           seed=410, 
                           nthread=4, 
                           silent=True)


gsearch = GridSearchCV(estimator, param_grid = param_test, scoring='roc_auc',n_jobs=1,iid=False, cv=5)

lgb_model = gsearch.fit(X_train.values, y_train.values)
lgb_model.best_params_, lgb_model.best_score_

({'max_bin': 100}, 0.746430961779909)

In [13]:
param_test = {
    'subsample':[i/10.0 for i in range(4,10)],
    'colsample_bytree':[i/10.0 for i in range(4,10)]
}

estimator = lgb.LGBMClassifier(boosting_type='gbdt', 
                           objective='binary',
                           metric='auc',
                           num_leaves=23,
                           max_depth=6,
                           min_split_gain=0,
                           min_child_weight=6,
                           min_child_samples=10,
                           max_bin=100,
                           subsample= 0.7,  # Subsample ratio of the training instance.
                           subsample_freq=1,  # frequence of subsample, <=0 means no enable\
                           colsample_bytree=0.7,  # Subsample ratio of columns when constructing each tree.
                           reg_alpha=1, 
                           reg_lambda=0,
                           seed=410, 
                           nthread=4, 
                           silent=True)


gsearch = GridSearchCV(estimator, param_grid = param_test, scoring='roc_auc',n_jobs=1,iid=False, cv=5)

lgb_model = gsearch.fit(X_train.values, y_train.values)
lgb_model.best_params_, lgb_model.best_score_

({'colsample_bytree': 0.5, 'subsample': 0.9}, 0.7466614557273576)

In [14]:
param_test = {
    'reg_alpha':[0, 1e-5, 1e-2, 0.1, 0.5, 1],
    'reg_lambda':[0, 1e-5, 1e-2, 0.1, 0.5, 1]
}

estimator = lgb.LGBMClassifier(boosting_type='gbdt', 
                           objective='binary',
                           metric='auc',
                           num_leaves=23,
                           max_depth=6,
                           min_split_gain=0,
                           min_child_weight=6,
                           min_child_samples=10,
                           max_bin=100,
                           subsample= 0.9,  # Subsample ratio of the training instance.
                           subsample_freq=1,  # frequence of subsample, <=0 means no enable\
                           colsample_bytree=0.5,  # Subsample ratio of columns when constructing each tree.
                           reg_alpha=1, 
                           reg_lambda=0,
                           seed=410, 
                           nthread=4, 
                           silent=True)


gsearch = GridSearchCV(estimator, param_grid = param_test, scoring='roc_auc',n_jobs=1,iid=False, cv=5)

lgb_model = gsearch.fit(X_train.values, y_train.values)
lgb_model.best_params_, lgb_model.best_score_

({'reg_alpha': 0.5, 'reg_lambda': 0.1}, 0.746766917332304)

In [16]:
param_test = {
    'learning_rate': [0.05,0.1, 0.2, 0.5, 1, 2],
}

estimator = lgb.LGBMClassifier(boosting_type='gbdt', 
                           objective='binary',
                           metric='auc',
                           learning_rate=0.05,
                           num_leaves=23,
                           max_depth=6,
                           min_split_gain=0,
                           min_child_weight=6,
                           min_child_samples=10,
                           max_bin=100,
                           subsample= 0.9,  # Subsample ratio of the training instance.
                           subsample_freq=1,  # frequence of subsample, <=0 means no enable\
                           colsample_bytree=0.5,  # Subsample ratio of columns when constructing each tree.
                           reg_alpha=0.5, 
                           reg_lambda=0.1,
                           seed=410, 
                           nthread=4, 
                           silent=True)


gsearch = GridSearchCV(estimator, param_grid = param_test, scoring='roc_auc',n_jobs=1,iid=False, cv=5)

lgb_model = gsearch.fit(X_train.values, y_train.values)
lgb_model.best_params_, lgb_model.best_score_

({'learning_rate': 0.05}, 0.7470489968799185)

In [17]:
lgb_params = {'boosting_type': 'gbdt',
              'objective': 'binary',
              'metric':'auc',
              'learning_rate': 0.05,
              'num_leaves': 23,
              'max_depth': 6,
              'min_child_samples': 10,
              'max_bin': 100,
              'subsample': 0.9,
              'subsample_freq': 1,
              'colsample_bytree': 0.5,
              'min_child_weight': 6,
              'min_split_gain': 0,
              'reg_alpha': 0.5,
              'reg_lambda': 0.1,
              'nthread': 4,
              'verbose': 0,
             }

predictors = list(X_train.columns)
categorical_features = ['dayofweek','JPY','AUD','EUR','GBP','USD']
xgtrain = lgb.Dataset(X_train[predictors].values, label=y_train.values,
                      feature_name=predictors,
                      categorical_feature=categorical_features
                     )

evals_results = {}

lightGBM_model = lgb.train(lgb_params, 
                     xgtrain, 
                     evals_result=evals_results, 
                     verbose_eval=10)



In [18]:
y_pred = lightGBM_model.predict(X_test.values)

In [19]:
y_pred[y_pred>0.5] = 1
y_pred[y_pred<=0.5] = 0

print(sum(y_test.label.values)/len(y_test))
print(sum(y_pred)/len(y_pred))
print('Accuracy Score: '+ str(accuracy_score(y_test.label.values, y_pred)))

0.34987509093267316
0.2833095545567843
Accuracy Score: 0.6690050696727552


In [20]:
filename = 'lgt_GDBT_0669.sav'
pickle.dump(lgb_model, open(filename, 'wb'))
# some time later.. load the model from disk
#loaded_model = pickle.load(open(filename, 'rb'))

In [48]:
X = X_train.as_matrix()
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100)


In [None]:
rf.fit(X,y_train.as_matrix())

In [None]:
y_pred = rf.predict(X_test.as_matrix())
print('Accuracy Score: '+ str(accuracy_score(y_test.label.values, y_pred)))