In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import xgboost as xgb
import datetime
from sklearn.linear_model import LogisticRegression
import pickle

In [2]:
# Load the pre-split feature matrices and label vectors from comma-delimited text files.
# NOTE(review): all four names are reassigned from `raw_data` in the date-based split
# cell further down, so these arrays are not what the model is eventually fitted on.
X_train, X_test, y_train, y_test = [
    np.loadtxt(csv_path, delimiter=',')
    for csv_path in (
        'data/X_train.csv',
        'data/X_test.csv',
        'data/y_train.csv',
        'data/y_test.csv',
    )
]

In [3]:
# Read the full factor table (features, `label` and `label_time`) into a DataFrame.
raw_data = pd.read_csv('data/raw_data.csv')

In [4]:
# Remove the 'Unnamed: 0' column that came in with the CSV.
# Rebinding the result with errors='ignore' (rather than inplace=True) keeps this
# cell idempotent: running it a second time no longer raises KeyError.
raw_data = raw_data.drop(columns=['Unnamed: 0'], errors='ignore')

In [5]:
# Preview the first five rows of the table.
raw_data.head(n=5)

Unnamed: 0,ep_ttm,bp_lyr,sp_ttm,cfp_ncf_ttm,cfp_ocf_ttm,cfp_fcff_ttm,ortoev_ttm,peg,roe_avg_ttm,roe_diluted_ttm,...,skew_60D,illiq,small_order_flow,s_dq_mv,ln_float_mv,beta_60D_zz500,beta_20D_szzz,beta_480D_szzz,label,label_time
0,0.120824,0.796337,0.436667,0.090356,-0.089589,,0.397811,28.167006,0.155275,0.151725,...,-0.22086,1.157803e-07,-0.002365,137025400000.0,25.643432,0.741209,0.971856,1.574382,2,2015-01-30
1,0.106428,0.542133,0.932968,0.081991,0.215316,0.043625,0.878831,71.756337,0.198387,0.196314,...,0.008047,9.708716e-08,-0.001101,127355600000.0,25.570249,0.922578,1.297357,1.308739,0,2015-01-30
2,-0.00514,0.04863,0.046602,-0.010674,0.001702,-0.007284,0.04701,59.679834,-0.105048,-0.105686,...,-0.481831,6.793055e-06,0.007751,1455239000.0,21.098436,1.367928,0.318375,0.558445,6,2015-01-30
3,-0.012082,0.158174,0.014907,0.000598,-0.030551,0.086292,0.015112,10.260072,-0.075996,-0.076385,...,0.794545,,,3746346000.0,22.044047,1.039238,,0.811105,1,2015-01-30
4,0.065695,0.446514,0.415537,-0.023423,-0.107033,0.079987,0.376103,-163.776036,0.147783,0.14713,...,-0.625961,6.36475e-07,0.003886,8702831000.0,22.886914,1.005679,0.94719,1.056388,2,2015-01-30


# 1. Data Preprocessing

## 1.1 Missing Values

In [6]:
# Number of missing entries in each column.
raw_data.isnull().sum(axis=0)

ep_ttm                    8171
bp_lyr                    7359
sp_ttm                    8165
cfp_ncf_ttm               8232
cfp_ocf_ttm               8214
cfp_fcff_ttm             10980
ortoev_ttm               11020
peg                      13071
roe_avg_ttm               1653
roe_diluted_ttm           1672
roa_avg                    843
roa_diluted                843
roic_wind                 2781
roic_wind_ttm             2781
berryratio_ttm            5546
grossmargin_ttm           3816
adminexpensetogr_ttm2     1064
taxtoebt_ttm             22747
taxtoor_ttm              22633
eps_diluted_is             419
bps_ttm                   7359
orps_ttm                  8165
dps                       7912
endogenousgrowth             3
scftosales_ttm            3802
wgsdtosales_ttm           1035
wgsdtoor_ttm             27778
debttoassets                 3
curassetsratio            1540
fixedassettoasset        74005
                         ...  
mom_tw_20D                8133
mom_tw_2

## 1.2 Class Balance

In [7]:
# Share of rows carrying each label value, relative to the total row count.
raw_data['label'].value_counts() / raw_data.shape[0]

5    0.104623
2    0.102336
7    0.102120
0    0.101597
4    0.100059
9    0.099951
1    0.099029
6    0.097573
3    0.097098
8    0.095615
Name: label, dtype: float64

# 2. Modeling

## 2.1 Split Training Set & Test Set

In [8]:
# Convert `label_time` to datetimes so it can be compared against a cutoff date.
# Values are stringified first, then parsed.
label_time_text = raw_data['label_time'].apply(str)
raw_data['label_time'] = pd.to_datetime(label_time_text)

In [9]:
# Chronological hold-out: rows labelled before 2019-01-01 form the training set,
# rows labelled on or after that date form the test set.
split_date = datetime.datetime(2019, 1, 1)

# Non-inplace drop() returns a fresh frame, so we never mutate a filtered slice of
# `raw_data`; this removes the SettingWithCopyWarning the inplace version emitted.
training_data = raw_data[raw_data['label_time'] < split_date].drop(columns=['label_time'])
test_data = raw_data[raw_data['label_time'] >= split_date].drop(columns=['label_time'])

# Features are every remaining column except `label`; targets are plain numpy arrays.
X_train = training_data.loc[:, training_data.columns != 'label']
y_train = training_data['label'].values
X_test = test_data.loc[:, test_data.columns != 'label']
y_test = test_data['label'].values

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


## 2.2 Data Processing Pipeline

In [10]:
# Preprocessing applied to every feature column: fill missing values with the
# column mean, then standardise.
impute_then_scale = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
)

# Any column not listed would be passed through unchanged; here all columns of
# X_train are listed.
process_scale = make_column_transformer(
    (impute_then_scale, X_train.columns),
    remainder='passthrough',
)

## 2.3 XGBoost Classifier

In [11]:
# Native-API XGBoost parameter set (names as in the XGBoost parameter reference).
# NOTE(review): no cell visible in this notebook reads `params`; the grid search
# below constructs XGBClassifier(nthread=8) directly.
params = {
    # --- task definition ---
    'booster': 'gbtree',
    'objective': 'multi:softmax',
    'num_class': 10,
    # --- tree complexity / regularisation ---
    'gamma': 0.1,
    'max_depth': 12,
    'lambda': 2,
    # --- row / column subsampling ---
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'min_child_weight': 3,
    # --- logging, learning rate, reproducibility, threading ---
    'silent': 1,
    'eta': 0.007,
    'seed': 1000,
    'nthread': 8,
}

In [17]:
# Quick check of the container type of the training labels
# (the recorded output is numpy.ndarray).
type(y_train)

numpy.ndarray

In [57]:
# Preprocessing + XGBoost classifier in one pipeline, so imputation and scaling
# are re-fitted inside every CV fold.
mean_pipe_xgb = make_pipeline(
    process_scale,
    xgb.XGBClassifier(nthread=8),
)

# Candidate tree depths: 3, 6 and 9.
param_grid = {'xgbclassifier__max_depth': range(3, 10, 3)}

# NOTE(review): cv=5 ignores the time ordering that the train/test split respects,
# so fold scores may be optimistic relative to the 2019 hold-out; confirm this is intended.
# The recorded run took roughly 189 minutes for the 15 fits.
grid_xgb = GridSearchCV(
    mean_pipe_xgb,
    param_grid,
    scoring='accuracy',
    cv=5,
    verbose=2,
)
grid_xgb.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] xgbclassifier__max_depth=3 ......................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ....................... xgbclassifier__max_depth=3, total= 6.4min
[CV] xgbclassifier__max_depth=3 ......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  6.4min remaining:    0.0s


[CV] ....................... xgbclassifier__max_depth=3, total= 6.1min
[CV] xgbclassifier__max_depth=3 ......................................
[CV] ....................... xgbclassifier__max_depth=3, total= 6.0min
[CV] xgbclassifier__max_depth=3 ......................................
[CV] ....................... xgbclassifier__max_depth=3, total= 6.0min
[CV] xgbclassifier__max_depth=3 ......................................
[CV] ....................... xgbclassifier__max_depth=3, total= 5.9min
[CV] xgbclassifier__max_depth=6 ......................................
[CV] ....................... xgbclassifier__max_depth=6, total=11.7min
[CV] xgbclassifier__max_depth=6 ......................................
[CV] ....................... xgbclassifier__max_depth=6, total=12.3min
[CV] xgbclassifier__max_depth=6 ......................................
[CV] ....................... xgbclassifier__max_depth=6, total=12.2min
[CV] xgbclassifier__max_depth=6 ......................................
[CV] .

[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed: 189.4min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('columntransformer',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='passthrough',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('pipeline',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('simpleimputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=Tru

In [66]:
# Persist the fitted grid search to disk. The context manager guarantees the
# file is flushed and closed even if pickling raises.
with open("xgboost/xgboost.pickle.dat", "wb") as model_file:
    pickle.dump(grid_xgb, model_file)

In [67]:
# Reload the fitted grid search saved by the previous cell.
# WARNING: pickle.load can execute arbitrary code; only load files you created yourself.
with open("xgboost/xgboost.pickle.dat", "rb") as model_file:
    model = pickle.load(model_file)

In [69]:
# Score the reloaded grid search on the 2019 hold-out set. The search was built
# with scoring='accuracy', so this is presumably hold-out accuracy.
# The recorded value (~0.149) is only modestly above the ~0.10 expected from
# guessing among ten roughly equally frequent labels.
model.score(X_test, y_test)

0.14920888692871712

In [70]:
# Predicted label for every row of the hold-out features.
y_pred = model.predict(X_test)

In [71]:
# Confusion matrix: rows are true labels, columns are predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[1752,  195,  126,   99,  222,  216,   92,  221,  259,  450],
       [1050,  240,  196,  175,  481,  447,  141,  235,  260,  402],
       [ 782,  211,  173,  219,  585,  625,  211,  275,  195,  353],
       [ 648,  188,  165,  232,  676,  717,  224,  285,  201,  290],
       [ 564,  176,  149,  227,  715,  757,  221,  284,  220,  319],
       [ 506,  171,  156,  261,  671,  817,  259,  283,  204,  310],
       [ 518,  166,  156,  234,  684,  773,  209,  289,  261,  343],
       [ 639,  187,  177,  205,  598,  652,  164,  354,  258,  375],
       [ 750,  216,  183,  173,  508,  540,  139,  310,  327,  479],
       [1067,  249,  167,  195,  399,  357,  103,  214,  282,  594]],
      dtype=int64)

In [73]:
# Per-row stock information for the 2019 period.
# NOTE(review): the export below assumes these rows are in the same order as X_test; confirm.
stock_info_2019 = pd.read_csv('data/stock_info_2019.csv')

In [74]:
# Write one CSV per predicted label (9 down to 0) containing the stock_info_2019
# rows whose prediction equals that label.
# Rows are selected by position (iloc) because y_pred is a positional array; this
# matches the previous label-based lookup for the default RangeIndex that read_csv
# produces, without relying on index labels happening to equal positions.
for predicted_label in range(9, -1, -1):
    row_positions = np.flatnonzero(y_pred == predicted_label)
    output_path = 'xgboost/data/portfolio_{}.csv'.format(predicted_label)
    stock_info_2019.iloc[row_positions].to_csv(output_path)