In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import confusion_matrix, accuracy_score

In [2]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [3]:
# If you need to recreate the datase, uncomment the next line%
#%run create_dataset.ipynb

In [4]:
ds = pd.read_excel('dataset.xlsx', parse_dates=[0], index_col=0)

In [5]:
# The last two columns in data set are the real future data to be predicted
X = ds.iloc[:,:-2]

In [6]:
# The last column is a category up +1 or down -1
y = ds.iloc[:,-1]

In [7]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [8]:
clf = tree.DecisionTreeClassifier()
#Fit X_train and y_train 
clf = clf.fit(X_train, y_train)

In [9]:
print ( 'Full tree guess train/validation ',clf.score(X_train, y_train),clf.score(X_test, y_test))

Full tree guess train/validation  1.0 0.8187385180649112


In [10]:
clf.get_depth()

25

In [11]:
bestdepth=-1
bestscore=0
max_depth = 100

In [12]:
for i in range(30):
    clf = tree.DecisionTreeClassifier(max_depth = i+1)
    #fit the training sets
    clf.fit(X_train, y_train)
    #update trainscore
    trainscore=clf.score(X_train, y_train)
    #update valscore
    valscore=clf.score(X_test, y_test)
    print( 'Depth:', i+1, 'Train Score:', trainscore, 'Validation Score:', valscore)
    if valscore > bestscore   :
        #update bestscore
        bestscore=valscore
        #update depth
        bestdepth=i+1


Depth: 1 Train Score: 0.7531015469444019 Validation Score: 0.7477036129822413
Depth: 2 Train Score: 0.7531015469444019 Validation Score: 0.7477036129822413
Depth: 3 Train Score: 0.7650482462858018 Validation Score: 0.7507654623392529
Depth: 4 Train Score: 0.7760759687547863 Validation Score: 0.7599510104102878
Depth: 5 Train Score: 0.7872568540358401 Validation Score: 0.7624004898958971
Depth: 6 Train Score: 0.7950681574513708 Validation Score: 0.7642375995101041
Depth: 7 Train Score: 0.8094654617858784 Validation Score: 0.7666870789957134
Depth: 8 Train Score: 0.8353499770255782 Validation Score: 0.775260257195346
Depth: 9 Train Score: 0.8615408178894165 Validation Score: 0.7813839559093693
Depth: 10 Train Score: 0.8889569612498085 Validation Score: 0.7862829148805879
Depth: 11 Train Score: 0.9134630111808852 Validation Score: 0.8003674219228414
Depth: 12 Train Score: 0.9345994792464389 Validation Score: 0.807103490508267
Depth: 13 Train Score: 0.9525195282585388 Validation Score: 0.8

In [13]:
clf = tree.DecisionTreeClassifier(max_depth = 13)
#fitX_trainval and y_trainval
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.8022045315370484

In [14]:
pred = clf.predict(X_test)
print(confusion_matrix(y_test, pred))

[[456 161]
 [162 854]]


In [15]:
feature_importances = clf.feature_importances_
# Create a DataFrame for better visualization
features = X.columns
importances = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
importances = importances.sort_values(by='Importance', ascending=False)
# Print feature importances
print(importances)

      Feature  Importance
19       pr_0    0.359191
17     ma20gr    0.057614
14       Infl    0.043811
0         DFF    0.038392
15   BBKMLEIX    0.037263
4       ICNSA    0.033659
8      PERMIT    0.032792
1       DGS10    0.031914
5      ACOGNO    0.031443
2       T10_3    0.030509
18        vix    0.029074
11     LiqChg    0.027738
9     UMCSENT    0.024543
3      UNRATE    0.022866
12      M2Chg    0.022306
10  BusInvChg    0.021767
6    NEWORDER    0.019359
28       pr_9    0.018277
29      pr_10    0.016589
24       pr_5    0.014245
16      vol_n    0.012773
20       pr_1    0.012726
22       pr_3    0.011605
7      AWHMAN    0.008954
27       pr_8    0.008836
26       pr_7    0.008120
13    PSAVERT    0.007746
25       pr_6    0.006103
23       pr_4    0.005006
21       pr_2    0.004779


In [16]:
#tree.plot_tree(clf)

In [17]:
clf0 = tree.DecisionTreeClassifier(max_depth = 11)
clf0.fit(X, y)
clf0.score(X, y)

0.9240382259250184

In [18]:
predict = clf0.predict(X)
print(confusion_matrix(y, predict))

[[2554  365]
 [ 255 4988]]


In [19]:
feature_importances = clf0.feature_importances_
# Create a DataFrame for better visualization
features = X.columns
importances = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
importances = importances.sort_values(by='Importance', ascending=False)
# Print feature importances
print(importances)

      Feature  Importance
19       pr_0    0.399492
15   BBKMLEIX    0.050303
1       DGS10    0.043257
9     UMCSENT    0.043007
5      ACOGNO    0.039642
2       T10_3    0.037801
14       Infl    0.035915
11     LiqChg    0.033626
17     ma20gr    0.029810
6    NEWORDER    0.028415
12      M2Chg    0.026705
4       ICNSA    0.024157
0         DFF    0.023168
8      PERMIT    0.022031
18        vix    0.019814
10  BusInvChg    0.018686
13    PSAVERT    0.017341
26       pr_7    0.016917
29      pr_10    0.014060
16      vol_n    0.012044
24       pr_5    0.012002
28       pr_9    0.009213
25       pr_6    0.008490
27       pr_8    0.006464
7      AWHMAN    0.006171
21       pr_2    0.005973
3      UNRATE    0.005462
23       pr_4    0.004666
22       pr_3    0.004194
20       pr_1    0.001172


In [20]:
ds['Prediction'] = predict

In [21]:
pd.DataFrame(ds).to_excel('result_dt.xlsx')

In [22]:
ds.describe()

Unnamed: 0,DFF,DGS10,T10_3,UNRATE,ICNSA,ACOGNO,NEWORDER,AWHMAN,PERMIT,UMCSENT,...,pr_4,pr_5,pr_6,pr_7,pr_8,pr_9,pr_10,ma20gr_fut,dir_fut,Prediction
count,8162.0,8162.0,8162.0,8162.0,8162.0,8162.0,8162.0,8162.0,8162.0,8162.0,...,8162.0,8162.0,8162.0,8162.0,8162.0,8162.0,8162.0,8162.0,8162.0,8162.0
mean,2.56337,4.002111,1.593827,5.703136,367866.7,765.409665,286.719716,1.202291,1372.559177,85.836817,...,0.001724,0.001383,0.001048,0.000718,0.000393,7.1e-05,-0.000245,0.006995,0.284734,0.311688
std,2.191967,1.765468,1.232822,1.790656,314573.3,76.471799,38.407867,0.606064,402.368214,13.395546,...,0.018204,0.016858,0.0157,0.014766,0.014115,0.013763,0.013756,0.035025,0.958665,0.950243
min,0.04,0.52,-1.73,3.4,152144.0,548.601079,214.632642,-1.5,513.0,50.0,...,-0.167844,-0.157828,-0.146553,-0.138912,-0.132208,-0.126792,-0.12533,-0.212308,-1.0,-1.0
25%,0.18,2.47,0.69,4.4,260242.0,701.555834,257.002921,0.8,1118.0,76.1,...,-0.006666,-0.006285,-0.006133,-0.006052,-0.006082,-0.006331,-0.006715,-0.009839,-1.0,-1.0
50%,2.03,4.0,1.6,5.4,317573.0,759.090011,280.045045,1.3,1399.0,88.8,...,0.004049,0.003469,0.002947,0.002374,0.001803,0.001297,0.000674,0.011657,1.0,1.0
75%,4.99,5.41,2.61,6.6,392213.0,818.970217,311.565646,1.7,1651.0,95.6,...,0.012288,0.011138,0.010163,0.009122,0.008278,0.00747,0.006878,0.027977,1.0,1.0
max,7.8,8.05,3.94,14.8,6161268.0,933.092066,396.585366,2.3,2263.0,112.0,...,0.092029,0.085783,0.082464,0.084515,0.088462,0.096451,0.102215,0.127029,1.0,1.0
