## Financial sector with top 12 simple rules

In [1]:
import pandas as pd
import numpy as np
import glob
import talib
from talib import abstract
import ta

In [2]:
path = r'/Users/amrita/Desktop/Financial data folder/' # use your path
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    dff = pd.read_csv(filename, index_col=None, header=None)
    li.append(dff)

df = pd.concat(li, axis=0, ignore_index=True)

df = df.dropna(how='any')
df.reset_index(inplace=True)
del(df['index'])


In [3]:
df.columns= df.iloc[0]
df.drop([0],inplace=True)
df.reset_index(inplace=True)
del(df['index'])

In [4]:
# data is in string format and contains headers of subsequent files ( like PX_VOLUME, PX_LAST) , 
#so deleting those rows that have headers and converting the rest of the values to float

a= df.index[df['PX_VOLUME'] == 'PX_VOLUME']   # since the index list will be same for high, low, open, last columns as well
aa = np.array(a)                                # so we are just using volume column

df.drop(aa, inplace=True)

In [5]:
df['PX_VOLUME']= df['PX_VOLUME'].astype(float)    # converting values to float
df['PX_LAST']= df['PX_LAST'].astype(float)
df['PX_LOW']= df['PX_LOW'].astype(float)
df['PX_HIGH']= df['PX_HIGH'].astype(float)
df['PX_OPEN']= df['PX_OPEN'].astype(float)

In [6]:
df

Unnamed: 0,Dates,PX_VOLUME,PX_LAST,PX_LOW,PX_HIGH,PX_OPEN
0,2/5/2004,23555600.0,24.70,23.30,24.94,23.75
1,2/6/2004,4715500.0,24.85,24.20,24.94,24.71
2,2/9/2004,2898200.0,24.47,24.00,24.70,24.60
3,2/10/2004,1566500.0,24.35,24.05,24.47,24.15
4,2/11/2004,1939400.0,24.25,24.16,24.50,24.28
...,...,...,...,...,...,...
454844,1/28/2020,4182.0,11.11,11.11,11.19,11.16
454845,1/29/2020,6486.0,10.94,10.92,11.06,11.06
454846,1/30/2020,5356.0,11.05,10.91,11.05,10.91
454847,1/31/2020,15047.0,10.70,10.67,10.97,10.97


In [7]:
df.reset_index(inplace=True)

In [8]:
indexlist= np.arange(454719)
indexlist

array([     0,      1,      2, ..., 454716, 454717, 454718])

In [9]:
df.reindex(indexlist)    # since it gives keyerror: 4173, we set our own index

Unnamed: 0,index,Dates,PX_VOLUME,PX_LAST,PX_LOW,PX_HIGH,PX_OPEN
0,0,2/5/2004,23555600.0,24.70,23.30,24.94,23.75
1,1,2/6/2004,4715500.0,24.85,24.20,24.94,24.71
2,2,2/9/2004,2898200.0,24.47,24.00,24.70,24.60
3,3,2/10/2004,1566500.0,24.35,24.05,24.47,24.15
4,4,2/11/2004,1939400.0,24.25,24.16,24.50,24.28
...,...,...,...,...,...,...,...
454714,454844,1/28/2020,4182.0,11.11,11.11,11.19,11.16
454715,454845,1/29/2020,6486.0,10.94,10.92,11.06,11.06
454716,454846,1/30/2020,5356.0,11.05,10.91,11.05,10.91
454717,454847,1/31/2020,15047.0,10.70,10.67,10.97,10.97


In [10]:
df.drop(columns = 'index',inplace=True)

In [11]:
df

Unnamed: 0,Dates,PX_VOLUME,PX_LAST,PX_LOW,PX_HIGH,PX_OPEN
0,2/5/2004,23555600.0,24.70,23.30,24.94,23.75
1,2/6/2004,4715500.0,24.85,24.20,24.94,24.71
2,2/9/2004,2898200.0,24.47,24.00,24.70,24.60
3,2/10/2004,1566500.0,24.35,24.05,24.47,24.15
4,2/11/2004,1939400.0,24.25,24.16,24.50,24.28
...,...,...,...,...,...,...
454714,1/28/2020,4182.0,11.11,11.11,11.19,11.16
454715,1/29/2020,6486.0,10.94,10.92,11.06,11.06
454716,1/30/2020,5356.0,11.05,10.91,11.05,10.91
454717,1/31/2020,15047.0,10.70,10.67,10.97,10.97


### Calculating Return columns (3,5,10 days return) 

In [12]:
periods = [3,5,10]   # unit = day

for i in periods:
    period = i
    df['return'+str(period)] = df['PX_LAST'].pct_change(periods = period)


In [13]:
print(df['return3'].describe(), df['return5'].describe(), df['return10'].describe())

count    454716.000000
mean          0.001370
std           0.093673
min          -0.981449
25%          -0.014014
50%           0.000317
75%           0.015809
max          30.632899
Name: return3, dtype: float64 count    454714.000000
mean          0.002206
std           0.121147
min          -0.981336
25%          -0.017893
50%           0.000899
75%           0.020941
max          32.462588
Name: return5, dtype: float64 count    454709.000000
mean          0.004261
std           0.174368
min          -0.981502
25%          -0.024469
50%           0.002283
75%           0.030747
max          33.856865
Name: return10, dtype: float64


#### Generating buy, sell, hold signals from returns calculated above (dependent variable)

In [14]:


thresholds = [0.0003, 0.00008, 0.0022]    # took threshold as 50% value of each column from the above cell
for i in range(len(periods)):
    df['signal'+str(periods[i])] = np.where(df['return'+str(periods[i])] > thresholds[i], 
                                                    1 ,
                                                   np.where(df['return'+str(periods[i])] < -thresholds[i],-1,0)
                                                   )

### trading signal functions

In [15]:
def crossover(shortterm, longterm):
    if len(shortterm)==len(longterm):
        temp = np.zeros(len(shortterm))
        for i in range(1, len(shortterm)):
            temp[i] = 1 if shortterm[i] >= longterm[i] and shortterm[i-1] < longterm[i-1] \
                                   else -1 if shortterm[i] <= longterm[i] and shortterm[i-1] > longterm[i-1] \
                                   else 0
    else:
        raise ValueError('Lengths do not match')
    return temp

In [16]:
def macd(close,fastp,slowp,signalp):
    macd, macd_signal, macdhist = np.array(talib.MACD(close, fastperiod=fastp, slowperiod=slowp, signalperiod=signalp))
    
    res = crossover(macd,macd_signal)

    return res

## Taking top 12 simple rule

In [19]:
df['WMA5-10'] = crossover(talib.WMA(df['PX_LAST'], timeperiod=5), talib.WMA(df['PX_LAST'], timeperiod=10))
df['EMA5-10'] = crossover(talib.EMA(df['PX_LAST'], timeperiod=5), talib.EMA(df['PX_LAST'], timeperiod=10))
df['DEMA5-10'] = crossover(talib.DEMA(df['PX_LAST'], timeperiod=5), talib.DEMA(df['PX_LAST'], timeperiod=10))
df['MACD_12_29_9'] = macd(df['PX_LAST'], fastp = 12, slowp = 29,signalp = 9)
df['MACD_5_35_5']= macd(df['PX_LAST'], fastp = 5, slowp = 35,signalp = 5)
df['SMA5-10'] = crossover(talib.SMA(df['PX_LAST'], timeperiod=5), talib.SMA(df['PX_LAST'], timeperiod=10))
df['DEMA5-10'] = crossover(talib.DEMA(df['PX_LAST'], timeperiod=5), talib.DEMA(df['PX_LAST'], timeperiod=10))
df['WMA14'] = crossover(df['PX_LAST'], talib.WMA(df['PX_LAST'], timeperiod=15))
df['SMA14'] = crossover(df['PX_LAST'], talib.SMA(df['PX_LAST'], timeperiod=15))
df['SMA50'] = crossover(df['PX_LAST'], talib.SMA(df['PX_LAST'], timeperiod=50))
df['EMA14'] = crossover(df['PX_LAST'], talib.EMA(df['PX_LAST'], timeperiod=15))
df['EMA50'] = crossover(df['PX_LAST'], talib.EMA(df['PX_LAST'], timeperiod=50))

In [24]:
df

Unnamed: 0,Dates,PX_VOLUME,PX_LAST,PX_LOW,PX_HIGH,PX_OPEN,return3,return5,return10,signal3,...,WMA14_SMA14,WMA14_SMA50,WMA14_EMA14,WMA14_EMA50,SMA14_SMA50,SMA14_EMA14,SMA14_EMA50,SMA50_EMA14,SMA50_EMA50,EMA14_EMA50
0,2/5/2004,23555600.0,24.70,23.30,24.94,23.75,,,,0,...,0,0,0,0,0,0,0,0,0,0
1,2/6/2004,4715500.0,24.85,24.20,24.94,24.71,,,,0,...,0,0,0,0,0,0,0,0,0,0
2,2/9/2004,2898200.0,24.47,24.00,24.70,24.60,,,,0,...,0,0,0,0,0,0,0,0,0,0
3,2/10/2004,1566500.0,24.35,24.05,24.47,24.15,-0.014170,,,-1,...,0,0,0,0,0,0,0,0,0,0
4,2/11/2004,1939400.0,24.25,24.16,24.50,24.28,-0.024145,,,-1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454714,1/28/2020,4182.0,11.11,11.11,11.19,11.16,-0.008036,-0.008036,-0.014197,-1,...,0,0,0,0,0,0,0,0,0,0
454715,1/29/2020,6486.0,10.94,10.92,11.06,11.06,-0.023214,-0.023214,-0.028419,-1,...,0,0,0,0,0,0,0,0,0,0
454716,1/30/2020,5356.0,11.05,10.91,11.05,10.91,-0.009857,-0.013393,-0.025573,-1,...,0,0,0,0,0,0,0,0,0,0
454717,1/31/2020,15047.0,10.70,10.67,10.97,10.97,-0.036904,-0.044643,-0.044643,-1,...,0,0,0,0,0,0,0,0,0,0


## 2 way simple rules

In [20]:
columns =  [x for x in list(df.columns) if x not in ['Unnamed: 0','Dates', 'PX_VOLUME','PX_LAST','PX_LOW','PX_HIGH','PX_OPEN',
                                                              'return3','return5','return10', 'signal3', 'signal5', 'signal10']]
for i in range(len(columns)):
    for j in range(i+1, len(columns)):
        df[columns[i] + '_' + columns[j]] = np.array([1 if x+y>0 else -1 if x+y<0 else 0 for (x, y) in zip(df[columns[i]], df[columns[j]])])


In [22]:
df.to_csv('my_workv2')

In [23]:
pd.read_csv('my_workv2')      # contains simple and 2 way rules

Unnamed: 0.1,Unnamed: 0,Dates,PX_VOLUME,PX_LAST,PX_LOW,PX_HIGH,PX_OPEN,return3,return5,return10,...,WMA14_SMA14,WMA14_SMA50,WMA14_EMA14,WMA14_EMA50,SMA14_SMA50,SMA14_EMA14,SMA14_EMA50,SMA50_EMA14,SMA50_EMA50,EMA14_EMA50
0,0,2/5/2004,23555600.0,24.70,23.30,24.94,23.75,,,,...,0,0,0,0,0,0,0,0,0,0
1,1,2/6/2004,4715500.0,24.85,24.20,24.94,24.71,,,,...,0,0,0,0,0,0,0,0,0,0
2,2,2/9/2004,2898200.0,24.47,24.00,24.70,24.60,,,,...,0,0,0,0,0,0,0,0,0,0
3,3,2/10/2004,1566500.0,24.35,24.05,24.47,24.15,-0.014170,,,...,0,0,0,0,0,0,0,0,0,0
4,4,2/11/2004,1939400.0,24.25,24.16,24.50,24.28,-0.024145,,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454714,454714,1/28/2020,4182.0,11.11,11.11,11.19,11.16,-0.008036,-0.008036,-0.014197,...,0,0,0,0,0,0,0,0,0,0
454715,454715,1/29/2020,6486.0,10.94,10.92,11.06,11.06,-0.023214,-0.023214,-0.028419,...,0,0,0,0,0,0,0,0,0,0
454716,454716,1/30/2020,5356.0,11.05,10.91,11.05,10.91,-0.009857,-0.013393,-0.025573,...,0,0,0,0,0,0,0,0,0,0
454717,454717,1/31/2020,15047.0,10.70,10.67,10.97,10.97,-0.036904,-0.044643,-0.044643,...,0,0,0,0,0,0,0,0,0,0


In [25]:
df.columns

Index(['Dates', 'PX_VOLUME', 'PX_LAST', 'PX_LOW', 'PX_HIGH', 'PX_OPEN',
       'return3', 'return5', 'return10', 'signal3', 'signal5', 'signal10',
       'WMA5-10', 'EMA5-10', 'DEMA5-10', 'MACD_12_29_9', 'MACD_5_35_5',
       'SMA5-10', 'WMA14', 'SMA14', 'SMA50', 'EMA14', 'EMA50',
       'WMA5-10_EMA5-10', 'WMA5-10_DEMA5-10', 'WMA5-10_MACD_12_29_9',
       'WMA5-10_MACD_5_35_5', 'WMA5-10_SMA5-10', 'WMA5-10_WMA14',
       'WMA5-10_SMA14', 'WMA5-10_SMA50', 'WMA5-10_EMA14', 'WMA5-10_EMA50',
       'EMA5-10_DEMA5-10', 'EMA5-10_MACD_12_29_9', 'EMA5-10_MACD_5_35_5',
       'EMA5-10_SMA5-10', 'EMA5-10_WMA14', 'EMA5-10_SMA14', 'EMA5-10_SMA50',
       'EMA5-10_EMA14', 'EMA5-10_EMA50', 'DEMA5-10_MACD_12_29_9',
       'DEMA5-10_MACD_5_35_5', 'DEMA5-10_SMA5-10', 'DEMA5-10_WMA14',
       'DEMA5-10_SMA14', 'DEMA5-10_SMA50', 'DEMA5-10_EMA14', 'DEMA5-10_EMA50',
       'MACD_12_29_9_MACD_5_35_5', 'MACD_12_29_9_SMA5-10',
       'MACD_12_29_9_WMA14', 'MACD_12_29_9_SMA14', 'MACD_12_29_9_SMA50',
       'MA

In [26]:
### putting the dependent variables - signal3,signal5,signal10 at the end of the df for ease to split
#the data for training

dependent_columns = df[['signal3','signal5','signal10']]
dependent_columns

df.drop(columns=['signal3','signal5','signal10'],inplace=True)
df



Unnamed: 0,Dates,PX_VOLUME,PX_LAST,PX_LOW,PX_HIGH,PX_OPEN,return3,return5,return10,WMA5-10,...,WMA14_SMA14,WMA14_SMA50,WMA14_EMA14,WMA14_EMA50,SMA14_SMA50,SMA14_EMA14,SMA14_EMA50,SMA50_EMA14,SMA50_EMA50,EMA14_EMA50
0,2/5/2004,23555600.0,24.70,23.30,24.94,23.75,,,,0.0,...,0,0,0,0,0,0,0,0,0,0
1,2/6/2004,4715500.0,24.85,24.20,24.94,24.71,,,,0.0,...,0,0,0,0,0,0,0,0,0,0
2,2/9/2004,2898200.0,24.47,24.00,24.70,24.60,,,,0.0,...,0,0,0,0,0,0,0,0,0,0
3,2/10/2004,1566500.0,24.35,24.05,24.47,24.15,-0.014170,,,0.0,...,0,0,0,0,0,0,0,0,0,0
4,2/11/2004,1939400.0,24.25,24.16,24.50,24.28,-0.024145,,,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454714,1/28/2020,4182.0,11.11,11.11,11.19,11.16,-0.008036,-0.008036,-0.014197,0.0,...,0,0,0,0,0,0,0,0,0,0
454715,1/29/2020,6486.0,10.94,10.92,11.06,11.06,-0.023214,-0.023214,-0.028419,0.0,...,0,0,0,0,0,0,0,0,0,0
454716,1/30/2020,5356.0,11.05,10.91,11.05,10.91,-0.009857,-0.013393,-0.025573,0.0,...,0,0,0,0,0,0,0,0,0,0
454717,1/31/2020,15047.0,10.70,10.67,10.97,10.97,-0.036904,-0.044643,-0.044643,0.0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
df

In [27]:
df = pd.concat([df,dependent_columns], axis=1)
df

Unnamed: 0,Dates,PX_VOLUME,PX_LAST,PX_LOW,PX_HIGH,PX_OPEN,return3,return5,return10,WMA5-10,...,WMA14_EMA50,SMA14_SMA50,SMA14_EMA14,SMA14_EMA50,SMA50_EMA14,SMA50_EMA50,EMA14_EMA50,signal3,signal5,signal10
0,2/5/2004,23555600.0,24.70,23.30,24.94,23.75,,,,0.0,...,0,0,0,0,0,0,0,0,0,0
1,2/6/2004,4715500.0,24.85,24.20,24.94,24.71,,,,0.0,...,0,0,0,0,0,0,0,0,0,0
2,2/9/2004,2898200.0,24.47,24.00,24.70,24.60,,,,0.0,...,0,0,0,0,0,0,0,0,0,0
3,2/10/2004,1566500.0,24.35,24.05,24.47,24.15,-0.014170,,,0.0,...,0,0,0,0,0,0,0,-1,0,0
4,2/11/2004,1939400.0,24.25,24.16,24.50,24.28,-0.024145,,,0.0,...,0,0,0,0,0,0,0,-1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454714,1/28/2020,4182.0,11.11,11.11,11.19,11.16,-0.008036,-0.008036,-0.014197,0.0,...,0,0,0,0,0,0,0,-1,-1,-1
454715,1/29/2020,6486.0,10.94,10.92,11.06,11.06,-0.023214,-0.023214,-0.028419,0.0,...,0,0,0,0,0,0,0,-1,-1,-1
454716,1/30/2020,5356.0,11.05,10.91,11.05,10.91,-0.009857,-0.013393,-0.025573,0.0,...,0,0,0,0,0,0,0,-1,-1,-1
454717,1/31/2020,15047.0,10.70,10.67,10.97,10.97,-0.036904,-0.044643,-0.044643,0.0,...,0,0,0,0,0,0,0,-1,-1,-1


In [None]:
df.columns

## 3 way simple rule

In [None]:
# for i in range(len(columns)):
#     for j in range(i+1, len(columns)):
#         for k in range(j+1, len(columns)):
#             df[columns[i] + '_' + columns[j] + '_' + columns[k]] = np.array([1 if x+y+z>0 else -1 if x+y+z<0 else 0 for (x, y, z) in zip(df[columns[i]], df[columns[j]], df[columns[k]])])

### Model 1 (3 day return)

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

##### For 3 day return

In [29]:
df_3day = df.copy()
df_3day

Unnamed: 0,Dates,PX_VOLUME,PX_LAST,PX_LOW,PX_HIGH,PX_OPEN,return3,return5,return10,WMA5-10,...,WMA14_EMA50,SMA14_SMA50,SMA14_EMA14,SMA14_EMA50,SMA50_EMA14,SMA50_EMA50,EMA14_EMA50,signal3,signal5,signal10
0,2/5/2004,23555600.0,24.70,23.30,24.94,23.75,,,,0.0,...,0,0,0,0,0,0,0,0,0,0
1,2/6/2004,4715500.0,24.85,24.20,24.94,24.71,,,,0.0,...,0,0,0,0,0,0,0,0,0,0
2,2/9/2004,2898200.0,24.47,24.00,24.70,24.60,,,,0.0,...,0,0,0,0,0,0,0,0,0,0
3,2/10/2004,1566500.0,24.35,24.05,24.47,24.15,-0.014170,,,0.0,...,0,0,0,0,0,0,0,-1,0,0
4,2/11/2004,1939400.0,24.25,24.16,24.50,24.28,-0.024145,,,0.0,...,0,0,0,0,0,0,0,-1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454714,1/28/2020,4182.0,11.11,11.11,11.19,11.16,-0.008036,-0.008036,-0.014197,0.0,...,0,0,0,0,0,0,0,-1,-1,-1
454715,1/29/2020,6486.0,10.94,10.92,11.06,11.06,-0.023214,-0.023214,-0.028419,0.0,...,0,0,0,0,0,0,0,-1,-1,-1
454716,1/30/2020,5356.0,11.05,10.91,11.05,10.91,-0.009857,-0.013393,-0.025573,0.0,...,0,0,0,0,0,0,0,-1,-1,-1
454717,1/31/2020,15047.0,10.70,10.67,10.97,10.97,-0.036904,-0.044643,-0.044643,0.0,...,0,0,0,0,0,0,0,-1,-1,-1


In [30]:
df_3day.drop (columns = ['signal5','signal10'],inplace=True)
df_3day

Unnamed: 0,Dates,PX_VOLUME,PX_LAST,PX_LOW,PX_HIGH,PX_OPEN,return3,return5,return10,WMA5-10,...,WMA14_SMA50,WMA14_EMA14,WMA14_EMA50,SMA14_SMA50,SMA14_EMA14,SMA14_EMA50,SMA50_EMA14,SMA50_EMA50,EMA14_EMA50,signal3
0,2/5/2004,23555600.0,24.70,23.30,24.94,23.75,,,,0.0,...,0,0,0,0,0,0,0,0,0,0
1,2/6/2004,4715500.0,24.85,24.20,24.94,24.71,,,,0.0,...,0,0,0,0,0,0,0,0,0,0
2,2/9/2004,2898200.0,24.47,24.00,24.70,24.60,,,,0.0,...,0,0,0,0,0,0,0,0,0,0
3,2/10/2004,1566500.0,24.35,24.05,24.47,24.15,-0.014170,,,0.0,...,0,0,0,0,0,0,0,0,0,-1
4,2/11/2004,1939400.0,24.25,24.16,24.50,24.28,-0.024145,,,0.0,...,0,0,0,0,0,0,0,0,0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454714,1/28/2020,4182.0,11.11,11.11,11.19,11.16,-0.008036,-0.008036,-0.014197,0.0,...,0,0,0,0,0,0,0,0,0,-1
454715,1/29/2020,6486.0,10.94,10.92,11.06,11.06,-0.023214,-0.023214,-0.028419,0.0,...,0,0,0,0,0,0,0,0,0,-1
454716,1/30/2020,5356.0,11.05,10.91,11.05,10.91,-0.009857,-0.013393,-0.025573,0.0,...,0,0,0,0,0,0,0,0,0,-1
454717,1/31/2020,15047.0,10.70,10.67,10.97,10.97,-0.036904,-0.044643,-0.044643,0.0,...,0,0,0,0,0,0,0,0,0,-1


In [51]:
df.isnull().any(axis=1)

0          True
1          True
2          True
3          True
4          True
          ...  
454714    False
454715    False
454716    False
454717    False
454718    False
Length: 454719, dtype: bool

In [52]:
df_3day.fillna(0, inplace=True)      # since return3,5,10 columns had nan value

In [53]:
df_3day

Unnamed: 0,Dates,PX_VOLUME,PX_LAST,PX_LOW,PX_HIGH,PX_OPEN,return3,return5,return10,WMA5-10,...,WMA14_SMA50,WMA14_EMA14,WMA14_EMA50,SMA14_SMA50,SMA14_EMA14,SMA14_EMA50,SMA50_EMA14,SMA50_EMA50,EMA14_EMA50,signal3
0,2/5/2004,23555600.0,24.70,23.30,24.94,23.75,0.000000,0.000000,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,0
1,2/6/2004,4715500.0,24.85,24.20,24.94,24.71,0.000000,0.000000,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,0
2,2/9/2004,2898200.0,24.47,24.00,24.70,24.60,0.000000,0.000000,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,0
3,2/10/2004,1566500.0,24.35,24.05,24.47,24.15,-0.014170,0.000000,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,-1
4,2/11/2004,1939400.0,24.25,24.16,24.50,24.28,-0.024145,0.000000,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454714,1/28/2020,4182.0,11.11,11.11,11.19,11.16,-0.008036,-0.008036,-0.014197,0.0,...,0,0,0,0,0,0,0,0,0,-1
454715,1/29/2020,6486.0,10.94,10.92,11.06,11.06,-0.023214,-0.023214,-0.028419,0.0,...,0,0,0,0,0,0,0,0,0,-1
454716,1/30/2020,5356.0,11.05,10.91,11.05,10.91,-0.009857,-0.013393,-0.025573,0.0,...,0,0,0,0,0,0,0,0,0,-1
454717,1/31/2020,15047.0,10.70,10.67,10.97,10.97,-0.036904,-0.044643,-0.044643,0.0,...,0,0,0,0,0,0,0,0,0,-1


In [54]:
train, test = train_test_split(df_3day, test_size = 0.3)
x_train = train.iloc[0:,1:75]        # excluding date column
y_train = train['signal3']

x_test = test.iloc[0:,1:75]
y_test = test['signal3']

In [55]:
model = RandomForestClassifier(n_estimators = 100, oob_score = True, max_depth = 10, min_samples_split = 8, min_samples_leaf = 4)
model.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=8,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [56]:
test_prediction= model.predict(x_test)

In [59]:
print("Test Accuracy", model.score(x_test,y_test))

Test Accuracy 0.9864825240441004


In [60]:
from sklearn.metrics import confusion_matrix
p_train=model.predict(x_train)
p_test = model.predict(x_test)
print(confusion_matrix(p_train,np.array(y_train)))
print(confusion_matrix(p_test,np.array(y_test)))

[[147366   3571      0]
 [     0   7702      0]
 [     0    325 159339]]
[[63358  1652     0]
 [    0  3092     0]
 [    0   192 68122]]


In [61]:
pd.crosstab(y_test, p_test, rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,-1,0,1,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,63358,0,0,63358
0,1652,3092,192,4936
1,0,0,68122,68122
All,65010,3092,68314,136416


### Model 2 (5 day return)

In [62]:
df_5day = df.copy()
df_5day

Unnamed: 0,Dates,PX_VOLUME,PX_LAST,PX_LOW,PX_HIGH,PX_OPEN,return3,return5,return10,WMA5-10,...,WMA14_EMA50,SMA14_SMA50,SMA14_EMA14,SMA14_EMA50,SMA50_EMA14,SMA50_EMA50,EMA14_EMA50,signal3,signal5,signal10
0,2/5/2004,23555600.0,24.70,23.30,24.94,23.75,,,,0.0,...,0,0,0,0,0,0,0,0,0,0
1,2/6/2004,4715500.0,24.85,24.20,24.94,24.71,,,,0.0,...,0,0,0,0,0,0,0,0,0,0
2,2/9/2004,2898200.0,24.47,24.00,24.70,24.60,,,,0.0,...,0,0,0,0,0,0,0,0,0,0
3,2/10/2004,1566500.0,24.35,24.05,24.47,24.15,-0.014170,,,0.0,...,0,0,0,0,0,0,0,-1,0,0
4,2/11/2004,1939400.0,24.25,24.16,24.50,24.28,-0.024145,,,0.0,...,0,0,0,0,0,0,0,-1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454714,1/28/2020,4182.0,11.11,11.11,11.19,11.16,-0.008036,-0.008036,-0.014197,0.0,...,0,0,0,0,0,0,0,-1,-1,-1
454715,1/29/2020,6486.0,10.94,10.92,11.06,11.06,-0.023214,-0.023214,-0.028419,0.0,...,0,0,0,0,0,0,0,-1,-1,-1
454716,1/30/2020,5356.0,11.05,10.91,11.05,10.91,-0.009857,-0.013393,-0.025573,0.0,...,0,0,0,0,0,0,0,-1,-1,-1
454717,1/31/2020,15047.0,10.70,10.67,10.97,10.97,-0.036904,-0.044643,-0.044643,0.0,...,0,0,0,0,0,0,0,-1,-1,-1


In [63]:
df_5day.drop (columns = ['signal3','signal10'],inplace=True)
df_5day.fillna(0, inplace=True)
df_5day

Unnamed: 0,Dates,PX_VOLUME,PX_LAST,PX_LOW,PX_HIGH,PX_OPEN,return3,return5,return10,WMA5-10,...,WMA14_SMA50,WMA14_EMA14,WMA14_EMA50,SMA14_SMA50,SMA14_EMA14,SMA14_EMA50,SMA50_EMA14,SMA50_EMA50,EMA14_EMA50,signal5
0,2/5/2004,23555600.0,24.70,23.30,24.94,23.75,0.000000,0.000000,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,0
1,2/6/2004,4715500.0,24.85,24.20,24.94,24.71,0.000000,0.000000,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,0
2,2/9/2004,2898200.0,24.47,24.00,24.70,24.60,0.000000,0.000000,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,0
3,2/10/2004,1566500.0,24.35,24.05,24.47,24.15,-0.014170,0.000000,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,0
4,2/11/2004,1939400.0,24.25,24.16,24.50,24.28,-0.024145,0.000000,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454714,1/28/2020,4182.0,11.11,11.11,11.19,11.16,-0.008036,-0.008036,-0.014197,0.0,...,0,0,0,0,0,0,0,0,0,-1
454715,1/29/2020,6486.0,10.94,10.92,11.06,11.06,-0.023214,-0.023214,-0.028419,0.0,...,0,0,0,0,0,0,0,0,0,-1
454716,1/30/2020,5356.0,11.05,10.91,11.05,10.91,-0.009857,-0.013393,-0.025573,0.0,...,0,0,0,0,0,0,0,0,0,-1
454717,1/31/2020,15047.0,10.70,10.67,10.97,10.97,-0.036904,-0.044643,-0.044643,0.0,...,0,0,0,0,0,0,0,0,0,-1


In [88]:
train, test = train_test_split(df_5day, test_size = 0.3)
x_train = train.iloc[0:,1:75]        # excluding date column
y_train = train['signal5']

x_test = test.iloc[0:,1:75]
y_test = test['signal5']

In [89]:
x_train

Unnamed: 0,PX_VOLUME,PX_LAST,PX_LOW,PX_HIGH,PX_OPEN,return3,return5,return10,WMA5-10,EMA5-10,...,WMA14_SMA14,WMA14_SMA50,WMA14_EMA14,WMA14_EMA50,SMA14_SMA50,SMA14_EMA14,SMA14_EMA50,SMA50_EMA14,SMA50_EMA50,EMA14_EMA50
337356,26077.0,9.1500,8.9600,9.1500,9.1500,0.018931,0.046911,0.062718,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3608,121721.0,98.2000,98.0500,100.2500,99.8500,-0.026470,-0.017607,-0.002033,-1.0,0.0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
363307,177692.0,144.8200,143.8600,145.5800,144.2400,0.007233,0.022740,0.033174,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
378617,4168752.0,46.5900,46.5200,47.2300,46.6200,-0.003422,-0.025110,-0.019364,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
433268,136233.0,36.3000,36.2500,36.6100,36.5300,-0.013855,-0.006840,0.011988,0.0,-1.0,...,-1,-1,-1,-1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64952,71064.0,19.2000,19.0000,19.3700,19.3600,0.000000,-0.012853,0.007874,0.0,0.0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
24498,5000.0,11.3125,11.2500,11.3750,11.2500,-0.010929,-0.010929,0.000000,0.0,0.0,...,1,1,1,1,1,1,1,1,0,1
190766,2015.0,17.6700,17.4300,17.6700,17.4300,-0.004507,0.008562,0.036972,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
432717,227000.0,25.5700,24.9800,25.6100,25.1800,0.009475,0.030633,0.017914,0.0,1.0,...,0,1,1,1,1,1,1,1,1,1


In [90]:
model = RandomForestClassifier(n_estimators = 100, oob_score = True, max_depth = 10, min_samples_split = 8, min_samples_leaf = 4)
model.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=8,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [91]:
test_prediction= model.predict(x_test)
print("Test Accuracy", model.score(x_test,y_test))

Test Accuracy 0.988315153647666


In [92]:
from sklearn.metrics import confusion_matrix
p_train=model.predict(x_train)
p_test = model.predict(x_test)
print(confusion_matrix(p_train,np.array(y_train)))
print(confusion_matrix(p_test,np.array(y_test)))

[[147929   3099      0]
 [     0   4960      0]
 [     1    146 162168]]
[[63231  1497     0]
 [    0  2053     0]
 [    0    97 69538]]


In [93]:
pd.crosstab(y_test, p_test, rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,-1,0,1,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,63231,0,0,63231
0,1497,2053,97,3647
1,0,0,69538,69538
All,64728,2053,69635,136416


### Model 3 (10 day return)

In [72]:
df_10day = df.copy()
df_10day

Unnamed: 0,Dates,PX_VOLUME,PX_LAST,PX_LOW,PX_HIGH,PX_OPEN,return3,return5,return10,WMA5-10,...,WMA14_EMA50,SMA14_SMA50,SMA14_EMA14,SMA14_EMA50,SMA50_EMA14,SMA50_EMA50,EMA14_EMA50,signal3,signal5,signal10
0,2/5/2004,23555600.0,24.70,23.30,24.94,23.75,,,,0.0,...,0,0,0,0,0,0,0,0,0,0
1,2/6/2004,4715500.0,24.85,24.20,24.94,24.71,,,,0.0,...,0,0,0,0,0,0,0,0,0,0
2,2/9/2004,2898200.0,24.47,24.00,24.70,24.60,,,,0.0,...,0,0,0,0,0,0,0,0,0,0
3,2/10/2004,1566500.0,24.35,24.05,24.47,24.15,-0.014170,,,0.0,...,0,0,0,0,0,0,0,-1,0,0
4,2/11/2004,1939400.0,24.25,24.16,24.50,24.28,-0.024145,,,0.0,...,0,0,0,0,0,0,0,-1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454714,1/28/2020,4182.0,11.11,11.11,11.19,11.16,-0.008036,-0.008036,-0.014197,0.0,...,0,0,0,0,0,0,0,-1,-1,-1
454715,1/29/2020,6486.0,10.94,10.92,11.06,11.06,-0.023214,-0.023214,-0.028419,0.0,...,0,0,0,0,0,0,0,-1,-1,-1
454716,1/30/2020,5356.0,11.05,10.91,11.05,10.91,-0.009857,-0.013393,-0.025573,0.0,...,0,0,0,0,0,0,0,-1,-1,-1
454717,1/31/2020,15047.0,10.70,10.67,10.97,10.97,-0.036904,-0.044643,-0.044643,0.0,...,0,0,0,0,0,0,0,-1,-1,-1


In [73]:
df_10day.drop (columns = ['signal3','signal5'],inplace=True)
df_10day.fillna(0, inplace=True)
df_10day

Unnamed: 0,Dates,PX_VOLUME,PX_LAST,PX_LOW,PX_HIGH,PX_OPEN,return3,return5,return10,WMA5-10,...,WMA14_SMA50,WMA14_EMA14,WMA14_EMA50,SMA14_SMA50,SMA14_EMA14,SMA14_EMA50,SMA50_EMA14,SMA50_EMA50,EMA14_EMA50,signal10
0,2/5/2004,23555600.0,24.70,23.30,24.94,23.75,0.000000,0.000000,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,0
1,2/6/2004,4715500.0,24.85,24.20,24.94,24.71,0.000000,0.000000,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,0
2,2/9/2004,2898200.0,24.47,24.00,24.70,24.60,0.000000,0.000000,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,0
3,2/10/2004,1566500.0,24.35,24.05,24.47,24.15,-0.014170,0.000000,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,0
4,2/11/2004,1939400.0,24.25,24.16,24.50,24.28,-0.024145,0.000000,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454714,1/28/2020,4182.0,11.11,11.11,11.19,11.16,-0.008036,-0.008036,-0.014197,0.0,...,0,0,0,0,0,0,0,0,0,-1
454715,1/29/2020,6486.0,10.94,10.92,11.06,11.06,-0.023214,-0.023214,-0.028419,0.0,...,0,0,0,0,0,0,0,0,0,-1
454716,1/30/2020,5356.0,11.05,10.91,11.05,10.91,-0.009857,-0.013393,-0.025573,0.0,...,0,0,0,0,0,0,0,0,0,-1
454717,1/31/2020,15047.0,10.70,10.67,10.97,10.97,-0.036904,-0.044643,-0.044643,0.0,...,0,0,0,0,0,0,0,0,0,-1


In [82]:
train, test = train_test_split(df_10day, test_size = 0.3)
x_train = train.iloc[0:,1:75]        # excluding date column
y_train = train['signal10']

x_test = test.iloc[0:,1:75]
y_test = test['signal10']

In [83]:
x_train

Unnamed: 0,PX_VOLUME,PX_LAST,PX_LOW,PX_HIGH,PX_OPEN,return3,return5,return10,WMA5-10,EMA5-10,...,WMA14_SMA14,WMA14_SMA50,WMA14_EMA14,WMA14_EMA50,SMA14_SMA50,SMA14_EMA14,SMA14_EMA50,SMA50_EMA14,SMA50_EMA50,EMA14_EMA50
423728,5859.0,18.0700,17.9750,18.1900,18.1000,-0.031099,-0.028495,-0.058854,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
205902,228849.0,22.3800,22.1950,22.4800,22.3400,0.013587,0.048244,0.083253,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
405626,5879.0,30.0400,29.6600,30.0600,29.6800,-0.025308,-0.046652,-0.046349,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
214367,200067.0,18.2100,18.0500,18.6800,18.5900,-0.061340,-0.131617,-0.115160,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
358500,2377.0,29.2600,29.2600,29.7800,29.7800,-0.005438,0.001026,0.001369,0.0,0.0,...,-1,-1,-1,-1,-1,-1,-1,-1,0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
309670,424680.0,44.7400,44.3700,45.1900,44.5400,-0.040326,-0.033693,-0.057709,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
81339,8666.0,20.6400,20.6400,21.0500,21.0500,-0.018078,-0.016675,-0.034160,0.0,0.0,...,-1,-1,-1,-1,0,0,0,0,0,0
364150,5105.0,5.8412,5.8412,5.9372,5.9372,-0.016169,-0.006987,0.054463,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
269705,165673.0,44.0200,43.0400,44.1000,43.1400,0.010096,0.036740,0.028745,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [84]:
model = RandomForestClassifier(n_estimators = 100, oob_score = True, max_depth = 10, min_samples_split = 8, min_samples_leaf = 4)
model.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=8,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [85]:
test_prediction= model.predict(x_test)
print("Test Accuracy", model.score(x_test,y_test))

Test Accuracy 0.9850750645085621


In [86]:
from sklearn.metrics import confusion_matrix
p_train=model.predict(x_train)
p_test = model.predict(x_test)
print(confusion_matrix(p_train,np.array(y_train)))
print(confusion_matrix(p_test,np.array(y_test)))

[[139714   4271      0]
 [     5  14823      0]
 [     0     77 159413]]
[[59878  1987     0]
 [    3  6132     0]
 [    0    46 68370]]


In [87]:
pd.crosstab(y_test, p_test, rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,-1,0,1,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,59878,3,0,59881
0,1987,6132,46,8165
1,0,0,68370,68370
All,61865,6135,68416,136416


#### Model with PX_VOLUME, PX_LAST,	PX_LOW, PX_HIGH,	PX_OPEN columns removed

In [105]:
tempdf = df.copy()
tempdf

Unnamed: 0,Dates,PX_VOLUME,PX_LAST,PX_LOW,PX_HIGH,PX_OPEN,return3,return5,return10,WMA5-10,...,WMA14_EMA50,SMA14_SMA50,SMA14_EMA14,SMA14_EMA50,SMA50_EMA14,SMA50_EMA50,EMA14_EMA50,signal3,signal5,signal10
0,2/5/2004,23555600.0,24.70,23.30,24.94,23.75,,,,0.0,...,0,0,0,0,0,0,0,0,0,0
1,2/6/2004,4715500.0,24.85,24.20,24.94,24.71,,,,0.0,...,0,0,0,0,0,0,0,0,0,0
2,2/9/2004,2898200.0,24.47,24.00,24.70,24.60,,,,0.0,...,0,0,0,0,0,0,0,0,0,0
3,2/10/2004,1566500.0,24.35,24.05,24.47,24.15,-0.014170,,,0.0,...,0,0,0,0,0,0,0,-1,0,0
4,2/11/2004,1939400.0,24.25,24.16,24.50,24.28,-0.024145,,,0.0,...,0,0,0,0,0,0,0,-1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454714,1/28/2020,4182.0,11.11,11.11,11.19,11.16,-0.008036,-0.008036,-0.014197,0.0,...,0,0,0,0,0,0,0,-1,-1,-1
454715,1/29/2020,6486.0,10.94,10.92,11.06,11.06,-0.023214,-0.023214,-0.028419,0.0,...,0,0,0,0,0,0,0,-1,-1,-1
454716,1/30/2020,5356.0,11.05,10.91,11.05,10.91,-0.009857,-0.013393,-0.025573,0.0,...,0,0,0,0,0,0,0,-1,-1,-1
454717,1/31/2020,15047.0,10.70,10.67,10.97,10.97,-0.036904,-0.044643,-0.044643,0.0,...,0,0,0,0,0,0,0,-1,-1,-1


In [106]:
tempdf.drop(columns=['return3', 'return5','return10','signal5','signal10','PX_LAST','PX_LOW','PX_HIGH','PX_OPEN'],inplace =True)

In [109]:
tempdf.drop(columns=['PX_VOLUME'],inplace=True)

In [110]:
tempdf

Unnamed: 0,Dates,WMA5-10,EMA5-10,DEMA5-10,MACD_12_29_9,MACD_5_35_5,SMA5-10,WMA14,SMA14,SMA50,...,WMA14_SMA50,WMA14_EMA14,WMA14_EMA50,SMA14_SMA50,SMA14_EMA14,SMA14_EMA50,SMA50_EMA14,SMA50_EMA50,EMA14_EMA50,signal3
0,2/5/2004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,2/6/2004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,2/9/2004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,2/10/2004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,-1
4,2/11/2004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454714,1/28/2020,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,-1
454715,1/29/2020,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,-1
454716,1/30/2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,-1
454717,1/31/2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,-1


In [111]:
train, test = train_test_split(tempdf, test_size = 0.3)
x_train = train.iloc[0:,1:67]        # excluding date column
y_train = train['signal3']

x_test = test.iloc[0:,1:67]
y_test = test['signal3']

In [112]:
x_train

Unnamed: 0,WMA5-10,EMA5-10,DEMA5-10,MACD_12_29_9,MACD_5_35_5,SMA5-10,WMA14,SMA14,SMA50,EMA14,...,WMA14_SMA14,WMA14_SMA50,WMA14_EMA14,WMA14_EMA50,SMA14_SMA50,SMA14_EMA14,SMA14_EMA50,SMA50_EMA14,SMA50_EMA50,EMA14_EMA50
421801,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1,0,0,0,1,1,1,0,0,0
75565,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,0.0,0.0,0.0,...,1,1,1,1,0,0,0,0,0,0
337090,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
45705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
263210,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,0.0,0.0,...,-1,-1,-1,-1,-1,-1,-1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
356630,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0,1,0,1,1,0,1,1,1,1
156953,0.0,0.0,0.0,-1.0,0.0,-1.0,-1.0,0.0,0.0,0.0,...,-1,-1,-1,-1,0,0,0,0,0,0
211969,0.0,-1.0,0.0,0.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,...,-1,0,-1,0,-1,-1,-1,-1,0,-1
47105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [113]:
model = RandomForestClassifier(n_estimators = 100, oob_score = True, max_depth = 10, min_samples_split = 8, min_samples_leaf = 4)
model.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=8,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [114]:
test_prediction= model.predict(x_test)
print("Test Accuracy", model.score(x_test,y_test))

Test Accuracy 0.6675316678395496
