In [47]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
import plotly.express as px
import pandas_ta as ta

In [48]:
RANDOM_SEED = 9

In [49]:
df = pd.read_csv('data/n50_data.csv')

In [50]:
df['Date'] = pd.to_datetime(df['Date'])
# Adjusted data available after 2010 only 
df = df[df['Date'] >= '2010-04-01']
df = df.dropna()
df = df.drop_duplicates()

It is important to set benchmark. Here we will assume stock price will remain same after 10 days. ML algorithm will try to beat that benchmark.

In [51]:
np.mean(abs(df['10d_pct_change']))

0.052174911475531244

MAE of price to remain unchanged is 0.0521. Our aim is to reduce this error.

In [52]:
df['10d_pct_change'].describe()

count    142086.000000
mean          0.051867
std           0.047028
min          -0.157364
25%           0.019178
50%           0.040294
75%           0.071247
max           0.787421
Name: 10d_pct_change, dtype: float64

In [53]:
df.shape

(142086, 14)

In [54]:
def log_model_performance(model_type, description, score, parameters):
    df_log = pd.read_csv('model_performance.csv')
    result_dict = {}
    result_dict['model_type'] = model_type
    result_dict['description'] = description
    result_dict['score'] = score
    result_dict['parameters'] = parameters
    df_log = df_log.append(result_dict, ignore_index=True)
    df_log.to_csv('model_performance.csv',index=False)

In [55]:
def get_all_ta_features(df):
    df_list = []
    df.set_index(pd.DatetimeIndex(df["Date"]), inplace=True)
    df = df.drop('Date', axis=1)
    dfg = df.groupby(['Symbol'])
    for grp in dfg.groups:
        x = dfg.get_group(grp).copy()
        x.ta.strategy()
        x = x.drop(['DPO_20','ICS_26'], axis=1)
        x.ta.dpo(centered=False)
        
        df_list.append(x)
    newdf = pd.concat(df_list)
    return newdf

In [56]:
def feature_engineering(df):
    # Processing symbol as labels.
    labelencoder = LabelEncoder()
    df['Symbol'] = labelencoder.fit_transform(df['Symbol'])
    # Splitting Dates
    #df['month'] = df['Date'].dt.month
    df['day'] = df['Date'].dt.day
    df['week_day'] = df['Date'].dt.weekday
    # Price based features
    df['day_return'] = (df['Close']/df['Open'])
    df['close_open_dff'] = df['Close'] - df['Open']
    # Trade based features
    df['basket_size'] = df['Volume'] / df['Trades']
    # All high and high flags
    df['248d_high'] = df.groupby('Symbol')['High'].rolling(window=248).max().droplevel(0)
    df['248d_high'] = df['248d_high'].fillna(-1)
    df['248d_high_flag'] = (df['248d_high'] == df['High']).astype('int')
    df['22d_high'] = df.groupby('Symbol')['High'].rolling(window=22).max().droplevel(0)
    df['22d_high'] = df['22d_high'].fillna(-1)
    df['22d_high_flag'] = (df['22d_high'] == df['High']).astype('int')
    df['10d_high'] = df.groupby('Symbol')['High'].rolling(window=10).max().droplevel(0)
    df['10d_high'] = df['10d_high'].fillna(-1)
    df['10d_high_flag'] = (df['10d_high'] == df['High']).astype('int')
    # shadow
    df['upper_shadow'] = df['High'] - np.maximum(df['Close'], df['Open'])
    df['lower_shadow'] = np.minimum(df['Close'], df['Open']) - df['Low']
    df["high_div_low"] = df["High"] / df["Low"]
    # scale all price columns
    price_columns = ['Open','High','Low','Last','Close','Average']
    for c in price_columns:
        df[c + '_scalled'] = df[c] / df['248d_high']
    return df, labelencoder

In [57]:
df = get_all_ta_features(df)

In [58]:
df = df.reset_index()

In [59]:
df, labelencoder = feature_engineering(df)

In [60]:
df = df.fillna(-999)

In [61]:
split_date = '2021-07-01'
train = df[df.Date < split_date].drop('Date',axis=1)
test = df[df.Date >= split_date].drop('Date',axis=1)

In [62]:
train.columns

Index(['Symbol', 'Open', 'High', 'Low', 'Last', 'Close', 'Average', 'Volume',
       'Turnover', 'Trades',
       ...
       '10d_high_flag', 'upper_shadow', 'lower_shadow', 'high_div_low',
       'Open_scalled', 'High_scalled', 'Low_scalled', 'Last_scalled',
       'Close_scalled', 'Average_scalled'],
      dtype='object', length=309)

In [63]:
lr = LinearRegression()
lr.fit(X = train.drop('10d_pct_change',axis=1), 
       y = train['10d_pct_change'])
predictions = lr.predict(test.drop('10d_pct_change',axis=1))
mae_score = mean_absolute_error(test['10d_pct_change'], predictions)
rmse_score = np.sqrt(mean_squared_error(test['10d_pct_change'], predictions))
log_model_performance("Linear Model", "day_return,range,basket_size,high,high_flag, shadow, scalled with all TA features except leaked and month", mae_score, "None")
print("Linear Regression : MAE is {}. RMSE is {}".format(mae_score, rmse_score))

Linear Regression : MAE is 0.03222477103909089. RMSE is 0.04543694639598105


In [64]:
knn = KNeighborsRegressor(n_neighbors=50)
knn.fit(X = train.drop('10d_pct_change',axis=1), 
       y = train['10d_pct_change'])
predictions = knn.predict(test.drop('10d_pct_change',axis=1))
mae_score = mean_absolute_error(test['10d_pct_change'], predictions)
rmse_score = np.sqrt(mean_squared_error(test['10d_pct_change'], predictions))
log_model_performance("KNN Model", "day_return,range,basket_size,high,high_flag, shadow, scalled with all TA features except leaked and month", mae_score, "n_neighbors=50")
print("KNN : MAE is {}. RMSE is {}".format(mae_score, rmse_score))

KNN : MAE is 0.034898768432147745. RMSE is 0.04606238598024023


In [65]:
"""
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(train.drop('10d_pct_change',axis=1))
y = sc_y.fit_transform(np.array(train['10d_pct_change']).reshape(-1, 1))
regressor = SVR(kernel = 'rbf')
regressor.fit(X, y)
predictions = regressor.predict(6.5)
predictions = sc_y.inverse_transform(predictions) 
mae_score = mean_absolute_error(test['10d_pct_change'], predictions)
rmse_score = np.sqrt(mean_squared_error(test['10d_pct_change'], predictions))
print("SVM : MAE is {}. RMSE is {}".format(mae_score, rmse_score))
"""

'\nsc_X = StandardScaler()\nsc_y = StandardScaler()\nX = sc_X.fit_transform(train.drop(\'10d_pct_change\',axis=1))\ny = sc_y.fit_transform(np.array(train[\'10d_pct_change\']).reshape(-1, 1))\nregressor = SVR(kernel = \'rbf\')\nregressor.fit(X, y)\npredictions = regressor.predict(6.5)\npredictions = sc_y.inverse_transform(predictions) \nmae_score = mean_absolute_error(test[\'10d_pct_change\'], predictions)\nrmse_score = np.sqrt(mean_squared_error(test[\'10d_pct_change\'], predictions))\nprint("SVM : MAE is {}. RMSE is {}".format(mae_score, rmse_score))\n'

In [66]:
dt = DecisionTreeRegressor(random_state=RANDOM_SEED)
dt.fit(X = train.drop('10d_pct_change',axis=1), 
       y = train['10d_pct_change'])
predictions = dt.predict(test.drop('10d_pct_change',axis=1))
mae_score = mean_absolute_error(test['10d_pct_change'], predictions)
rmse_score = np.sqrt(mean_squared_error(test['10d_pct_change'], predictions))
log_model_performance("Decision Tree", "day_return,range,basket_size,high,high_flag, shadow, scalled with all TA features", mae_score, "random_state=RANDOM_SEED")
print("Decision Tree : MAE is {}. RMSE is {}".format(mae_score, rmse_score))

Decision Tree : MAE is 0.04643334635405021. RMSE is 0.06698875997350258


In [67]:
"""
rf = RandomForestRegressor(random_state=RANDOM_SEED)
rf.fit(X = train.drop('10d_pct_change',axis=1), 
       y = train['10d_pct_change'])
predictions = rf.predict(test.drop('10d_pct_change',axis=1))
mae_score = mean_absolute_error(test['10d_pct_change'], predictions)
rmse_score = np.sqrt(mean_squared_error(test['10d_pct_change'], predictions))
log_model_performance("Random Forest", "day_return,range,basket_size,high,high_flag, shadow, scalled ", mae_score, "random_state=RANDOM_SEED")
print("Random Forest : MAE is {}. RMSE is {}".format(mae_score, rmse_score))-
"""

'\nrf = RandomForestRegressor(random_state=RANDOM_SEED)\nrf.fit(X = train.drop(\'10d_pct_change\',axis=1), \n       y = train[\'10d_pct_change\'])\npredictions = rf.predict(test.drop(\'10d_pct_change\',axis=1))\nmae_score = mean_absolute_error(test[\'10d_pct_change\'], predictions)\nrmse_score = np.sqrt(mean_squared_error(test[\'10d_pct_change\'], predictions))\nlog_model_performance("Random Forest", "day_return,range,basket_size,high,high_flag, shadow, scalled ", mae_score, "random_state=RANDOM_SEED")\nprint("Random Forest : MAE is {}. RMSE is {}".format(mae_score, rmse_score))-\n'

In [69]:
lgbm = LGBMRegressor(n_estimators=110, num_leaves=32, colsample_bytree=0.5)
lgbm.fit(X = train.drop('10d_pct_change',axis=1), 
       y = train['10d_pct_change'])
predictions = lgbm.predict(test.drop('10d_pct_change',axis=1))
mae_score = mean_absolute_error(test['10d_pct_change'], predictions)
rmse_score = np.sqrt(mean_squared_error(test['10d_pct_change'], predictions))
#log_model_performance("LGBM", "day_return,range,basket_size,high,high_flag, shadow, scalled with all TA features  except leaked  and month", mae_score, "n_estimators=75, num_leaves=32")
train_predictions = lgbm.predict(train.drop('10d_pct_change',axis=1))
mae_score_train = mean_absolute_error(train['10d_pct_change'], train_predictions)
print("Light GBM : MAE is {}. RMSE is {}. Train MAE is {}".format(mae_score, rmse_score, mae_score_train))

Light GBM : MAE is 0.03192171139234499. RMSE is 0.04441480382283476. Train MAE is 0.028700000660671098


In [70]:
result_df = df[df.Date >= split_date][['Symbol','Date','High','Low','Last','Close','Average','10d_pct_change']]
result_df.loc[:,'predicted_10d_return'] = predictions
result_df['10d_pct_change'] = 100 * result_df['10d_pct_change']
result_df['predicted_10d_return'] = 100 * result_df['predicted_10d_return']
result_df.loc[result_df['10d_pct_change']<=5,'result'] = 'Loss'
result_df.loc[result_df['10d_pct_change']>5,'result'] = 'Win'

In [71]:
result_df.Symbol.unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])

In [72]:
result_df.to_csv("result_export.csv", index=False)

In [73]:
test

Unnamed: 0,Symbol,Open,High,Low,Last,Close,Average,Volume,Turnover,Trades,...,10d_high_flag,upper_shadow,lower_shadow,high_div_low,Open_scalled,High_scalled,Low_scalled,Last_scalled,Close_scalled,Average_scalled
2792,0,706.00,711.85,700.00,705.00,703.10,704.35,8035542,5.659841e+09,111500.0,...,0,5.85,3.10,1.016929,0.783574,0.790067,0.776915,0.782464,0.780355,0.781743
2793,0,705.00,724.30,705.00,710.85,710.40,714.65,17676086,1.263213e+10,219997.0,...,0,13.90,0.00,1.027376,0.782464,0.803885,0.782464,0.788957,0.788457,0.793174
2794,0,717.00,721.00,703.00,709.50,710.05,711.02,11105134,7.895923e+09,135960.0,...,0,4.00,7.05,1.025605,0.795782,0.800222,0.780244,0.787458,0.788069,0.789145
2795,0,712.00,731.95,707.10,714.00,712.70,720.33,14580124,1.050251e+10,172719.0,...,0,19.25,4.90,1.035144,0.790233,0.812375,0.784795,0.792453,0.791010,0.799478
2796,0,716.90,723.00,706.00,722.50,720.10,715.66,8719991,6.240543e+09,113244.0,...,0,2.90,10.90,1.024079,0.795671,0.802442,0.783574,0.801887,0.799223,0.794295
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142081,49,705.25,710.00,700.05,703.80,703.70,705.04,3949208,2.784346e+09,82721.0,...,1,4.75,3.65,1.014213,0.953234,0.959654,0.946205,0.951274,0.951139,0.952950
142082,49,705.10,719.90,704.00,716.15,714.10,714.51,9043675,6.461773e+09,146624.0,...,1,5.80,1.10,1.022585,0.953031,0.973035,0.951544,0.967966,0.965196,0.965750
142083,49,715.80,718.80,709.10,715.30,715.35,713.70,4274779,3.050907e+09,94710.0,...,0,3.00,6.25,1.013679,0.967493,0.971548,0.958438,0.966818,0.966885,0.964655
142084,49,718.00,726.80,716.00,718.95,718.70,720.13,4640405,3.341672e+09,86315.0,...,1,8.10,2.00,1.015084,0.970467,0.982361,0.967764,0.971751,0.971413,0.973346


In [74]:
for i in range(2, 12):
    wins = result_df[(result_df['predicted_10d_return']>=i) & (result_df['result']=='Win')].shape[0]
    losses = result_df[(result_df['predicted_10d_return']>=i) & (result_df['result']=='Loss')].shape[0]
    print("For expected return {}, Winning ratio is {}".format(i, wins/(wins+losses)))
    print(wins + losses)

For expected return 2, Winning ratio is 0.39390625
6400
For expected return 3, Winning ratio is 0.39533777354900096
6306
For expected return 4, Winning ratio is 0.4109621265584568
4251
For expected return 5, Winning ratio is 0.4635691657866948
1894
For expected return 6, Winning ratio is 0.4946524064171123
748
For expected return 7, Winning ratio is 0.5706214689265536
354
For expected return 8, Winning ratio is 0.6464088397790055
181
For expected return 9, Winning ratio is 0.6722689075630253
119
For expected return 10, Winning ratio is 0.6774193548387096
62
For expected return 11, Winning ratio is 0.6428571428571429
28


In [76]:
result_df[result_df.predicted_10d_return>=5].head(20)

Unnamed: 0,Symbol,Date,High,Low,Last,Close,Average,10d_pct_change,predicted_10d_return,result
2792,0,2021-07-01,711.85,700.0,705.0,703.1,704.35,4.821505,6.394745,Loss
2793,0,2021-07-02,724.3,705.0,710.85,710.4,714.65,3.744369,5.070134,Loss
2794,0,2021-07-05,721.0,703.0,709.5,710.05,711.02,3.795507,5.353695,Loss
2795,0,2021-07-06,731.95,707.1,714.0,712.7,720.33,3.409569,5.475501,Loss
2796,0,2021-07-07,723.0,706.0,722.5,720.1,715.66,2.346896,5.070212,Loss
2797,0,2021-07-08,728.0,709.1,711.8,711.7,718.74,3.554869,5.806276,Loss
2798,0,2021-07-09,737.0,708.35,728.0,728.5,725.62,0.981469,5.988674,Loss
2799,0,2021-07-12,735.65,715.05,717.95,718.05,724.26,0.396908,6.365274,Loss
2800,0,2021-07-13,720.9,702.0,704.35,704.0,706.82,1.065341,6.292642,Loss
2801,0,2021-07-14,711.5,695.35,696.7,697.25,701.74,1.312298,6.094613,Loss


In [77]:
result_df[result_df['predicted_10d_return']>=8].Symbol.value_counts()

42    57
34    38
43    22
23    18
20    11
10     9
27     6
26     4
0      4
33     3
5      3
48     2
38     1
36     1
15     1
4      1
Name: Symbol, dtype: int64

In [78]:
fig = px.scatter(x=list(test['10d_pct_change']), y=predictions)
fig.update_traces(marker_size=3)
fig.show()

In [68]:
test.shape

(6400, 310)

In [141]:
np.sum(predictions>0.1)

23

In [70]:
lgbm.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 150,
 'n_jobs': -1,
 'num_leaves': 32,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': 'warn',
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}

In [79]:
df_importance = pd.DataFrame({'Column_Name' : train.drop('10d_pct_change',axis=1).columns,
             'Importance' : lgbm.feature_importances_ }  ).sort_values(by='Importance', ascending=False)          

In [98]:
df_importance[df_importance['Importance']==0]

Unnamed: 0,Column_Name,Importance
150,HLC3,0
78,CDL_HIKKAKE,0
160,INC_1,0
14,ABER_XG_5_15,0
58,CDL_ADVANCEBLOCK,0
...,...,...
85,CDL_KICKING,0
108,CDL_TRISTAR,0
86,CDL_KICKINGBYLENGTH,0
107,CDL_THRUSTING,0


In [81]:
fig = px.bar(df_importance[:10], x="Importance", y="Column_Name", orientation='h')
fig.show()

In [113]:
df_importance[df_importance['Column_Name'].str.contains('PVT')]

Unnamed: 0,Column_Name,Importance
208,PVT,69


In [116]:
fig = px.scatter(x=df[df.NATR_14>0]['10d_pct_change'], y=df[df.NATR_14>0]['NATR_14'])
fig.update_traces(marker_size=3)
fig.show()

In [120]:
df[df.NATR_14>10][['Symbol','Date','High','Low','Open','Close','NATR_14','10d_pct_change']].head(30)

Unnamed: 0,Symbol,Date,High,Low,Open,Close,NATR_14,10d_pct_change
2475,ADANIPORTS,2020-03-23,235.8,203.0,235.0,207.8,10.153742,0.282483
8313,AXISBANK,2020-03-19,464.95,423.75,441.05,428.25,10.342796,0.050088
8314,AXISBANK,2020-03-20,449.7,414.2,431.4,428.15,10.198517,-0.043676
8315,AXISBANK,2020-03-23,392.0,302.0,385.35,308.65,16.055966,0.385712
8316,AXISBANK,2020-03-24,337.5,291.0,331.95,303.15,16.275244,0.410853
8317,AXISBANK,2020-03-25,363.75,286.0,293.5,326.8,15.718421,0.309364
8318,AXISBANK,2020-03-26,399.0,332.0,335.95,341.4,15.482078,0.322935
8319,AXISBANK,2020-03-27,409.45,354.05,370.0,359.75,14.994055,0.255455
8320,AXISBANK,2020-03-30,382.0,340.35,345.0,368.15,14.413467,0.319842
8321,AXISBANK,2020-03-31,389.8,372.65,384.4,379.0,13.408807,0.282058


In [114]:
df[['Symbol','Date','High','Low','Open','Close','NATR_14','10d_pct_change']].tail(30)

Unnamed: 0,Symbol,Date,High,Low,Open,Close,NATR_14,10d_pct_change
142056,WIPRO,2021-11-24,647.6,633.1,644.3,636.5,2.586634,0.027416
142057,WIPRO,2021-11-25,641.8,633.0,637.0,636.7,2.499843,0.027093
142058,WIPRO,2021-11-26,634.4,619.65,632.0,621.45,2.574216,0.052297
142059,WIPRO,2021-11-29,632.85,603.95,616.55,630.6,2.683013,0.041151
142060,WIPRO,2021-11-30,645.5,630.5,630.6,637.25,2.633503,0.030286
142061,WIPRO,2021-12-01,645.95,633.25,642.6,634.8,2.597736,0.034263
142062,WIPRO,2021-12-02,648.1,629.0,636.9,646.8,2.578359,0.015074
142063,WIPRO,2021-12-03,653.95,639.9,644.75,640.75,2.573421,0.056262
142064,WIPRO,2021-12-06,644.2,623.15,637.25,624.5,2.692549,0.084868
142065,WIPRO,2021-12-07,634.55,623.0,628.0,632.4,2.599446,0.102309


In [86]:
df['Symbol'] = labelencoder.inverse_transform(df.Symbol) 

In [90]:
df[df['10d_pct_change']>=0.05]

Unnamed: 0,Date,Symbol,Open,High,Low,Last,Close,Average,Volume,Turnover,...,10d_high_flag,upper_shadow,lower_shadow,high_div_low,Open_scalled,High_scalled,Low_scalled,Last_scalled,Close_scalled,Average_scalled
3,2010-04-07,ADANIPORTS,149.86,149.86,147.62,148.42,148.49,148.552,405001,3.008189e+08,...,0,0.00,0.87,1.015174,-149.860000,-149.860000,-147.620000,-148.420000,-148.490000,-148.552000
4,2010-04-08,ADANIPORTS,147.28,150.68,147.00,148.40,148.30,148.700,563756,4.191514e+08,...,0,2.38,0.28,1.025034,-147.280000,-150.680000,-147.000000,-148.400000,-148.300000,-148.700000
5,2010-04-09,ADANIPORTS,149.00,150.20,148.15,148.55,148.54,148.776,344727,2.564351e+08,...,0,1.20,0.39,1.013837,-149.000000,-150.200000,-148.150000,-148.550000,-148.540000,-148.776000
6,2010-04-12,ADANIPORTS,149.00,149.98,147.08,149.90,149.48,149.014,288672,2.150795e+08,...,0,0.50,1.92,1.019717,-149.000000,-149.980000,-147.080000,-149.900000,-149.480000,-149.014000
8,2010-04-15,ADANIPORTS,152.32,153.78,149.30,150.00,149.76,151.320,516509,3.907895e+08,...,0,1.46,0.46,1.030007,-152.320000,-153.780000,-149.300000,-150.000000,-149.760000,-151.320000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142072,2021-12-16,WIPRO,644.00,649.90,639.40,647.50,644.30,644.540,6218620,4.008142e+09,...,0,5.60,4.60,1.016422,0.870447,0.878421,0.864229,0.875177,0.870852,0.871177
142073,2021-12-17,WIPRO,660.00,676.80,650.10,674.75,670.80,666.350,52300009,3.485035e+10,...,1,6.00,9.90,1.041071,0.892073,0.914780,0.878692,0.912009,0.906670,0.900656
142074,2021-12-20,WIPRO,667.55,677.50,663.00,665.70,665.95,669.490,13149101,8.803195e+09,...,1,9.95,2.95,1.021870,0.902277,0.915726,0.896128,0.899777,0.900115,0.904900
142075,2021-12-21,WIPRO,676.00,697.10,671.00,691.15,690.80,687.970,13681048,9.412156e+09,...,1,6.30,5.00,1.038897,0.913699,0.942218,0.906941,0.934176,0.933703,0.929878


In [93]:
df.groupby(df.Date.dt.month)['10d_pct_change'].mean()

Date
1     0.051514
2     0.050749
3     0.054264
4     0.054294
5     0.060064
6     0.048351
7     0.048794
8     0.053173
9     0.055612
10    0.055486
11    0.046709
12    0.043976
Name: 10d_pct_change, dtype: float64