In [206]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
import plotly.express as px

In [207]:
RANDOM_SEED = 9

In [208]:
df = pd.read_csv('data/n50_data.csv')

In [209]:
df['Date'] = pd.to_datetime(df['Date'])
# Adjusted data available after 2010 only 
df = df[df['Date'] >= '2010-04-01']
df = df.dropna()

It is important to set benchmark. Here we will assume stock price will remain same after 10 days. ML algorithm will try to beat that benchmark.

In [210]:
np.mean(abs(df['10d_pct_change']))

0.03989504973458912

MAE of price to remain unchanged is 0.0399. Our aim is to reduce this error.

In [211]:
df['10d_pct_change'].describe()

count    142230.000000
mean          0.036709
std           0.043518
min          -0.189183
25%           0.007231
50%           0.027072
75%           0.055771
max           0.579328
Name: 10d_pct_change, dtype: float64

In [212]:
df.shape

(142230, 14)

In [213]:
def log_model_performance(model_type, description, score, parameters):
    df_log = pd.read_csv('model_performance.csv')
    result_dict = {}
    result_dict['model_type'] = model_type
    result_dict['description'] = description
    result_dict['score'] = score
    result_dict['parameters'] = parameters
    df_log = df_log.append(result_dict, ignore_index=True)
    df_log.to_csv('model_performance.csv',index=False)

In [214]:
def feature_engineering(df):
    # Processing symbol as labels.
    labelencoder = LabelEncoder()
    df['Symbol'] = labelencoder.fit_transform(df['Symbol'])
    # Splitting Dates
    df['month'] = df['Date'].dt.month
    df['day'] = df['Date'].dt.day
    df['week_day'] = df['Date'].dt.weekday
    # Price based features
    df['day_return'] = (df['Close']/df['Open'])
    df['close_open_dff'] = df['Close'] - df['Open']
    # Trade based features
    df['basket_size'] = df['Quantity'] / df['Trades']
    # All high and high flags
    df['248d_high'] = df.groupby('Symbol')['High'].rolling(window=248).max().droplevel(0)
    df['248d_high'] = df['248d_high'].fillna(-1)
    df['248d_high_flag'] = (df['248d_high'] == df['High']).astype('int')
    df['22d_high'] = df.groupby('Symbol')['High'].rolling(window=22).max().droplevel(0)
    df['22d_high'] = df['22d_high'].fillna(-1)
    df['22d_high_flag'] = (df['22d_high'] == df['High']).astype('int')
    df['10d_high'] = df.groupby('Symbol')['High'].rolling(window=10).max().droplevel(0)
    df['10d_high'] = df['10d_high'].fillna(-1)
    df['10d_high_flag'] = (df['10d_high'] == df['High']).astype('int')
    # shadow
    df['upper_shadow'] = df['High'] - np.maximum(df['Close'], df['Open'])
    df['lower_shadow'] = np.minimum(df['Close'], df['Open']) - df['Low']
    df["high_div_low"] = df["High"] / df["Low"]
    # scale all price columns
    price_columns = ['Open','High','Low','Last','Close','Average']
    for c in price_columns:
        df[c + '_scalled'] = df[c] / df['248d_high']
    return df

In [215]:
df = feature_engineering(df)

In [217]:
df.tail()

Unnamed: 0,Date,Symbol,Open,High,Low,Last,Close,Average,Quantity,Turnover,...,10d_high_flag,upper_shadow,lower_shadow,high_div_low,Open_scalled,High_scalled,Low_scalled,Last_scalled,Close_scalled,Average_scalled
240277,2021-12-29,49,705.25,710.0,700.05,703.8,703.7,705.04,3949208,2784346000.0,...,1,4.75,3.65,1.014213,0.953234,0.959654,0.946205,0.951274,0.951139,0.95295
240278,2021-12-30,49,705.1,719.9,704.0,716.15,714.1,714.51,9043675,6461773000.0,...,1,5.8,1.1,1.022585,0.953031,0.973035,0.951544,0.967966,0.965196,0.96575
240279,2021-12-31,49,715.8,718.8,709.1,715.3,715.35,713.7,4274779,3050907000.0,...,0,3.0,6.25,1.013679,0.967493,0.971548,0.958438,0.966818,0.966885,0.964655
240280,2022-01-03,49,718.0,726.8,716.0,718.95,718.7,720.13,4640405,3341672000.0,...,1,8.1,2.0,1.015084,0.970467,0.982361,0.967764,0.971751,0.971413,0.973346
240281,2022-01-04,49,722.75,723.1,711.55,722.0,721.5,718.18,5114400,3673043000.0,...,0,0.35,9.95,1.016232,0.976887,0.97736,0.961749,0.975873,0.975198,0.97071


In [218]:
split_date = '2021-07-01'
train = df[df.Date < split_date].drop('Date',axis=1)
test = df[df.Date >= split_date].drop('Date',axis=1)

In [219]:
train.columns

Index(['Symbol', 'Open', 'High', 'Low', 'Last', 'Close', 'Average', 'Quantity',
       'Turnover', 'Trades', 'Deliverable_quatity', 'Deliverable_quatity_pct',
       '10d_pct_change', 'month', 'day', 'week_day', 'day_return',
       'close_open_dff', 'basket_size', '248d_high', '248d_high_flag',
       '22d_high', '22d_high_flag', '10d_high', '10d_high_flag',
       'upper_shadow', 'lower_shadow', 'high_div_low', 'Open_scalled',
       'High_scalled', 'Low_scalled', 'Last_scalled', 'Close_scalled',
       'Average_scalled'],
      dtype='object')

In [220]:
lr = LinearRegression()
lr.fit(X = train.drop('10d_pct_change',axis=1), 
       y = train['10d_pct_change'])
predictions = lr.predict(test.drop('10d_pct_change',axis=1))
mae_score = mean_absolute_error(test['10d_pct_change'], predictions)
rmse_score = np.sqrt(mean_squared_error(test['10d_pct_change'], predictions))
log_model_performance("Linear Model", "day_return,range,basket_size,high,high_flag, shadow, scalled", mae_score, "None")
print("Linear Regression : MAE is {}. RMSE is {}".format(mae_score, rmse_score))

Linear Regression : MAE is 0.03049639060677217. RMSE is 0.042088269206067494


In [221]:
knn = KNeighborsRegressor(n_neighbors=50)
knn.fit(X = train.drop('10d_pct_change',axis=1), 
       y = train['10d_pct_change'])
predictions = knn.predict(test.drop('10d_pct_change',axis=1))
mae_score = mean_absolute_error(test['10d_pct_change'], predictions)
rmse_score = np.sqrt(mean_squared_error(test['10d_pct_change'], predictions))
log_model_performance("KNN Model", "day_return,range,basket_size,high,high_flag, shadow, scalled", mae_score, "n_neighbors=50")
print("KNN : MAE is {}. RMSE is {}".format(mae_score, rmse_score))

KNN : MAE is 0.03152310231312883. RMSE is 0.04276012521208407


In [163]:
"""
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(train.drop('10d_pct_change',axis=1))
y = sc_y.fit_transform(np.array(train['10d_pct_change']).reshape(-1, 1))
regressor = SVR(kernel = 'rbf')
regressor.fit(X, y)
predictions = regressor.predict(6.5)
predictions = sc_y.inverse_transform(predictions) 
mae_score = mean_absolute_error(test['10d_pct_change'], predictions)
rmse_score = np.sqrt(mean_squared_error(test['10d_pct_change'], predictions))
print("SVM : MAE is {}. RMSE is {}".format(mae_score, rmse_score))
"""

'\nsc_X = StandardScaler()\nsc_y = StandardScaler()\nX = sc_X.fit_transform(train.drop(\'10d_pct_change\',axis=1))\ny = sc_y.fit_transform(np.array(train[\'10d_pct_change\']).reshape(-1, 1))\nregressor = SVR(kernel = \'rbf\')\nregressor.fit(X, y)\npredictions = regressor.predict(6.5)\npredictions = sc_y.inverse_transform(predictions) \nmae_score = mean_absolute_error(test[\'10d_pct_change\'], predictions)\nrmse_score = np.sqrt(mean_squared_error(test[\'10d_pct_change\'], predictions))\nprint("SVM : MAE is {}. RMSE is {}".format(mae_score, rmse_score))\n'

In [222]:
dt = DecisionTreeRegressor(random_state=RANDOM_SEED)
dt.fit(X = train.drop('10d_pct_change',axis=1), 
       y = train['10d_pct_change'])
predictions = dt.predict(test.drop('10d_pct_change',axis=1))
mae_score = mean_absolute_error(test['10d_pct_change'], predictions)
rmse_score = np.sqrt(mean_squared_error(test['10d_pct_change'], predictions))
log_model_performance("Decision Tree", "day_return,range,basket_size,high,high_flag, shadow, scalled", mae_score, "random_state=RANDOM_SEED")
print("Decision Tree : MAE is {}. RMSE is {}".format(mae_score, rmse_score))

Decision Tree : MAE is 0.043804597967616286. RMSE is 0.059764539754261235


In [223]:
rf = RandomForestRegressor(random_state=RANDOM_SEED)
rf.fit(X = train.drop('10d_pct_change',axis=1), 
       y = train['10d_pct_change'])
predictions = rf.predict(test.drop('10d_pct_change',axis=1))
mae_score = mean_absolute_error(test['10d_pct_change'], predictions)
rmse_score = np.sqrt(mean_squared_error(test['10d_pct_change'], predictions))
log_model_performance("Random Forest", "day_return,range,basket_size,high,high_flag, shadow, scalled", mae_score, "random_state=RANDOM_SEED")
print("Random Forest : MAE is {}. RMSE is {}".format(mae_score, rmse_score))

Random Forest : MAE is 0.030740868461823504. RMSE is 0.04256027104377072


In [236]:
lgbm = LGBMRegressor(n_estimators=100, num_leaves=32)
lgbm.fit(X = train.drop('10d_pct_change',axis=1), 
       y = train['10d_pct_change'])
predictions = lgbm.predict(test.drop('10d_pct_change',axis=1))
mae_score = mean_absolute_error(test['10d_pct_change'], predictions)
rmse_score = np.sqrt(mean_squared_error(test['10d_pct_change'], predictions))
log_model_performance("LGBM", "day_return,range,basket_size,high,high_flag, shadow, scalled", mae_score, "n_estimators=75, num_leaves=32")
train_predictions = lgbm.predict(train.drop('10d_pct_change',axis=1))
mae_score_train = mean_absolute_error(train['10d_pct_change'], train_predictions)
print("Light GBM : MAE is {}. RMSE is {}. Train MAE is {}".format(mae_score, rmse_score, mae_score_train))

Light GBM : MAE is 0.0299334910869315. RMSE is 0.04203482614202661. Train MAE is 0.028755900651683108


In [251]:
fig = px.scatter(x=list(test['10d_pct_change']), y=predictions)
fig.update_traces(marker_size=3)
fig.show()

In [258]:
test.shape

(6400, 34)

In [260]:
np.sum(predictions>0.07)

12

In [239]:
lgbm.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 32,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': 'warn',
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}

In [240]:
df_importance = pd.DataFrame({'Column_Name' : train.drop('10d_pct_change',axis=1).columns,
             'Importance' : lgbm.feature_importances_ }  ).sort_values(by='Importance', ascending=False)          

In [241]:
df_importance

Unnamed: 0,Column_Name,Importance
12,month,459
0,Symbol,237
18,248d_high,233
13,day,218
20,22d_high,210
17,basket_size,163
11,Deliverable_quatity_pct,143
8,Turnover,121
22,10d_high,104
26,high_div_low,100


In [242]:
fig = px.bar(df_importance[:10], x="Importance", y="Column_Name", orientation='h')
fig.show()