In [None]:
# uncomment below lines to install the required packages
# !pip install pandas pandas-ta scikit-learn xgboost
# !pip install matplotlib seaborn mplfinance
# !pip install tensorflow   # for deep learning models

In [None]:
import mplfinance as mpf

import pandas as pd
import numpy as np
from keras import Input

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from xgboost import XGBRegressor

In [None]:
# Load the daily price history and index it by parsed trading date.
df = (
    pd.read_csv('data/AARTIIND.NS_20220608_20240607_1d.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)

# Optional candlestick preview of a small slice (kept disabled):
# vstop_plt = mpf.make_addplot(df.iloc[10:20, -1], type='scatter', marker='+', markersize=50)
# mpf.plot(df.iloc[10:20, :], type='candle', style='yahoo', volume=True, addplot=vstop_plt)

**Add a Target column with the next day's gain/loss as a fraction of today's close (e.g. 0.01 = +1%)**

In [None]:
# Next-day return relative to the current close: (close[t+1] - close[t]) / close[t].
close = df['Close']
next_close = close.shift(-1)
df['Target'] = (next_close - close) / close

# The last row has no next-day close, so its Target is NaN; dropna also removes
# any other row that contains a missing value.
df.dropna(inplace=True)

**Scale the data and split into train/test sets**

In [None]:
# Separate the features from the target before any preprocessing.
X = df.drop(columns=['Target'])
y = df['Target']

# Split first, then fit the scaler on the training rows only, so the min/max of
# the held-out rows never influence the scaled training features (no test leakage).
# test_size and random_state are unchanged from the previous version of this cell.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Whole feature matrix on the same scale, in original row order; later cells
# build the NN input sequences from it.
X_scaled = scaler.transform(X)

## Traditional ML models (LinearRegression, DecisionTreeRegressor, RandomForestRegressor, XGBRegressor)

**Define a hyper-parameter grid for each traditional model; GridSearchCV searches each grid and keeps the best configuration per model**

In [None]:
# Seed shared by every stochastic estimator so that repeated runs of the grid
# search select the same configurations and report the same RMSE values.
RANDOM_STATE = 42

# Hyper-parameter grids explored by GridSearchCV, one per model.
param_grid_lr = {
    'fit_intercept': [True, False],
}

param_grid_dt = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10]
}

param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

param_grid_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 0.8, 1.0]
}

# (display name, unfitted estimator, parameter grid) triples consumed by
# evaluate_all_models. LinearRegression has no random component, so it takes no seed.
model_and_params = [
    ('LinearRegression', LinearRegression(), param_grid_lr),
    ('DecisionTreeRegressor', DecisionTreeRegressor(random_state=RANDOM_STATE), param_grid_dt),
    ('RandomForestRegressor', RandomForestRegressor(random_state=RANDOM_STATE), param_grid_rf),
    ('XGBRegressor', XGBRegressor(random_state=RANDOM_STATE), param_grid_xgb),
]

In [None]:
def grid_search_cv(model, param_grid, X_train, y_train, n_jobs=-1, cv=5):
    """Run an exhaustive cross-validated grid search and return the fitted searcher.

    Args:
        model: Unfitted estimator to tune.
        param_grid: Mapping of parameter name to the list of values to try.
        X_train: Training features.
        y_train: Training targets.
        n_jobs: Parallel worker count handed to GridSearchCV (-1 by default).
        cv: Cross-validation setting handed to GridSearchCV (5 by default).

    Returns:
        The GridSearchCV object after fitting; candidates are scored with
        negative mean squared error.
    """
    search_options = {
        'estimator': model,
        'param_grid': param_grid,
        'n_jobs': n_jobs,
        'cv': cv,
        'scoring': 'neg_mean_squared_error',
    }
    # fit() returns the searcher itself, so it can be returned directly.
    return GridSearchCV(**search_options).fit(X_train, y_train)

def evaluate_model(model, X_test, y_test):
    """Return the root mean squared error of `model` on the given test data.

    Args:
        model: Fitted estimator exposing `predict`.
        X_test: Features to predict on.
        y_test: True target values for `X_test`.

    Returns:
        The square root of the mean squared error between `y_test` and the
        model's predictions.
    """
    y_pred = model.predict(X_test)
    return np.sqrt(mean_squared_error(y_test, y_pred))

def train_and_get_best_estimator(model, param_grid):
    """Tune `model` over `param_grid` and return the best estimator found.

    The search is fitted on the notebook-level `X_train` / `y_train`, so this
    uses whatever those names are bound to at call time.
    """
    fitted_search = grid_search_cv(model, param_grid, X_train, y_train)
    return fitted_search.best_estimator_

def evaluate_all_models():
    """Tune and score every entry of `model_and_params`.

    Returns:
        A list of (model name, test RMSE) tuples, in the order the models
        appear in `model_and_params`. Scoring uses the notebook-level
        `X_test` / `y_test`.
    """
    return [
        (name, evaluate_model(train_and_get_best_estimator(model, param_grid), X_test, y_test))
        for name, model, param_grid in model_and_params
    ]
    

In [None]:
# Grid-search every configured model and collect (model name, test RMSE) pairs.
results = evaluate_all_models()

In [None]:
# One line per model: its name followed by its test RMSE.
for model_name, model_rmse in results:
    print(model_name, model_rmse)

## NN models for stock predictions - LSTM and CNN

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, LSTM, Conv1D, Flatten, MaxPooling1D

# Scale the target to a single column. NOTE: this re-fits the same `scaler`
# object that was previously fitted on the feature matrix X, so from here on
# `scaler` describes the target only. The evaluate_nn calls further down pass
# `scaler` to invert target-shaped predictions, so they depend on this re-fit.
y_scaled = scaler.fit_transform(y.values.reshape(-1, 1))

def create_sequences(X, y, time_steps=10):
    """Build sliding windows over `X`, each paired with one value from `y`.

    For every start offset s in range(len(X) - time_steps), the window is
    X[s : s + time_steps] and its paired target is y[s + time_steps], i.e. the
    entry immediately after the window's last row.

    Args:
        X: Indexable sequence of feature rows.
        y: Indexable sequence of targets aligned with `X`.
        time_steps: Number of consecutive rows per window.

    Returns:
        Tuple (windows, targets) of numpy arrays built from the collected
        windows and targets.
    """
    starts = range(len(X) - time_steps)
    windows = [X[s:s + time_steps] for s in starts]
    targets = [y[s + time_steps] for s in starts]
    return np.array(windows), np.array(targets)

# Turn the scaled features/targets into overlapping 10-step windows.
X_seq, y_seq = create_sequences(X_scaled, y_scaled, 10)

# Consecutive windows share all but one time step, so a shuffled split would put
# near-duplicate windows in both train and test and make the test RMSE look
# better than it is. Split chronologically instead: the first 80% of windows
# train the model and the last 20% evaluate it.
# NOTE: this rebinds X_train / X_test / y_train / y_test, replacing the tabular
# split used by the traditional models above.
X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, shuffle=False)

def build_lstm_model(input_shape):
    """Create and compile a two-layer stacked LSTM regressor.

    Args:
        input_shape: Shape of one input sample, passed to the Input layer.

    Returns:
        A compiled Sequential model (MSE loss, Adam optimizer) with a single
        linear output unit.
    """
    layers = [
        Input(shape=input_shape),
        # First LSTM emits the full sequence so the second LSTM can consume it.
        LSTM(50, return_sequences=True),
        Dropout(0.2),
        LSTM(50, return_sequences=False),
        Dropout(0.2),
        Dense(25, activation='relu'),
        Dense(1),
    ]
    network = Sequential(layers)
    network.compile(loss='mse', optimizer='adam')
    return network

def build_cnn_model(input_shape):
    """Create and compile a 1-D convolutional regressor.

    Args:
        input_shape: Shape of one input sample, passed to the Input layer.

    Returns:
        A compiled Sequential model (MSE loss, Adam optimizer) with a single
        linear output unit.
    """
    model = Sequential()
    model.add(Input(shape=input_shape))
    # Two Conv1D + MaxPooling1D stages, then flatten for the dense head.
    model.add(Conv1D(filters=64, kernel_size=2, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Conv1D(filters=64, kernel_size=2, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dropout(0.2))
    model.add(Dense(25, activation='relu'))
    # Linear output, matching build_lstm_model: a ReLU on a regression output
    # has zero gradient whenever its pre-activation is negative, which can leave
    # the network predicting a constant 0.
    model.add(Dense(1))
    model.compile(loss='mse', optimizer='adam')
    return model

# Per-sample input shape, read from axes 1 and 2 of the training array.
n_steps, n_features = X_train.shape[1], X_train.shape[2]
input_shape = (n_steps, n_features)

lstm_model = build_lstm_model(input_shape)
cnn_model = build_cnn_model(input_shape)

# Optional diagnostics (kept disabled):
# print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
# lstm_model.summary()
# cnn_model.summary()

In [None]:
# Both networks are trained with identical settings; 20% of the training data
# is held out by Keras for validation.
fit_options = dict(epochs=50, batch_size=32, validation_split=0.2, verbose=1)
lstm_history = lstm_model.fit(X_train, y_train, **fit_options)
cnn_history = cnn_model.fit(X_train, y_train, **fit_options)

In [None]:
def evaluate_nn(model, X_train, X_test, y_train, y_test, scaler, label=None):
    """Report train/test RMSE of a fitted network in the target's original units.

    Predictions and true values are both passed through
    `scaler.inverse_transform` before the error is computed, so `scaler` must be
    the scaler that was fitted on the target.

    Args:
        model: Fitted model exposing `predict`.
        X_train: Training inputs.
        X_test: Test inputs.
        y_train: Scaled training targets (2-D, as accepted by the scaler).
        y_test: Scaled test targets (2-D, as accepted by the scaler).
        scaler: Fitted scaler used to undo the target scaling.
        label: Optional name shown in the printed report. Defaults to the
            model's `name` attribute, or its class name if it has none.

    Returns:
        Tuple (train_rmse, test_rmse).
    """
    def _rmse_in_original_units(features, scaled_targets):
        # Undo the scaling on both sides so the error is in target units.
        predicted = scaler.inverse_transform(model.predict(features))
        actual = scaler.inverse_transform(scaled_targets)
        return np.sqrt(mean_squared_error(actual, predicted))

    if label is None:
        # `model.__class__` is the same for every Sequential model, which made
        # the two reports indistinguishable; prefer the model's own name.
        label = getattr(model, 'name', model.__class__.__name__)

    train_rmse = _rmse_in_original_units(X_train, y_train)
    test_rmse = _rmse_in_original_units(X_test, y_test)

    print(f'{label} Training RMSE: {train_rmse}')
    print(f'{label} Testing RMSE: {test_rmse}')
    return train_rmse, test_rmse

In [None]:
# Report train/test RMSE for each trained network, LSTM first and then CNN.
for nn_model in (lstm_model, cnn_model):
    evaluate_nn(nn_model, X_train, X_test, y_train, y_test, scaler)

In [86]:
# Load a previously saved results file (one row per symbol, one column per
# model; presumably RMSE values -- confirm against the script that wrote it).
results_csv = 'output/20240609031919_results.csv'
res_df = pd.read_csv(results_csv)
res_df

Unnamed: 0,Symbol,LinearRegression,DecisionTreeRegressor,RandomForestRegressor,XGBRegressor
0,ABB.NS,0.024457,0.025257,0.023689,0.02413
1,ADANIENT.NS,0.027825,0.028469,0.026114,0.025438
2,ACC.NS,0.01802,0.018701,0.016998,0.016257
3,AARTIIND.NS,0.021668,0.021703,0.021667,0.022929
4,ABBOTINDIA.NS,0.011072,0.012505,0.010711,0.010824


In [87]:
# Descriptive statistics per model column, including the 75th and 95th percentiles.
quantile_levels = [0.75, 0.95]
summary = res_df.describe(percentiles=quantile_levels)
print(summary)

       LinearRegression  DecisionTreeRegressor  RandomForestRegressor  \
count          5.000000               5.000000               5.000000   
mean           0.020608               0.021327               0.019836   
std            0.006435               0.006151               0.006102   
min            0.011072               0.012505               0.010711   
50%            0.021668               0.021703               0.021667   
75%            0.024457               0.025257               0.023689   
95%            0.027151               0.027826               0.025629   
max            0.027825               0.028469               0.026114   

       XGBRegressor  
count      5.000000  
mean       0.019916  
std        0.006192  
min        0.010824  
50%        0.022929  
75%        0.024130  
95%        0.025176  
max        0.025438  


In [88]:
# Name of the model column whose 95th-percentile value in `summary` is the smallest.
summary.loc['95%'].idxmin()

'XGBRegressor'