# Connecting the Google Colab Notebook to my Drive

In [None]:
# Attach Google Drive to the Colab runtime; later cells read and write CSV
# files underneath this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Getting Historical Stock Market Data

In [None]:
import yfinance as yf
import pandas as pd

# list of Indian company stock symbols (NSE/BSE)
companies = [
    'RELIANCE.NS', 'TCS.NS', 'HDFCBANK.NS', 'INFY.NS', 'HINDUNILVR.NS',
    'HDFC.NS', 'KOTAKBANK.NS', 'ICICIBANK.NS', 'SBIN.NS', 'BAJFINANCE.NS',
    'BHARTIARTL.NS', 'ITC.NS', 'ASIANPAINT.NS', 'HCLTECH.NS', 'MARUTI.NS',
    'LT.NS', 'WIPRO.NS', 'AXISBANK.NS', 'ULTRACEMCO.NS', 'SUNPHARMA.NS',
    'TITAN.NS', 'ADANIPORTS.NS', 'NESTLEIND.NS', 'ONGC.NS', 'BAJAJFINSV.NS',
    'TECHM.NS', 'HDFCLIFE.NS', 'NTPC.NS', 'JSWSTEEL.NS', 'POWERGRID.NS',
    'DIVISLAB.NS', 'SBILIFE.NS', 'GRASIM.NS', 'DRREDDY.NS', 'BRITANNIA.NS',
    'TATAMOTORS.NS', 'HINDALCO.NS', 'COALINDIA.NS', 'TATASTEEL.NS', 'IOC.NS',
    'BAJAJ-AUTO.NS', 'HEROMOTOCO.NS', 'EICHERMOT.NS', 'INDUSINDBK.NS', 'BPCL.NS',
    'BHARATIARTL.NS', 'UPL.NS', 'CIPLA.NS', 'SHREECEM.NS', 'ADANIGREEN.NS',
    'ADANITRANS.NS', 'M&M.NS', 'TATACONSUM.NS', 'BAJAJHLDNG.NS', 'DABUR.NS',
    'GAIL.NS', 'HDFCAMC.NS', 'HINDPETRO.NS', 'NAUKRI.NS', 'BERGEPAINT.NS',
    'PIDILITIND.NS', 'SIEMENS.NS', 'DLF.NS', 'BANDHANBNK.NS', 'MUTHOOTFIN.NS',
    'ICICIGI.NS', 'SBICARD.NS', 'LUPIN.NS', 'HAVELLS.NS', 'COLPAL.NS',
    'AMBUJACEM.NS', 'PGHH.NS', 'GODREJCP.NS', 'PEL.NS', 'MRF.NS',
    'BIOCON.NS', 'MARICO.NS', 'INDIGO.NS', 'NMDC.NS', 'BEL.NS',
    'APOLLOHOSP.NS', 'JUBLFOOD.NS', 'BOSCHLTD.NS', 'ICICIPRULI.NS', 'GLAND.NS',
    'LTI.NS', 'MPHASIS.NS', 'VEDL.NS', 'AUBANK.NS', 'TORNTPHARM.NS',
    'ACC.NS', 'TATAPOWER.NS', 'BANKBARODA.NS', 'ATGL.NS', 'MINDTREE.NS',
    'PIDILITIND.NS', 'PERSISTENT.NS', 'IGL.NS', 'HAL.NS', 'ICICIBANK.NS'
]

# The list above repeats 'PIDILITIND.NS' and 'ICICIBANK.NS'. Downloading a
# symbol twice would duplicate every one of its rows, so keep only the first
# occurrence of each symbol (dict.fromkeys preserves the original order).
# NOTE(review): 'BHARATIARTL.NS' looks like a misspelling of 'BHARTIARTL.NS'
# (already in the list) - confirm; if it returns no data it is skipped below.
companies = list(dict.fromkeys(companies))

# defining the start and end dates for the historical data
start_date = '2020-01-01'
end_date = '2023-01-01'

# collecting one DataFrame per symbol; they are concatenated once after the
# loop because DataFrame.append was removed in pandas 2.0 and re-copies the
# whole frame on every iteration
symbol_frames = []

for symbol in companies:
    # fetching historical data from Yahoo Finance
    data = yf.download(symbol, start=start_date, end=end_date)

    # a symbol with no rows (unknown / delisted ticker) contributes nothing
    if data.empty:
        print(f'No data returned for {symbol}; skipping')
        continue

    # adding a column for the company symbol
    data['Symbol'] = symbol

    symbol_frames.append(data)

if not symbol_frames:
    raise RuntimeError('No historical data could be downloaded for any symbol')

# stacking all per-symbol frames into the main DataFrame
all_data = pd.concat(symbol_frames)

# resetting the index (turns the date index into a regular column)
all_data.reset_index(inplace=True)

# displaying the first few rows of the DataFrame
print(all_data.head())




In [None]:
# saving the data to my drive
all_data.to_csv('/content/drive/MyDrive/Fall 2023/Applied Data Science/datasets/indian_stock_market_data.csv', index=False)

# inspecting the first few rows of the data; this is kept as the last
# expression of the cell because a notebook only renders the value of the
# final expression (a bare head() in the middle of a cell is discarded)
all_data.head()


# Processing the data - basic data cleaning

In [None]:
# importing necessary libraries
from sklearn.preprocessing import MinMaxScaler

# loading the dataset that contains historical data
file_path = '/content/drive/MyDrive/Fall 2023/Applied Data Science/datasets/indian_stock_market_data.csv'
data = pd.read_csv(file_path)

# basic data cleaning
# converting 'Date' to datetime format
data['Date'] = pd.to_datetime(data['Date'])

# sorting data by 'Symbol' first and then by 'Date' within each symbol
data.sort_values(by=['Symbol', 'Date'], inplace=True)

# resetting index after sorting
data.reset_index(drop=True, inplace=True)

# handling missing values
# checking for missing values and reporting them before they are dropped
missing_values = data.isnull().sum()
print("Missing values per column:")
print(missing_values)

# dropping rows with missing values
data.dropna(inplace=True)

# handling outliers
# for financial data, it's tricky to define outliers without specific domain knowledge. I am not an expert in finance
# checking how far data points are from mean to detect outliers
# NOTE(review): mean/std are computed over all symbols together (and on the
# already-filtered frame for each subsequent column), so stocks whose price
# level is far from the others may be removed entirely - confirm this is intended.
for column in ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']:
    mean = data[column].mean()
    std = data[column].std()
    data = data[(data[column] > (mean - 3 * std)) & (data[column] < (mean + 3 * std))]

# the boolean filtering above leaves `data` as a slice of the earlier frame;
# take an explicit copy so the column assignment below is unambiguous and
# does not raise SettingWithCopyWarning
data = data.copy()

# normalize the numerical columns
# identifying numerical columns (excluding 'Date' and 'Symbol')
numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns

# creating a MinMaxScaler object
scaler = MinMaxScaler()

# applying normalization to numerical columns
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# saving the preprocessed data back to a CSV file; this is written to the
# same Drive folder the feature-engineering cell loads it from (it used to be
# written to '/content/', which that cell never reads)
preprocessed_path = '/content/drive/MyDrive/Fall 2023/Applied Data Science/datasets/preprocessed_indian_stock_market_data.csv'
data.to_csv(preprocessed_path, index=False)

# displaying the transformed data (last expression so the notebook renders it)
print("First few rows of the transformed data:")
data.head(10)


First few rows of the transformed data:


# Feature Engineering

In [None]:
import pandas as pd

# loading the cleaned dataset
file_path = '/content/drive/MyDrive/Fall 2023/Applied Data Science/datasets/preprocessed_indian_stock_market_data.csv'  # Adjust the file path if necessary
preprocessed_data = pd.read_csv(file_path)

# The file stacks every company's history in one table. All windowed features
# below are therefore computed per 'Symbol' so that a window never mixes the
# last rows of one company with the first rows of the next one. Sorting here
# guarantees chronological order inside each symbol even if a different file
# is loaded (ISO-formatted date strings sort chronologically).
preprocessed_data = preprocessed_data.sort_values(by=['Symbol', 'Date'])
symbol_key = preprocessed_data['Symbol']

# calculating additional features that are commonly used in stock price analysis, based on  limited research given time constraint

# 1. daily return: (Close - Open) / Open
# NOTE(review): the prices loaded here appear to be min-max scaled already, so
# 'Open' can be 0 and this ratio can be +/-inf; the training cell filters
# those rows out - confirm that is the intended handling.
preprocessed_data['Daily_Return'] = (preprocessed_data['Close'] - preprocessed_data['Open']) / preprocessed_data['Open']

# 2. daily high-low range: High - Low
preprocessed_data['High_Low_Range'] = preprocessed_data['High'] - preprocessed_data['Low']

# 3. 7-day moving average of close price (per symbol)
preprocessed_data['7Day_MA_Close'] = preprocessed_data.groupby('Symbol')['Close'].transform(
    lambda close: close.rolling(window=7).mean()
)

# 4. 14-day moving average of close price (per symbol)
preprocessed_data['14Day_MA_Close'] = preprocessed_data.groupby('Symbol')['Close'].transform(
    lambda close: close.rolling(window=14).mean()
)

# 5. 7-day moving average of volume (per symbol)
preprocessed_data['7Day_MA_Volume'] = preprocessed_data.groupby('Symbol')['Volume'].transform(
    lambda volume: volume.rolling(window=7).mean()
)

# 6. relative strength index (RSI) - a momentum indicator (14-day period)
# computing daily price change within each symbol; the first row of every
# symbol has no previous close and stays NaN
delta = preprocessed_data.groupby('Symbol')['Close'].diff(1)

# separating the positive and negative gains
up = delta.clip(lower=0)
down = -1 * delta.clip(upper=0)

# calculating the average gain and loss over 14 rows of the same symbol
roll_up = up.groupby(symbol_key).transform(lambda gains: gains.rolling(window=14).mean())
roll_down = down.groupby(symbol_key).transform(lambda losses: losses.rolling(window=14).mean())

# calculating the Relative Strength (RS)
RS = roll_up / roll_down

# calculating the RSI
preprocessed_data['RSI'] = 100.0 - (100.0 / (1.0 + RS))

# dropping NaN values generated by the rolling windows (the warm-up rows at
# the start of every symbol)
preprocessed_data.dropna(inplace=True)

# checking the new features
print("First few rows of the transformed data with new features:")
print(preprocessed_data.head())

# statistical analysis: correlation with close price
# only numeric columns are correlated; calling corr() on a frame that still
# contains the 'Date' and 'Symbol' strings warns on older pandas and raises
# on pandas 2.x
correlation = preprocessed_data.select_dtypes(include='number').corr()['Close']
print("Correlation of features with the Close price:")
print(correlation)

# saving the data with new features to a CSV file
preprocessed_data.to_csv('/content/drive/MyDrive/Fall 2023/Applied Data Science/datasets/feature_engineered_stock_data.csv', index=False)


First few rows of the transformed data with new features:
          Date      Open      High       Low     Close  Adj Close    Volume  \
14  2020-01-21  0.310253  0.303661  0.328830  0.329535   0.316067  0.004697   
15  2020-01-22  0.312986  0.309289  0.332211  0.335850   0.322124  0.021819   
16  2020-01-23  0.316834  0.312940  0.337340  0.341195   0.327250  0.014565   
17  2020-01-24  0.322195  0.316172  0.341191  0.345112   0.331007  0.010302   
18  2020-01-27  0.322878  0.317977  0.341628  0.341307   0.327357  0.007042   

    Symbol  Daily_Return  High_Low_Range  7Day_MA_Close  14Day_MA_Close  \
14  ACC.NS      0.062147       -0.025169       0.332528        0.328611   
15  ACC.NS      0.073050       -0.022922       0.332472        0.329110   
16  ACC.NS      0.076889       -0.024400       0.333571        0.330259   
17  ACC.NS      0.071126       -0.025019       0.335170        0.332337   
18  ACC.NS      0.057075       -0.023651       0.336574        0.333817   

    7Day_MA_Volu

  correlation = preprocessed_data.corr()['Close']


# Selecting and Training Machine Learning Models

Training with a particular set of features

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import numpy as np

# Load the dataset
file_path = '/content/drive/MyDrive/Fall 2023/Applied Data Science/datasets/feature_engineered_stock_data.csv'
data = pd.read_csv(file_path)

# Selecting relevant features based on correlation
selected_features = ['Volume', 'Daily_Return', 'High_Low_Range', '7Day_MA_Close', '14Day_MA_Close', '7Day_MA_Volume', 'RSI']

# Preparing the data
# Infinite values are turned into NaN and every row with a NaN feature is
# dropped. This is done with a non-inplace chain so that X is a new frame:
# in-place edits on `data[selected_features]` modify a slice of `data` and
# trigger SettingWithCopyWarning.
X = data[selected_features].replace([np.inf, -np.inf], np.nan).dropna()

# Keep only the targets whose feature rows survived the filtering above
y = data.loc[X.index, 'Close']

# Split the data
# NOTE(review): this is a random split of time-ordered rows, and the moving
# averages are derived from 'Close' itself, so the test score here is likely
# optimistic - the out-of-sample cells further down are the stricter check.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear Regression Model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)
lr_mse = mean_squared_error(y_test, lr_predictions)
print('Linear Regression MSE:', lr_mse)

# Random Forest Model
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_predictions)
print('Random Forest Regression MSE:', rf_mse)

# Hyperparameter Tuning for Random Forest (Optional)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, 30]
}
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
print("Best parameters found: ", grid_search.best_params_)
# no `scoring` is passed, so best_score_ is the estimator's default score
# (R-squared for regressors), not an MSE like the other numbers in this cell
print("Best score found: ", grid_search.best_score_)
best_rf_model = grid_search.best_estimator_
best_rf_predictions = best_rf_model.predict(X_test)
best_rf_mse = mean_squared_error(y_test, best_rf_predictions)
print('Best Random Forest MSE:', best_rf_mse)


# Evaluating the models

In [None]:
# Metrics used to compare the three fitted models on the held-out test split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
import numpy as np


def _regression_scores(actual, predicted):
    """Return (RMSE, MAE, R-squared) for one set of predictions."""
    return (
        np.sqrt(mean_squared_error(actual, predicted)),
        mean_absolute_error(actual, predicted),
        r2_score(actual, predicted),
    )


# Linear Regression model
lr_rmse, lr_mae, lr_r2 = _regression_scores(y_test, lr_predictions)

# Random Forest model with default hyperparameters
rf_rmse, rf_mae, rf_r2 = _regression_scores(y_test, rf_predictions)

# Random Forest model selected by the grid search
best_rf_rmse, best_rf_mae, best_rf_r2 = _regression_scores(y_test, best_rf_predictions)

# Report every metric, one per line, grouped by model
_evaluation_report = [
    ('Linear Regression RMSE:', lr_rmse),
    ('Linear Regression MAE:', lr_mae),
    ('Linear Regression R-squared:', lr_r2),
    ('Random Forest RMSE:', rf_rmse),
    ('Random Forest MAE:', rf_mae),
    ('Random Forest R-squared:', rf_r2),
    ('Best Random Forest RMSE:', best_rf_rmse),
    ('Best Random Forest MAE:', best_rf_mae),
    ('Best Random Forest R-squared:', best_rf_r2),
]
for _caption, _score in _evaluation_report:
    print(_caption, _score)


Linear Regression RMSE: 0.020116351893519513
Linear Regression MAE: 0.006641079325823219
Linear Regression R-squared: 0.9923921597125268
Random Forest RMSE: 0.01727997728825834
Random Forest MAE: 0.004688437758800487
Random Forest R-squared: 0.9943862990514614
Best Random Forest RMSE: 0.01732804197415406
Best Random Forest MAE: 0.004676147067757274
Best Random Forest R-squared: 0.9943550263217712


# Getting Out-of-Sample Data to Validate Model

In [None]:
# installing and importing necessary packages
import yfinance as yf
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# fetching new out-of-sample data

companies = [
    'RELIANCE.NS', 'TCS.NS', 'HDFCBANK.NS', 'INFY.NS', 'HINDUNILVR.NS',
    'HDFC.NS', 'KOTAKBANK.NS', 'ICICIBANK.NS', 'SBIN.NS', 'BAJFINANCE.NS',
    'BHARTIARTL.NS', 'ITC.NS', 'ASIANPAINT.NS', 'HCLTECH.NS', 'MARUTI.NS',
    'LT.NS', 'WIPRO.NS', 'AXISBANK.NS', 'ULTRACEMCO.NS', 'SUNPHARMA.NS',
    'TITAN.NS', 'ADANIPORTS.NS', 'NESTLEIND.NS', 'ONGC.NS', 'BAJAJFINSV.NS',
    'TECHM.NS', 'HDFCLIFE.NS', 'NTPC.NS', 'JSWSTEEL.NS', 'POWERGRID.NS',
    'DIVISLAB.NS', 'SBILIFE.NS', 'GRASIM.NS', 'DRREDDY.NS', 'BRITANNIA.NS',
    'TATAMOTORS.NS', 'HINDALCO.NS', 'COALINDIA.NS', 'TATASTEEL.NS', 'IOC.NS',
    'BAJAJ-AUTO.NS', 'HEROMOTOCO.NS', 'EICHERMOT.NS', 'INDUSINDBK.NS', 'BPCL.NS',
    'BHARATIARTL.NS', 'UPL.NS', 'CIPLA.NS', 'SHREECEM.NS', 'ADANIGREEN.NS',
    'ADANITRANS.NS', 'M&M.NS', 'TATACONSUM.NS', 'BAJAJHLDNG.NS', 'DABUR.NS',
    'GAIL.NS', 'HDFCAMC.NS', 'HINDPETRO.NS', 'NAUKRI.NS', 'BERGEPAINT.NS',
    'PIDILITIND.NS', 'SIEMENS.NS', 'DLF.NS', 'BANDHANBNK.NS', 'MUTHOOTFIN.NS',
    'ICICIGI.NS', 'SBICARD.NS', 'LUPIN.NS', 'HAVELLS.NS', 'COLPAL.NS',
    'AMBUJACEM.NS', 'PGHH.NS', 'GODREJCP.NS', 'PEL.NS', 'MRF.NS',
    'BIOCON.NS', 'MARICO.NS', 'INDIGO.NS', 'NMDC.NS', 'BEL.NS',
    'APOLLOHOSP.NS', 'JUBLFOOD.NS', 'BOSCHLTD.NS', 'ICICIPRULI.NS', 'GLAND.NS',
    'LTI.NS', 'MPHASIS.NS', 'VEDL.NS', 'AUBANK.NS', 'TORNTPHARM.NS',
    'ACC.NS', 'TATAPOWER.NS', 'BANKBARODA.NS', 'ATGL.NS', 'MINDTREE.NS',
    'PIDILITIND.NS', 'PERSISTENT.NS', 'IGL.NS', 'HAL.NS', 'ICICIBANK.NS'
]

# the list repeats 'PIDILITIND.NS' and 'ICICIBANK.NS'; keep the first
# occurrence of each symbol so no company's rows are downloaded twice
companies = list(dict.fromkeys(companies))

# adjust this to a date that was not included in train-test data
new_start_date = '2023-02-01'
# end date is not included in train-test data as well
new_end_date = '2023-04-01'

# one frame per symbol, concatenated once after the loop (DataFrame.append
# was removed in pandas 2.0)
fetched_frames = []
for symbol in companies:
    fetched_data = yf.download(symbol, start=new_start_date, end=new_end_date)
    # symbols with no rows in this period contribute nothing
    if fetched_data.empty:
        print(f'No data returned for {symbol}; skipping')
        continue
    fetched_data['Symbol'] = symbol
    fetched_frames.append(fetched_data)

if not fetched_frames:
    raise RuntimeError('No out-of-sample data could be downloaded for any symbol')

new_data = pd.concat(fetched_frames)
new_data.reset_index(inplace=True)

# preprocessing the out-of-sample data
# convert 'Date' to datetime and sort
new_data['Date'] = pd.to_datetime(new_data['Date'])
new_data.sort_values(by=['Symbol', 'Date'], inplace=True)
new_data.reset_index(drop=True, inplace=True)

# handling missing values
new_data.dropna(inplace=True)

# removing outliers
for column in ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']:
    mean = new_data[column].mean()
    std = new_data[column].std()
    new_data = new_data[(new_data[column] > (mean - 3 * std)) & (new_data[column] < (mean + 3 * std))]

# the filtering above leaves a slice; copy it so the column assignments below
# are unambiguous and do not raise SettingWithCopyWarning
new_data = new_data.copy()

# normalizing numerical columns
# The models were trained on values scaled by the MinMaxScaler fitted in the
# data-cleaning cell, so the out-of-sample rows must go through that same
# fitted `scaler` (transform only). Fitting a fresh scaler on the validation
# data would put the features on a different scale than the one the models
# learned. This requires the data-cleaning cell to have been run in this
# session so that `scaler` is defined.
numerical_columns = new_data.select_dtypes(include=['float64', 'int64']).columns
new_data[numerical_columns] = scaler.transform(new_data[numerical_columns])

# feature engineering - same features as for the training data; every
# windowed feature is computed per 'Symbol' so a window never spans two
# different companies
new_data['Daily_Return'] = (new_data['Close'] - new_data['Open']) / new_data['Open']
new_data['High_Low_Range'] = new_data['High'] - new_data['Low']
new_data['7Day_MA_Close'] = new_data.groupby('Symbol')['Close'].transform(
    lambda close: close.rolling(window=7).mean()
)
new_data['14Day_MA_Close'] = new_data.groupby('Symbol')['Close'].transform(
    lambda close: close.rolling(window=14).mean()
)
new_data['7Day_MA_Volume'] = new_data.groupby('Symbol')['Volume'].transform(
    lambda volume: volume.rolling(window=7).mean()
)

# RSI (14-day): average gain / average loss of the close-to-close change,
# again within each symbol
delta = new_data.groupby('Symbol')['Close'].diff(1)
up = delta.clip(lower=0)
down = -1 * delta.clip(upper=0)
roll_up = up.groupby(new_data['Symbol']).transform(lambda gains: gains.rolling(window=14).mean())
roll_down = down.groupby(new_data['Symbol']).transform(lambda losses: losses.rolling(window=14).mean())
RS = roll_up / roll_down
new_data['RSI'] = 100.0 - (100.0 / (1.0 + RS))

# dropping the warm-up rows at the start of every symbol (NaN from the windows)
new_data.dropna(inplace=True)

# saving the preprocessed data to a CSV file
new_data.to_csv('/content/drive/MyDrive/Fall 2023/Applied Data Science/datasets/out_of_sample_stock_data.csv', index=False)



# Validating Model and Testing for Generalizability

In [None]:
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Read the out-of-sample dataset, turn +/-inf into NaN and drop every row
# that contains a NaN
file_path = '/content/drive/MyDrive/Fall 2023/Applied Data Science/datasets/out_of_sample_stock_data.csv'  # Adjust the file path if necessary
out_of_sample_data = (
    pd.read_csv(file_path)
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

# Feature columns: the same ones the models were trained on
selected_features = ['Volume', 'Daily_Return', 'High_Low_Range', '7Day_MA_Close', '14Day_MA_Close', '7Day_MA_Volume', 'RSI']
X_out_sample = out_of_sample_data[selected_features]
_actual_close = out_of_sample_data['Close']

# Predictions from the linear regression and the tuned random forest that
# were fitted in the training cell
lr_out_sample_predictions = lr_model.predict(X_out_sample)
best_rf_out_sample_predictions = best_rf_model.predict(X_out_sample)

# MSE, MAE and R-squared of each model against the actual 'Close' values
_metric_functions = (mean_squared_error, mean_absolute_error, r2_score)

lr_out_sample_mse, lr_out_sample_mae, lr_out_sample_r2 = [
    _metric(_actual_close, lr_out_sample_predictions) for _metric in _metric_functions
]
best_rf_out_sample_mse, best_rf_out_sample_mae, best_rf_out_sample_r2 = [
    _metric(_actual_close, best_rf_out_sample_predictions) for _metric in _metric_functions
]

# Print the results: linear regression first, then random forest
_out_of_sample_report = [
    ('Linear Regression - Out-of-Sample MSE:', lr_out_sample_mse),
    ('Linear Regression - Out-of-Sample MAE:', lr_out_sample_mae),
    ('Linear Regression - Out-of-Sample R-squared:', lr_out_sample_r2),
    ('Random Forest - Out-of-Sample MSE:', best_rf_out_sample_mse),
    ('Random Forest - Out-of-Sample MAE:', best_rf_out_sample_mae),
    ('Random Forest - Out-of-Sample R-squared:', best_rf_out_sample_r2),
]
for _caption, _score in _out_of_sample_report:
    print(_caption, _score)


Linear Regression - Out-of-Sample MSE: 0.004594872740885675
Linear Regression - Out-of-Sample MAE: 0.0284734319152641
Linear Regression - Out-of-Sample R-squared: 0.9187099894273586
Random Forest - Out-of-Sample MSE: 0.004046743904095051
Random Forest - Out-of-Sample MAE: 0.02501343144940093
Random Forest - Out-of-Sample R-squared: 0.9284071892086284
