# 02 — Baseline Models (Reliance Industries)

Establish benchmarks (naive, moving average, linear regression) that deep learning models must exceed.

**Target:** Log return at t+1. Evaluate with MAE (magnitude) and directional accuracy.

In [1]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from ta.momentum import RSIIndicator
from ta.trend import EMAIndicator, MACD
from ta.volatility import BollingerBands

SYMBOL = "RELIANCE"
DATA_FILENAME = "RELI Historical Data.csv"

# Locate the CSV: try the working directory first, then one fallback
# directory (the parent when running inside notebooks/, notebooks/ otherwise).
project_root = Path(os.getcwd())
fallback_dir = (
    project_root.parent
    if project_root.name == 'notebooks'
    else project_root / 'notebooks'
)

csv_path = project_root / DATA_FILENAME
if not csv_path.exists():
    # The fallback path is used as-is; its existence is not checked here.
    csv_path = fallback_dir / DATA_FILENAME

print(f"Using data file: {csv_path}")

Using data file: /Users/bhavishya/VSC Projects/stock-price-prediction/notebooks/RELI Historical Data.csv


In [None]:
# Helper functions

# Parse volume (e.g., "8.96M") and convert to float
def parse_volume(vol_str):
    """Convert a volume value such as "8.96M" into a float.

    A trailing ``K``, ``M`` or ``B`` multiplies the number by one thousand,
    one million or one billion respectively. Thousands separators (``,``)
    are removed before conversion.

    Args:
        vol_str: Raw volume value; a string, a number, or a missing value.

    Returns:
        The volume as a float. Missing values, blank text, the ``'-'``
        placeholder and text that cannot be converted to a number all
        yield ``0.0``.
    """
    if pd.isna(vol_str):
        return 0.0

    # Strip first so padded placeholders such as " - " are recognised too.
    text = str(vol_str).strip()
    if text in ('', '-'):
        return 0.0

    multipliers = {'K': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}
    scale = multipliers.get(text[-1])
    number = text if scale is None else text[:-1]

    try:
        value = float(number.replace(',', ''))
    except ValueError:
        # Only conversion failures are treated as "no volume"; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        return 0.0
    return value if scale is None else value * scale

# Parse price (e.g., "1,234.56") and convert to float
def parse_price(price_str):
    """Convert a price value such as "1,234.56" into a float.

    Args:
        price_str: Raw price value; a string, a number, or a missing value.

    Returns:
        The price as a float, or ``np.nan`` for missing values, blank text
        and the ``'-'`` placeholder.

    Raises:
        ValueError: If the text is neither a placeholder nor a valid number.
    """
    if pd.isna(price_str):
        return np.nan

    # Strip before comparing so padded placeholders such as " - " count as missing.
    text = str(price_str).strip()
    if text in ('', '-'):
        return np.nan
    return float(text.replace(',', ''))

# Add technical indicators like RSI, MACD, EMAs, Bollinger Bands
def add_technical_indicators(df):
    """Return a copy of ``df`` extended with indicators computed from ``close``.

    Columns added, in order: ``rsi`` (window=14), ``macd`` (library default
    settings), ``ema_10``, ``ema_20``, ``bb_high`` and ``bb_low`` (Bollinger
    bands with window=20, window_dev=2). The input frame is left untouched.
    """
    enriched = df.copy()
    # Same squeezed close series that each indicator received individually.
    close = enriched['close'].squeeze()

    bands = BollingerBands(close=close, window=20, window_dev=2)
    indicator_columns = {
        'rsi': RSIIndicator(close=close, window=14).rsi(),
        'macd': MACD(close=close).macd(),
        'ema_10': EMAIndicator(close=close, window=10).ema_indicator(),
        'ema_20': EMAIndicator(close=close, window=20).ema_indicator(),
        'bb_high': bands.bollinger_hband(),
        'bb_low': bands.bollinger_lband(),
    }
    for column, values in indicator_columns.items():
        enriched[column] = values
    return enriched

# Add day-of-week, day-of-month, is_month_end
def add_calendar(df):
    """Return a copy of ``df`` with calendar columns derived from its index.

    Adds ``dow`` (index ``dayofweek``), ``dom`` (index ``day``) and
    ``is_month_end`` (index ``is_month_end`` cast to int). The index must
    expose those datetime attributes; the input frame is not modified.
    """
    stamps = df.index
    return df.assign(
        dow=stamps.dayofweek,
        dom=stamps.day,
        is_month_end=stamps.is_month_end.astype(int),
    )

def build_target(df):
    """Return a copy of ``df`` with the next-step log return in column ``y``.

    For the row at position t, ``y`` equals ``log(close[t+1]) - log(close[t])``,
    so the last row has no following price and its target is NaN.
    """
    result = df.copy()
    log_close = np.log(result['close'])
    # Forward difference: the next row's log price minus this row's log price.
    result['y'] = log_close.shift(-1) - log_close
    return result

def time_split(df, train_end, val_end):
    """Split ``df`` chronologically by its index into three frames.

    Args:
        df: Frame whose index can be compared with timestamps.
        train_end: Last date (inclusive) of the training partition.
        val_end: Last date (inclusive) of the validation partition.

    Returns:
        Tuple ``(train, val, test)``: rows with index <= ``train_end``, rows
        with ``train_end`` < index <= ``val_end``, and rows after ``val_end``.
    """
    first_cut, second_cut = pd.to_datetime(train_end), pd.to_datetime(val_end)
    stamps = df.index

    in_train = stamps <= first_cut
    in_val = (stamps > first_cut) & (stamps <= second_cut)
    in_test = stamps > second_cut
    return df[in_train], df[in_val], df[in_test]

def directional_accuracy(y_true, y_pred):
    """Return the share of positions where truth and prediction have equal sign.

    Signs come from ``np.sign``, which maps zero to 0, so a zero prediction
    only counts as a hit where the true value is also exactly zero.
    """
    true_sign = np.sign(y_true)
    pred_sign = np.sign(y_pred)
    hits = true_sign == pred_sign
    return hits.mean()

In [9]:
# Load the raw CSV found at csv_path with pandas' default parsing options
raw_df = pd.read_csv(csv_path)
# Preview the first rows to check the column names before any cleaning
raw_df.head()

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,28-11-2025,1567.5,1568.0,1581.3,1563.0,8.96M,0.26%
1,27-11-2025,1563.4,1575.0,1575.5,1556.0,9.79M,-0.41%
2,26-11-2025,1569.9,1542.3,1571.6,1540.5,14.05M,1.96%
3,25-11-2025,1539.7,1535.9,1559.6,1525.1,15.03M,0.25%
4,24-11-2025,1535.9,1550.0,1550.0,1531.8,18.43M,-0.69%


In [10]:
# Clean a copy so raw_df stays exactly as loaded. Renaming raw_df in place
# would make a second run of this cell fail on the missing 'Price' column.
prices = raw_df.copy()
prices.columns = [col.strip() for col in prices.columns]

# Convert the price and volume columns to floats
for col in ['Price', 'Open', 'High', 'Low']:
    prices[col] = prices[col].apply(parse_price)
prices['Vol.'] = prices['Vol.'].apply(parse_volume)

prices = prices.rename(columns={
    'Date': 'date', 'Price': 'close', 'Open': 'open',
    'High': 'high', 'Low': 'low', 'Vol.': 'volume',
})
# Dates are day-first (e.g. 28-11-2025); sort ascending so shifts and
# rolling windows run forward in time.
prices['date'] = pd.to_datetime(prices['date'], format='%d-%m-%Y')
prices = prices.sort_values('date').set_index('date')

# Keep only OHLCV columns
df = prices[['open', 'high', 'low', 'close', 'volume']].copy()
df = add_technical_indicators(df)
df = add_calendar(df)
df = build_target(df)
# Drop rows with any NaN (indicator warm-up rows and the final row, which has no next-day target)
df = df.dropna()

print(f"Dataset shape: {df.shape}")
print(f"Date range: {df.index.min()} to {df.index.max()}")
df.head()

Dataset shape: (224, 15)
Date range: 2025-01-03 00:00:00 to 2025-11-27 00:00:00


Unnamed: 0_level_0,open,high,low,close,volume,rsi,macd,ema_10,ema_20,bb_high,bb_low,dow,dom,is_month_end,y
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2025-01-03,1243.9,1262.05,1235.5,1251.15,15520000.0,51.08764,-12.696209,1233.3988,1242.912368,1306.781023,1186.328977,4,3,0,-0.026853
2025-01-06,1253.95,1262.0,1215.0,1218.0,14820000.0,40.467361,-13.113814,1230.599019,1240.539761,1295.336507,1188.418493,0,6,0,0.018586
2025-01-07,1222.0,1244.5,1221.25,1240.85,10070000.0,48.425991,-11.468761,1232.462833,1240.569308,1286.712472,1191.612528,1,7,0,0.019671
2025-01-08,1249.0,1271.05,1245.35,1265.5,19350000.0,55.359149,-8.082821,1238.469591,1242.943659,1282.675545,1193.714455,2,8,0,-0.008531
2025-01-09,1267.0,1269.75,1248.05,1254.75,12790000.0,52.071564,-6.195458,1241.429665,1244.068073,1278.347474,1195.697526,3,9,0,-0.010294


In [4]:
# Time-based split (60/20/20): the cut dates are the last index values of
# the first 60% and the first 80% of rows.
n_samples = len(df)
train_end_idx, val_end_idx = int(n_samples * 0.6), int(n_samples * 0.8)

train_end_date, val_end_date = df.index[train_end_idx - 1], df.index[val_end_idx - 1]

train, val, test = time_split(
    df,
    str(train_end_date.date()),
    str(val_end_date.date()),
)
# Every column except the target is used as a feature
feature_cols = [name for name in df.columns if name != 'y']

print(f"Train: {len(train)}, Val: {len(val)}, Test: {len(test)}")
print(f"Features: {len(feature_cols)}")

Train: 134, Val: 45, Test: 45
Features: 14


In [11]:
# Naive baseline: a constant zero-return forecast for every row of each split
naive_val, naive_test = (np.zeros(len(part)) for part in (val, test))

print("Naive baseline (predict 0):")
print(f"  Val predictions sample: {naive_val[:5]}")

Naive baseline (predict 0):
  Val predictions sample: [0. 0. 0. 0. 0.]


In [12]:
# Moving average baseline: one constant forecast per evaluation split, taken
# from the last rolling mean of the targets that come before that split.
ma_window = 5
ma_rolling = dict(window=ma_window, min_periods=1)

# Validation forecast: last rolling mean over the training targets
ma_train_series = train['y'].rolling(**ma_rolling).mean()
ma_pred_val = ma_train_series.iloc[-1]
ma_preds_val = pd.Series(ma_pred_val, index=val.index)

# Test forecast: last rolling mean over training + validation targets
ma_history = pd.concat([train['y'], val['y']])
ma_full_series = ma_history.rolling(**ma_rolling).mean()
ma_pred_test = ma_full_series.iloc[-1]
ma_preds_test = pd.Series(ma_pred_test, index=test.index)

print(f"Moving average (window={ma_window}):")
print(f"  Val prediction: {ma_pred_val:.6f}")
print(f"  Test prediction: {ma_pred_test:.6f}")

Moving average (window=5):
  Val prediction: -0.002585
  Test prediction: -0.002218


In [13]:
# Linear regression baseline: fit on the training split only, then predict
# the validation and test splits.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    (part[feature_cols], part['y']) for part in (train, val, test)
)

lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_preds_val, lr_preds_test = lr_model.predict(X_val), lr_model.predict(X_test)

print("Linear Regression:")
print(f"  Coefficients shape: {lr_model.coef_.shape}")
print(f"  Intercept: {lr_model.intercept_:.6f}")

Linear Regression:
  Coefficients shape: (14,)
  Intercept: -0.052496


In [14]:
# Evaluation function
def report(name, y_true_val, y_pred_val, y_true_test, y_pred_test):
    """Print MAE and directional accuracy for the validation and test splits.

    Args:
        name: Label printed as the heading of the report.
        y_true_val: True targets for the validation split.
        y_pred_val: Predictions for the validation split.
        y_true_test: True targets for the test split.
        y_pred_test: Predictions for the test split.

    Returns:
        None; the metrics are only printed.
    """
    lines = [
        f"\n{name}:",
        f"  Val MAE: {mean_absolute_error(y_true_val, y_pred_val):.6f}",
        f"  Test MAE: {mean_absolute_error(y_true_test, y_pred_test):.6f}",
        f"  Val Dir Acc: {directional_accuracy(y_true_val, y_pred_val):.4f}",
        f"  Test Dir Acc: {directional_accuracy(y_true_test, y_pred_test):.4f}",
    ]
    for line in lines:
        print(line)

# Report all baselines against the same validation and test targets
baselines = [
    ("Naive (predict 0)", naive_val, naive_test),
    ("Moving Average", ma_preds_val.values, ma_preds_test.values),
    ("Linear Regression", lr_preds_val, lr_preds_test),
]
for label, val_pred, test_pred in baselines:
    report(label, val['y'].values, val_pred, test['y'].values, test_pred)


Naive (predict 0):
  Val MAE: 0.008468
  Test MAE: 0.007322
  Val Dir Acc: 0.0000
  Test Dir Acc: 0.0000

Moving Average:
  Val MAE: 0.008669
  Test MAE: 0.007923
  Val Dir Acc: 0.4889
  Test Dir Acc: 0.4667

Linear Regression:
  Val MAE: 0.012863
  Test MAE: 0.008392
  Val Dir Acc: 0.3778
  Test Dir Acc: 0.4444
