Importing library and dataframe

In [2]:
import pandas as pd
import numpy as np

df=pd.read_csv("Daily.csv")
df

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,15-09-2024,2402.04,2417.56,2428.91,2396.68,133.42K,-0.64%
1,14-09-2024,2417.56,2439.29,2440.40,2384.99,154.06K,-0.92%
2,13-09-2024,2440.12,2361.79,2459.47,2337.76,302.56K,3.32%
3,12-09-2024,2361.79,2340.33,2387.25,2316.39,249.29K,0.92%
4,11-09-2024,2340.19,2387.98,2388.40,2279.00,333.29K,-1.99%
...,...,...,...,...,...,...,...
3107,14-03-2016,12.50,15.07,15.07,11.40,92.18K,-17.05%
3108,13-03-2016,15.07,12.92,15.07,12.92,1.30K,16.64%
3109,12-03-2016,12.92,11.95,13.45,11.95,0.83K,8.12%
3110,11-03-2016,11.95,11.75,11.95,11.75,0.18K,1.70%


## Data preprocessing

In [3]:
# dropping irrelevant data for regression
def drop_irrelevant(df):
    df=df.drop(["Open","High","Low","Vol.","Change %"],axis=1)
    return df
df = drop_irrelevant(df)

In [4]:
# checking for null values
print(df.isnull().sum())

Date     0
Price    0
dtype: int64


In [5]:
# conversion of date and price to specific datatype
def pre_processing(df):
    # price pre-processing
    for i in range(len(df)):
        if "," in df.loc[i,"Price"]:
            val=df.loc[i,"Price"]
            df.loc[i,"Price"]=float(val[:val.find(",")]+val[val.find(",")+1:])
        else:
            df.loc[i,"Price"]=float(df.loc[i,"Price"])

    # date pre-processing
    from datetime import datetime
    df["Date"]=pd.to_datetime(df["Date"], format='mixed')

    df=df.sort_values(by="Date")
    return df

df = pre_processing(df)

In [6]:
from bokeh.plotting import figure, output_notebook, show
from bokeh.models import HoverTool
from bokeh.io import push_notebook


def plot_dataset(df): 
    output_notebook() 


    p = figure(width=1150, height=600, x_axis_type='datetime', title='Interactive Date vs Price Plot',
        background_fill_color="#2F2F2F",  # Dark background
        border_fill_color="#2F2F2F",  # Dark border
        outline_line_color="#FFFFFF"  # Light border)
    )

    # Add a line renderer
    p.line(df['Date'], df['Price'], line_width=2, color='blue', legend_label='Price')

    # Add hover tool
    hover = HoverTool()
    hover.tooltips = [("Date", "@x{%F}"), ("Price", "@y")]
    hover.formatters = {'@x': 'datetime'}
    p.add_tools(hover)

    # Customize axes
    p.xaxis.axis_label = 'Date'
    p.yaxis.axis_label = 'Price ($)'

    # Show the plot
    show(p, notebook_handle=True)

In [7]:
plot_dataset(df)

Converting categorical data to numerical data

In [8]:
# converting date to numerical value for regression task
def encode(df):
    from sklearn.preprocessing import LabelEncoder

    df["Date"]=df["Date"].astype(str)

    le=LabelEncoder()

    unique_values=df["Date"].unique()

    le.fit(unique_values)
    df["Date"]=le.transform(df["Date"])

    return df

df = encode(df)
df


Unnamed: 0,Date,Price
3089,0,11.62
3059,1,8.75
3028,2,13.85
2998,3,12.35
2967,4,11.08
...,...,...
126,3107,2929.35
95,3108,3559.95
65,3109,3133.33
34,3110,2722.64


In [9]:
df.drop(['Date'], axis= 1, inplace=True)

In [10]:
df.head()

def make_dependence_dataset(df, shift):
    new_df = pd.DataFrame()
    if shift > df.shape[0]:
        return new_df

    for i in range(1,shift + 1):
        new_df[f'Lag_{i}'] = df.shift(i)
    new_df.dropna(inplace=True)
    new_df['target'] = df['Price'].iloc[shift:]
    return new_df


In [11]:
def full_preprocessing(df):
    df = drop_irrelevant(df)
    df = pre_processing(df)
    df = encode(df)
    df.drop(['Date'], axis= 1, inplace=True)
    df.reindex()
    return df

## Ridge Regression

In [12]:
def get_featues_target(df, shift):
    lagged_dataset = make_dependence_dataset(df,shift)
    features = lagged_dataset.drop(['target'], axis = 1)
    target = lagged_dataset['target']
    return features, target

In [13]:
from sklearn.preprocessing import PolynomialFeatures,StandardScaler
from sklearn.linear_model import RidgeCV

# import warnings
# # Suppress LinAlgWarning
# warnings.filterwarnings("ignore", Warning)


shifts = [i for i in range(1, 22, 5)]
cvs = [i for i in range(2, 10)]
polys = [i for i in range(1, 4)]

df = pd.read_csv("train.csv")
df = full_preprocessing(df)

test_df = pd.read_csv('test.csv')
test_df = full_preprocessing(test_df)

result = []

for shift in shifts:
    print(f"\n{shift}")
    X_train, y_train = get_featues_target(df, shift)
    X_test, y_test = get_featues_target(test_df, shift)

    # Apply StandardScaler
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    for poly in polys:
        poly_transformer = PolynomialFeatures(degree=poly)
        poly_X_train = poly_transformer.fit_transform(X_train)
        poly_X_test = poly_transformer.fit_transform(X_test)
        for cv in cvs:
            rd_model = RidgeCV(alphas=np.logspace(-6, 6, 13),cv=cv)
            rd_model.fit(poly_X_train, y_train)
            score = rd_model.score(poly_X_test, y_test)
            result.append([shift, poly, cv, score])
        print(f"{poly}",end="\t")

result = np.array(result)

        


1
1	2	3	
6
1	2	3	
11
1	2	3	
16
1	2	3	
21
1	2	3	

In [14]:
sorted_ind = result[:,-1].argsort()
result = result[sorted_ind]

In [15]:
best_shift, best_poly , best_cv , _ = result[-1]
best_shift = (int)(best_shift); best_cv = (int)(best_cv); best_poly = (int)(best_poly)
poly_transformer = PolynomialFeatures(degree=best_poly) 

best_rd_model = RidgeCV(cv=best_cv)
X_train, y_train = get_featues_target(df, best_shift)
X_test, y_test = get_featues_target(test_df, best_shift)

poly_X_train = poly_transformer.fit_transform(X_train)
poly_X_test = poly_transformer.fit_transform(X_test)
best_rd_model.fit(poly_X_train,y_train)
y_pred = best_rd_model.predict(poly_X_test)
best_rd_model.score(poly_X_test, y_test)

0.40823923282479446

### Back test

In [43]:
model_back_pred = best_rd_model.predict(poly_X_train)
model_front_pred = best_rd_model.predict(poly_X_test)
def plot_ridge_comparison(y_true,y_pred):
    if y_true.shape[0] != y_pred.shape[0]:
        print(f"incompactible {y_true.shape} {y_pred.shape}")
        return
    output_notebook() 
    x_data = [i for i in range(y_true.shape[0])]

    p = figure(width=1150, height=600, x_axis_type='datetime', title='Interactive Date vs Price Plot',
        background_fill_color="#424242",  # Dark background
        border_fill_color="#2F2F2F",  # Dark border
        outline_line_color="#FFFFFF"  # Light border
    )

    # Add a line renderer
    p.line(x_data, y_true, line_width=2, color='#F57F17', legend_label='True')
    p.line(x_data, y_pred, line_width=2, color='#AB47BC', legend_label='Predicted')

    # Add hover tool
    hover = HoverTool()
    p.add_tools(hover)

    # Customize axes
    p.xaxis.axis_label = 'Date'
    p.yaxis.axis_label = 'Price ($)'

    # Show the plot
    show(p, notebook_handle=True)

plot_ridge_comparison(y_train, model_back_pred)
plot_ridge_comparison(y_test, model_front_pred)

### front-test with prediction

In [65]:
from typing import Optional
import warnings

# Suppress LinAlgWarning from numpy
warnings.filterwarnings("ignore", category=UserWarning)


def front_test_pred(model, X_test:np.ndarray, transformer, till:Optional[int] = None):
    y_pred = []
    next_pred_on = list(X_test[0])
    length = X_test.shape[0]
    if till:
        length = till
    print(transformer.transform([next_pred_on]))
    for i in range(length):
        pred = model.predict(transformer.transform([next_pred_on]))[0]
        # print(pred)
        y_pred.append(pred)
        next_pred_on = next_pred_on[1:]
        next_pred_on.append(pred)
    return np.array(y_pred)


In [69]:
front_y_pred = front_test_pred(best_rd_model, np.array(X_test), poly_transformer, 50)

[[1.00000e+00 3.81595e+03 2.97327e+03 3.50282e+03 3.43351e+03 2.30428e+03
  2.35203e+03]]


In [70]:
plot_ridge_comparison(y_test.iloc[:50], front_y_pred)
plot_ridge_comparison(y_test, np.zeros(y_test.shape))

### Inference

Ridge Regression performs well in preddicting whether the values will go 'RISE' or 'FALL', but it performs very bad in predicting the next actual values. 