In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error , r2_score
from sklearn.preprocessing import StandardScaler

# Data exploration 

In [2]:
train_data = pd.read_csv("train_data.csv")
test_data = pd.read_csv("test_data.csv")
print("Train Data",train_data.shape)
print("Test Data",test_data.shape)



Train Data (279411, 8)
Test Data (119748, 8)


In [3]:
train_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279411 entries, 0 to 279410
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Unnamed: 0    279411 non-null  int64  
 1   open          279411 non-null  float64
 2   high          279411 non-null  float64
 3   low           279411 non-null  float64
 4   close         279411 non-null  float64
 5   volume        279411 non-null  float64
 6   symbol_LB     279411 non-null  int64  
 7   Daily_Return  279411 non-null  float64
dtypes: float64(6), int64(2)
memory usage: 17.1 MB


In [4]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119748 entries, 0 to 119747
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Unnamed: 0    119748 non-null  int64  
 1   open          119748 non-null  float64
 2   high          119748 non-null  float64
 3   low           119748 non-null  float64
 4   close         119748 non-null  float64
 5   volume        119748 non-null  float64
 6   symbol_LB     119748 non-null  int64  
 7   Daily_Return  119748 non-null  float64
dtypes: float64(6), int64(2)
memory usage: 7.3 MB


In [5]:
train_data.describe()

Unnamed: 0.1,Unnamed: 0,open,high,low,close,volume,symbol_LB,Daily_Return
count,279411.0,279411.0,279411.0,279411.0,279411.0,279411.0,279411.0,279411.0
mean,246095.610205,0.000625,0.000619,0.000648,0.000643,-0.000939,210.616583,0.000377
std,142977.011257,1.000956,1.00094,1.000969,1.000947,0.998864,121.539068,0.012442
min,2.0,-1.822676,-1.822096,-1.909717,-1.821662,-1.301871,0.0,-0.139902
25%,122189.5,-0.770393,-0.770492,-0.770898,-0.770652,-0.75091,106.0,-0.006034
50%,245168.0,-0.145313,-0.145288,-0.144625,-0.144972,-0.314977,210.0,0.000456
75%,369100.5,0.610793,0.610121,0.610705,0.610831,0.464478,315.0,0.006936
max,497470.0,2.902225,3.502887,2.941988,3.352516,3.479182,421.0,0.805807


In [6]:
test_data.describe()

Unnamed: 0.1,Unnamed: 0,open,high,low,close,volume,symbol_LB,Daily_Return
count,119748.0,119748.0,119748.0,119748.0,119748.0,119748.0,119748.0,119748.0
mean,246278.575375,-0.001459,-0.001445,-0.001512,-0.001499,0.002191,210.999265,0.000341
std,142956.093812,0.997773,0.99781,0.997741,0.997794,1.002652,121.712394,0.012419
min,0.0,-1.809901,-1.805799,-1.818213,-1.811629,-1.303565,0.0,-0.492185
25%,122347.25,-0.772765,-0.772129,-0.773044,-0.772476,-0.750771,107.0,-0.006038
50%,244636.5,-0.146226,-0.146495,-0.145544,-0.147252,-0.313742,210.0,0.000449
75%,369837.75,0.604406,0.603783,0.605494,0.604143,0.467179,316.0,0.00694
max,497471.0,2.902225,3.062257,2.93555,3.087407,3.479708,421.0,0.132757


In [7]:
# select the feature based on test data
test_data.corr()["volume"].sort_values(ascending=False)

volume          1.000000
Unnamed: 0     -0.003077
Daily_Return   -0.006035
symbol_LB      -0.006975
high           -0.301240
open           -0.303259
close          -0.303450
low            -0.305502
Name: volume, dtype: float64

In [8]:
# Prepare features and target
x_train = train_data.drop(["Unnamed: 0","Daily_Return","symbol_LB","close"],axis=1)
y_train = train_data["close"]
x_test = test_data.drop(["Unnamed: 0","Daily_Return","symbol_LB","close"],axis=1)
y_test = test_data["close"]

# Scale the features
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [9]:
Linear_model = Ridge()

In [10]:
# Train the model with scaled data
Linear_model = Ridge()
Linear_model.fit(x_train_scaled, y_train)

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [11]:
y_pred = Linear_model.predict(x_test_scaled)

# Model Evaluation 

In [12]:
print("Mean Square Error : " , mean_squared_error(y_pred=y_pred,y_true=y_test))
print("R2_score : " ,r2_score(y_pred=y_pred,y_true=y_test))

Mean Square Error :  0.00015181100549354812
R2_score :  0.999847515854391


In [13]:
coefficients = pd.DataFrame({
    'Feature': x_test.columns,
    'Coefficient': Linear_model.coef_
})
print(coefficients.sort_values('Coefficient', ascending=False))
print(f"Intercept: {Linear_model.intercept_}")

  Feature  Coefficient
1    high     0.795688
2     low     0.728549
3  volume    -0.000228
0    open    -0.523431
Intercept: 0.000642606033424297
