#### Predicting stock market prices is a critical task in finance and investment. Accurate predictions can lead to substantial profits, while poor predictions can result in significant losses. 

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('World-Stock-Prices-Dataset.csv')
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Brand_Name,Ticker,Industry_Tag,Country,Dividends,Stock Splits,Capital Gains
0,2025-07-03 00:00:00-04:00,6.63,6.74,6.615,6.64,4209664.0,peloton,PTON,fitness,usa,0.0,0.0,
1,2025-07-03 00:00:00-04:00,106.75,108.370003,106.330101,107.339996,560190.0,crocs,CROX,footwear,usa,0.0,0.0,
2,2025-07-03 00:00:00-04:00,122.629997,123.050003,121.550003,121.93,36600.0,adidas,ADDYY,apparel,germany,0.0,0.0,
3,2025-07-03 00:00:00-04:00,221.705002,224.009995,221.360001,223.410004,29295154.0,amazon,AMZN,e-commerce,usa,0.0,0.0,
4,2025-07-03 00:00:00-04:00,212.145004,214.649994,211.810104,213.550003,34697317.0,apple,AAPL,technology,usa,0.0,0.0,


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 310122 entries, 0 to 310121
Data columns (total 13 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           310122 non-null  object 
 1   Open           310122 non-null  float64
 2   High           310122 non-null  float64
 3   Low            310122 non-null  float64
 4   Close          310122 non-null  float64
 5   Volume         310122 non-null  float64
 6   Brand_Name     310122 non-null  object 
 7   Ticker         310122 non-null  object 
 8   Industry_Tag   310122 non-null  object 
 9   Country        310122 non-null  object 
 10  Dividends      310122 non-null  float64
 11  Stock Splits   310122 non-null  float64
 12  Capital Gains  2 non-null       float64
dtypes: float64(8), object(5)
memory usage: 30.8+ MB


In [4]:
df.describe()

Unnamed: 0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Capital Gains
count,310122.0,310122.0,310122.0,310122.0,310122.0,310122.0,310122.0,2.0
mean,76.325431,77.198011,75.434586,76.338032,22709370.0,0.003643,0.000875,0.0
std,141.68407,143.224109,140.063617,141.679129,85657440.0,0.069905,0.115434,0.0
min,0.0,0.0,0.0,0.198861,0.0,0.0,0.0,0.0
25%,15.9325,16.163165,15.710323,15.94,1379050.0,0.0,0.0,0.0
50%,35.40812,35.809869,34.982469,35.407547,4591642.0,0.0,0.0,0.0
75%,83.996691,84.914656,83.088609,84.0,12128600.0,0.0,0.0,0.0
max,3445.580078,3463.070068,3370.0,3427.610107,7421641000.0,15.0,50.0,0.0


In [5]:
df.shape

(310122, 13)

In [6]:
df.isnull().sum()

Date                  0
Open                  0
High                  0
Low                   0
Close                 0
Volume                0
Brand_Name            0
Ticker                0
Industry_Tag          0
Country               0
Dividends             0
Stock Splits          0
Capital Gains    310120
dtype: int64

In [9]:
df['Stock Splits'].value_counts()

Stock Splits
0.000000     310055
2.000000         32
4.000000          7
1.500000          5
10.000000         4
7.000000          2
5.000000          2
1.004000          2
20.000000         2
3.000000          2
50.000000         1
0.487329          1
1.196000          1
1.061000          1
1.998000          1
1.003000          1
1.013685          1
1.800000          1
1.142465          1
Name: count, dtype: int64

In [10]:
df.Dividends.value_counts()

Dividends
0.000000    307417
0.180000        62
0.150000        57
0.100000        52
0.160000        52
             ...  
0.157778         1
0.218000         1
0.066000         1
0.200556         1
0.314000         1
Name: count, Length: 476, dtype: int64

In [13]:
X = df[['Open','High','Low','Volume']]
y = df['Close']

0           6.640000
1         107.339996
2         121.930000
3         223.410004
4         213.550003
             ...    
310117     37.496925
310118     24.772856
310119     72.034958
310120      7.168196
310121      5.482471
Name: Close, Length: 310122, dtype: float64

In [14]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.3, random_state=42)

In [17]:
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(217085, 4) (93037, 4) (217085,) (93037,)


In [18]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)

In [21]:
y_pred = model.predict(X_test)
y_pred

array([ 97.23023222, 159.10909128,  22.55927227, ...,  31.19527865,
        12.1658398 ,  44.98275777], shape=(93037,))

In [25]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(mse)
print(mae)
print(r2)

1.305474934509078
0.39080940127151803
0.9999343334331467


In [26]:
df['Close']

0           6.640000
1         107.339996
2         121.930000
3         223.410004
4         213.550003
             ...    
310117     37.496925
310118     24.772856
310119     72.034958
310120      7.168196
310121      5.482471
Name: Close, Length: 310122, dtype: float64

In [27]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

model = LinearRegression()
kfold_validation = KFold(10)

results = cross_val_score(model, X, y, cv=kfold_validation)
print(results)

print('Score: ',np.mean(results))

[0.99994668 0.99991702 0.99988467 0.99987507 0.99994448 0.99996094
 0.99991517 0.99970578 0.99992852 0.9997114 ]
Score:  0.9998789730967486


#### The model is performing consistently across different data splits

### Hyperparameter Tuning

There is no much parameters for tuning in Linear Regression model,
so we can use Regularization models for tuning 

since there are only four features X = df[['Open','High','Low','Volume']]
we can go with ridge regression since we only need to shrink coefficients effectively and handle multicolinearity
if we need to eliminate some features we can go for lasso

Before using any regularization models  feature scaling is important

L1, L2 penalize coefficients based on their magitude

In [31]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [40]:
from sklearn.linear_model import Ridge

ridge = Ridge(1.0)
ridge.fit(X_train_scaled, y_train)

y_pred = ridge.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(mse)
print(mae)
print(r2)

2.3177952735375054
0.5685604496326497
0.9998834128068961


In [39]:
print('Coefficients:', ridge.coef_)
print('Intercept: ',ridge.intercept_)

Coefficients: [-7.51618009e+01  1.11411582e+02  1.05715501e+02  1.35609183e-03]
Intercept:  76.39592385949327


### RandomizedSearchCV

In [44]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

param_dist = {
    'alpha': uniform(0.001,10)
}

ridge_cv = RandomizedSearchCV(
    estimator=ridge,
    param_distributions=param_dist,
    n_iter = 50,
    cv = 5,
    scoring = 'neg_mean_squared_error',
    random_state = 42,
    n_jobs = -1
)

ridge_cv.fit(X_train_scaled, y_train)

In [45]:
ridge_cv.best_params_

{'alpha': np.float64(0.20684494295802447)}

In [48]:
ridge_cv.best_score_

np.float64(-1.266300633762554)

In [49]:
ridge_cv.best_estimator_

In [51]:
best_ridge = ridge_cv.best_estimator_
y_pred_best = best_ridge.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred_best)
mae = mean_absolute_error(y_test, y_pred_best)
r2 = r2_score(y_test, y_pred_best)

print(mse)
print(mae)
print(r2)

2.2886130360914785
0.5672761810350272
0.9998848806997643
