#### The data is the price history and trading volumes of the Asian Paints stocks in the index NIFTY 50 from NSE (National Stock Exchange) India. All datasets are at a day-level with pricing and trading values split across .cvs files for each stock along with a metadata file with some macro-information about the stocks itself. The data spans from 1st January, 2000 to 30th April, 2021

In [17]:
from warnings import filterwarnings
filterwarnings('ignore')

import pandas as pd
import numpy as np

In [18]:
df = pd.read_csv('ASIANPAINT.csv')
df.head()

Unnamed: 0,Date,Symbol,Series,Prev Close,Open,High,Low,Last,Close,VWAP,Volume,Turnover,Trades,Deliverable Volume,%Deliverble
0,2000-01-03,ASIANPAINT,EQ,361.2,370.0,390.0,370.0,385.0,381.65,380.54,3318,126261700000.0,,,
1,2000-01-04,ASIANPAINT,EQ,381.65,380.0,392.0,375.0,390.0,385.55,383.5,4818,184769900000.0,,,
2,2000-01-05,ASIANPAINT,EQ,385.55,371.5,390.0,371.5,383.0,383.0,379.81,2628,99813840000.0,,,
3,2000-01-06,ASIANPAINT,EQ,383.0,384.9,384.9,374.5,375.1,377.5,379.88,3354,127411400000.0,,,
4,2000-01-07,ASIANPAINT,EQ,377.5,376.0,390.0,370.0,389.0,385.7,383.38,9589,367627500000.0,,,


In [19]:
df.shape

(5306, 15)

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5306 entries, 0 to 5305
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Date                5306 non-null   object 
 1   Symbol              5306 non-null   object 
 2   Series              5306 non-null   object 
 3   Prev Close          5306 non-null   float64
 4   Open                5306 non-null   float64
 5   High                5306 non-null   float64
 6   Low                 5306 non-null   float64
 7   Last                5306 non-null   float64
 8   Close               5306 non-null   float64
 9   VWAP                5306 non-null   float64
 10  Volume              5306 non-null   int64  
 11  Turnover            5306 non-null   float64
 12  Trades              2456 non-null   float64
 13  Deliverable Volume  4797 non-null   float64
 14  %Deliverble         4797 non-null   float64
dtypes: float64(11), int64(1), object(3)
memory usage: 621.9

1. For Regression:
Close price (most common in stock forecasting)

2. For Classification:
Price Movement Direction
Target = 1 if next day’s Close > today’s Close, else 0.
Binary classification → predicts uptrend vs downtrend.

In [21]:
df.isnull().sum()

Date                     0
Symbol                   0
Series                   0
Prev Close               0
Open                     0
High                     0
Low                      0
Last                     0
Close                    0
VWAP                     0
Volume                   0
Turnover                 0
Trades                2850
Deliverable Volume     509
%Deliverble            509
dtype: int64

In [38]:
df.describe()

Unnamed: 0,Prev Close,Open,High,Low,Last,Close,VWAP,Volume,Turnover,Trades,Deliverable Volume,%Deliverble
count,5306.0,5306.0,5306.0,5306.0,5306.0,5306.0,5306.0,5306.0,5306.0,2456.0,4797.0,4797.0
mean,1247.000952,1247.683952,1264.625349,1230.900697,1247.317132,1247.410903,1247.799926,509672.1,69429190000000.0,41346.011401,275128.1,0.625572
std,1074.399506,1074.025577,1087.238871,1062.629409,1074.432667,1074.476439,1075.057438,853599.3,144397300000000.0,36435.449733,415746.5,0.187597
min,210.75,210.0,215.75,204.0,210.6,210.75,212.22,25.0,1278415000.0,70.0,25.0,0.0768
25%,415.7375,415.0,424.95,410.0,416.0,416.2625,415.6925,13707.5,753813300000.0,16264.75,12622.0,0.4862
50%,889.375,890.0,903.975,878.075,890.0,889.475,888.35,57136.0,11094370000000.0,32402.5,52738.0,0.6161
75%,1599.0375,1599.8,1629.675,1573.0375,1602.5,1600.7625,1600.9825,807790.5,77614790000000.0,54727.5,438375.0,0.7655
max,5213.1,5221.1,5247.75,5150.05,5221.1,5213.1,5175.79,11545940.0,2216650000000000.0,351347.0,7938427.0,1.0


In [39]:
df1 = df.copy()

In [40]:
df1.columns

Index(['Date', 'Symbol', 'Series', 'Prev Close', 'Open', 'High', 'Low', 'Last',
       'Close', 'VWAP', 'Volume', 'Turnover', 'Trades', 'Deliverable Volume',
       '%Deliverble'],
      dtype='object')

In [43]:
X = df1[['Prev Close','Open','High','Low','Last','VWAP','Volume','Turnover']]
y = df1['Close']

In [56]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

### Split

In [57]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(3714, 8) (1592, 8) (3714,) (1592,)


### Scale

For linear models (regression, logistic regression), standardization (scaling) is usually preferred.

For distance-based algorithms (K-NN, clustering), normalization is preferred.

In [59]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

### Model

In [53]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train_scaled, y_train)

In [54]:
y_pred = model.predict(X_test_scaled)
y_pred

array([ 364.05160314,  931.44639453, 2864.65595592, ..., 1728.67484853,
        748.00790209, 2185.51779725], shape=(1592,))

### Metrics

In [55]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(mse)
print(mae)
print(r2)

10755.346092404947
84.93573631337232
0.9898992875600541


### Cross Validation

In [58]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

model = LinearRegression()
kfold_validation = KFold(10)

results = cross_val_score(model, X_scaled, y, cv=kfold_validation)
print(results)

print('Score: ',np.mean(results))

[0.99937253 0.99775729 0.99975064 0.99960878 0.9999046  0.99961936
 0.99997468 0.99968758 0.99959708 0.99981666]
Score:  0.9995089214863763


##### The model is performing consistently across different data splits¶

### Ridge Model

In [60]:
from sklearn.linear_model import Ridge

ridge = Ridge(1.0)
ridge.fit(X_train_scaled, y_train)

y_pred = ridge.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(mse)
print(mae)
print(r2)

10719.482665823065
84.7823894140872
0.9899329681274577


In [61]:
print('Coefficients:', ridge.coef_)
print('Intercept: ',ridge.intercept_)

Coefficients: [ 2.15522251e+01 -1.48401083e+01  1.87624651e+02  1.47076544e+02
  4.62935139e+02  2.86694471e+02  2.12432549e-01 -1.30473776e-01]
Intercept:  1272.8488152934842


### Hyperparameter Tuning

In [69]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

param_dist = {
    'alpha':uniform(0.001,10)
}

ridge_cv = RandomizedSearchCV(
    estimator=ridge,
    param_distributions=param_dist,
    n_iter = 50,
    cv = 5,
    scoring='neg_mean_squared_error',
    random_state= 42,
    n_jobs=-1
)

ridge_cv.fit(X_train_scaled, y_train)

In [70]:
ridge_cv.best_params_

{'alpha': np.float64(0.20684494295802447)}

In [71]:
ridge_cv.best_score_

np.float64(-23.881907009216956)

In [72]:
ridge_cv.best_estimator_

In [73]:
best_ridge = ridge_cv.best_estimator_
y_pred_best = best_ridge.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred_best)
mae = mean_absolute_error(y_test, y_pred_best)
r2 = r2_score(y_test, y_pred_best)

print(mse)
print(mae)
print(r2)

10731.883938857218
84.84306816252688
0.989921321668875


# Classification Model

Create target: Movement = 1 if next day Close > today Close, else 0

In [74]:
df.head()

Unnamed: 0,Date,Symbol,Series,Prev Close,Open,High,Low,Last,Close,VWAP,Volume,Turnover,Trades,Deliverable Volume,%Deliverble
0,2000-01-03,ASIANPAINT,EQ,361.2,370.0,390.0,370.0,385.0,381.65,380.54,3318,126261700000.0,,,
1,2000-01-04,ASIANPAINT,EQ,381.65,380.0,392.0,375.0,390.0,385.55,383.5,4818,184769900000.0,,,
2,2000-01-05,ASIANPAINT,EQ,385.55,371.5,390.0,371.5,383.0,383.0,379.81,2628,99813840000.0,,,
3,2000-01-06,ASIANPAINT,EQ,383.0,384.9,384.9,374.5,375.1,377.5,379.88,3354,127411400000.0,,,
4,2000-01-07,ASIANPAINT,EQ,377.5,376.0,390.0,370.0,389.0,385.7,383.38,9589,367627500000.0,,,


In [79]:
df['Close']

0        381.65
1        385.55
2        383.00
3        377.50
4        385.70
         ...   
5301    2557.90
5302    2574.35
5303    2614.55
5304    2613.45
5305    2536.40
Name: Close, Length: 5306, dtype: float64

In [78]:
df['Next_Close'] = df['Close'].shift(-1)
df['Next_Close']

0        385.55
1        383.00
2        377.50
3        385.70
4        415.00
         ...   
5301    2574.35
5302    2614.55
5303    2613.45
5304    2536.40
5305        NaN
Name: Next_Close, Length: 5306, dtype: float64

In [80]:
(df['Next_Close']>df['Close'])

0        True
1       False
2       False
3        True
4        True
        ...  
5301     True
5302     True
5303    False
5304    False
5305    False
Length: 5306, dtype: bool

In [81]:
(df['Next_Close']>df['Close']).astype(int)

0       1
1       0
2       0
3       1
4       1
       ..
5301    1
5302    1
5303    0
5304    0
5305    0
Length: 5306, dtype: int64

In [82]:
df['Movement'] = (df['Next_Close']>df['Close']).astype(int)

In [84]:
df.isnull().sum()

Date                     0
Symbol                   0
Series                   0
Prev Close               0
Open                     0
High                     0
Low                      0
Last                     0
Close                    0
VWAP                     0
Volume                   0
Turnover                 0
Trades                2850
Deliverable Volume     509
%Deliverble            509
Next_Close               1
Movement                 0
dtype: int64

In [85]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression

features = ['Prev Close', 'Open', 'High', 'Low', 'Last','Close', 'VWAP', 'Volume',
            'Turnover', 'Trades', 'Deliverable Volume', '%Deliverble']

X = df[features]
y = df['Movement']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(transformers=[
    ('num',numeric_transformer, features)
])

clf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

In [89]:
print(y.value_counts())

Movement
1    2694
0    2612
Name: count, dtype: int64


In [90]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(3714, 12) (1592, 12) (3714,) (1592,)


In [92]:
clf_pipeline.fit(X_train, y_train)
y_pred = clf_pipeline.predict(X_test)

In [94]:
from sklearn.metrics import accuracy_score, classification_report

accuracy_score(y_test, y_pred)

0.49183417085427134

In [96]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.46      0.09      0.15       796
           1       0.50      0.89      0.64       796

    accuracy                           0.49      1592
   macro avg       0.48      0.49      0.40      1592
weighted avg       0.48      0.49      0.40      1592

