In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the bank data set from the CSV that sits next to this notebook.
banks = pd.read_csv(filepath_or_buffer='BANKS-large.csv')

In [3]:
# Preview the first five rows to check the columns that were loaded.
banks.head(n=5)

Unnamed: 0,Date,HDFC,AXIS,ICICI,KOTAK,SBI,UBI,PNB,INDUSIND,BANDHAN,RBL,IDBI,CANARA,BOI,INDIAN,UCO
0,2021-03-10,1549.045776,760.75,623.77356,1962.97522,384.15332,37.849998,41.5,1034.747681,348.518799,69.5,38.25,161.199997,76.949997,132.86113,13.0
1,2021-03-12,1545.262085,750.599976,611.060791,1934.189575,377.613678,37.650002,41.049999,1017.632141,348.020477,69.699997,42.0,157.0,75.849998,129.608597,13.4
2,2021-03-15,1522.0625,744.400024,601.738098,1917.697754,380.388062,36.950001,40.799999,1033.553589,338.253571,72.150002,40.349998,158.300003,73.599998,125.025475,14.05
3,2021-03-16,1505.633667,737.75,593.213074,1902.255371,375.086975,36.450001,39.799999,1029.622925,338.353241,77.0,38.650002,157.550003,71.949997,122.807838,14.05
4,2021-03-17,1488.906006,727.900024,587.778931,1867.422607,364.782135,35.150002,38.400002,1003.999329,334.765381,76.790001,35.849998,150.25,68.599998,114.62722,13.7


In [4]:
# Remove the "Date" column, leaving the per-bank numeric columns.
# `errors="ignore"` keeps this cell safe to re-run: `banks` is rebound in place,
# so a second execution would otherwise raise KeyError ("Date" is already gone).
banks = banks.drop(columns=["Date"], errors="ignore")

In [5]:
# Pairwise Pearson correlations between all remaining columns,
# rendered as a colour-shaded table.
corr = banks.corr(method="pearson")
corr.style.background_gradient()

Unnamed: 0,HDFC,AXIS,ICICI,KOTAK,SBI,UBI,PNB,INDUSIND,BANDHAN,RBL,IDBI,CANARA,BOI,INDIAN,UCO
HDFC,1.0,0.658573,0.53645,0.721264,0.494879,0.350826,0.578677,0.773089,0.031208,0.265863,0.552917,0.245987,-0.156385,0.51431,0.605476
AXIS,0.658573,1.0,0.404942,0.351749,0.421538,0.086015,0.521124,0.699112,0.212026,-0.249887,0.219923,0.162935,0.001876,0.406291,0.536327
ICICI,0.53645,0.404942,1.0,0.59682,0.940852,0.774752,0.386994,0.197896,-0.392143,0.186292,0.795671,0.830503,-0.636169,0.755134,0.434425
KOTAK,0.721264,0.351749,0.59682,1.0,0.585587,0.620103,0.479713,0.55967,0.010172,0.228371,0.739383,0.516022,-0.411962,0.629588,0.394808
SBI,0.494879,0.421538,0.940852,0.585587,1.0,0.826957,0.51852,0.243944,-0.257841,0.145475,0.829111,0.890228,-0.565379,0.868741,0.531423
UBI,0.350826,0.086015,0.774752,0.620103,0.826957,1.0,0.481911,0.066828,-0.135339,0.25851,0.892343,0.876062,-0.478338,0.836327,0.401083
PNB,0.578677,0.521124,0.386994,0.479713,0.51852,0.481911,1.0,0.605835,0.286788,0.295265,0.479424,0.280958,0.27189,0.727703,0.847078
INDUSIND,0.773089,0.699112,0.197896,0.55967,0.243944,0.066828,0.605835,1.0,0.213326,0.06996,0.272978,-0.062336,0.165999,0.429573,0.634731
BANDHAN,0.031208,0.212026,-0.392143,0.010172,-0.257841,-0.135339,0.286788,0.213326,1.0,-0.29056,-0.160275,-0.196012,0.50029,-0.086572,0.044044
RBL,0.265863,-0.249887,0.186292,0.228371,0.145475,0.25851,0.295265,0.06996,-0.29056,1.0,0.190263,0.016703,0.101519,0.216216,0.369324


In [6]:
# Target is the HDFC column; every other column is used as a predictor.
X = banks.drop(columns=["HDFC"])
Y = banks["HDFC"]

In [7]:
# Hold out half of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.5,
    random_state=1,
)

# LASSO Regression

In [9]:
# Lasso for one hand-picked alpha.
#
# The `normalize=True` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2, so it is replaced by the equivalent recipe from the deprecation notice:
# scale the columns with StandardScaler(with_mean=False) and multiply the
# original alpha by sqrt(n_samples).
lasso_pipeline = make_pipeline(
    StandardScaler(with_mean=False),
    Lasso(alpha=0.01 * np.sqrt(len(X_train)), max_iter=10000, fit_intercept=True),
)
lasso_pipeline.fit(X_train, y_train)

# Pull the fitted steps back out. `lasso` now expects *scaled* features, so use
# `lasso_pipeline.predict(...)` rather than `lasso.predict(...)` on raw data.
lasso_scaler = lasso_pipeline[0]
lasso = lasso_pipeline[-1]

print('intercept  :- ', lasso.intercept_)
# Coefficients were learned on scaled columns; dividing by each column's scale
# reports them per unit of the original feature, as the old cell did.
print(pd.Series(lasso.coef_ / lasso_scaler.scale_, index=X.columns))

intercept  :-  314.5951223034597
AXIS        0.669949
ICICI       0.270478
KOTAK       0.086165
SBI        -0.118341
UBI         1.138814
PNB        -2.264123
INDUSIND    0.251522
BANDHAN     0.249255
RBL         0.978052
IDBI        5.196092
CANARA     -0.358243
BOI         0.000000
INDIAN     -1.608053
UCO         7.806425
dtype: float64


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Lasso())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * np.sqrt(n_samples). 


## Finding the optimal alpha for Lasso

In [10]:
# Candidate penalties for the search: 1000 values evenly spaced on a log10
# scale, running from 10**1 down to 10**-4.
alphas = np.logspace(1, -4, num=1000)
alphas

array([1.00000000e+01, 9.88541702e+00, 9.77214697e+00, 9.66017480e+00,
       9.54948564e+00, 9.44006479e+00, 9.33189772e+00, 9.22497005e+00,
       9.11926760e+00, 9.01477631e+00, 8.91148232e+00, 8.80937190e+00,
       8.70843150e+00, 8.60864770e+00, 8.51000725e+00, 8.41249705e+00,
       8.31610415e+00, 8.22081576e+00, 8.12661920e+00, 8.03350198e+00,
       7.94145172e+00, 7.85045620e+00, 7.76050334e+00, 7.67158118e+00,
       7.58367791e+00, 7.49678187e+00, 7.41088152e+00, 7.32596543e+00,
       7.24202233e+00, 7.15904109e+00, 7.07701066e+00, 6.99592017e+00,
       6.91575883e+00, 6.83651600e+00, 6.75818117e+00, 6.68074392e+00,
       6.60419396e+00, 6.52852114e+00, 6.45371540e+00, 6.37976681e+00,
       6.30666554e+00, 6.23440189e+00, 6.16296626e+00, 6.09234915e+00,
       6.02254120e+00, 5.95353313e+00, 5.88531578e+00, 5.81788007e+00,
       5.75121707e+00, 5.68531791e+00, 5.62017385e+00, 5.55577622e+00,
       5.49211648e+00, 5.42918618e+00, 5.36697695e+00, 5.30548053e+00,
      

In [11]:
# Cross-validate over the alpha grid and show the penalty that was selected.
# NOTE(review): `normalize=True` is deprecated in recent scikit-learn releases.
# It is left as-is because the selected `alpha_` is consumed on this scale by
# the cell that refits Lasso -- migrate both together.
lassocv = LassoCV(
    alphas=alphas,
    fit_intercept=True,
    normalize=True,
).fit(X_train, y_train)
lassocv.alpha_

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Lasso())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * np.sqrt(n_samples). 


0.03330600343624589

In [12]:
# Refit Lasso at the cross-validated alpha and score it on the held-out half.
# NOTE(review): `normalize=True` is deprecated in recent scikit-learn releases;
# the same `lasso` object is refit on raw features later, so migrate with care.
lasso = Lasso(
    alpha=lassocv.alpha_,
    fit_intercept=True,
    normalize=True,
).fit(X_train, y_train)

mean_squared_error(y_true=y_test, y_pred=lasso.predict(X_test))

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Lasso())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * np.sqrt(n_samples). 


655.2544390982155

In [13]:
# Refit the tuned model on every row (train and test together) and list the
# coefficient learned for each predictor.
lasso.fit(X, Y)
pd.Series(data=lasso.coef_, index=X.columns)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Lasso())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * np.sqrt(n_samples). 


AXIS        0.622231
ICICI       0.173135
KOTAK       0.105275
SBI        -0.000000
UBI        -0.000000
PNB        -0.000000
INDUSIND    0.270145
BANDHAN     0.058622
RBL         1.025339
IDBI        4.568663
CANARA     -0.228175
BOI        -0.000000
INDIAN     -1.403357
UCO         1.754778
dtype: float64