In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.metrics import mean_squared_error

import warnings
# Silence every warning for the remainder of the notebook session.
# NOTE(review): a blanket "ignore" filter also hides deprecation/future
# warnings raised by libraries; consider narrowing it by category or module.
warnings.filterwarnings("ignore")

In [2]:
# Read the banks CSV; the path is relative to the notebook's working directory.
banks = pd.read_csv("datasets/BANKS-large.csv")

In [3]:
# Preview the first five rows to confirm the columns parsed as expected.
banks.head()

Unnamed: 0,Date,HDFC,AXIS,ICICI,KOTAK,SBI,UBI,PNB,INDUSIND,BANDHAN,RBL,IDBI,CANARA,BOI,INDIAN,UCO
0,2021-03-10,1549.045776,760.75,623.77356,1962.97522,384.15332,37.849998,41.5,1034.747681,348.518799,69.5,38.25,161.199997,76.949997,132.86113,13.0
1,2021-03-12,1545.262085,750.599976,611.060791,1934.189575,377.613678,37.650002,41.049999,1017.632141,348.020477,69.699997,42.0,157.0,75.849998,129.608597,13.4
2,2021-03-15,1522.0625,744.400024,601.738098,1917.697754,380.388062,36.950001,40.799999,1033.553589,338.253571,72.150002,40.349998,158.300003,73.599998,125.025475,14.05
3,2021-03-16,1505.633667,737.75,593.213074,1902.255371,375.086975,36.450001,39.799999,1029.622925,338.353241,77.0,38.650002,157.550003,71.949997,122.807838,14.05
4,2021-03-17,1488.906006,727.900024,587.778931,1867.422607,364.782135,35.150002,38.400002,1003.999329,334.765381,76.790001,35.849998,150.25,68.599998,114.62722,13.7


In [4]:
# Remove the Date column so that only the per-bank numeric columns remain for
# the correlation and regression steps.
# errors="ignore" keeps this cell idempotent: because it rebinds `banks`,
# re-running it after Date is already gone would otherwise raise a KeyError.
banks = banks.drop(columns=["Date"], errors="ignore")

## Correlation Analysis

In [5]:
# Pairwise Pearson correlation between all bank columns, rendered with a
# colour gradient so strong relationships stand out at a glance.
corr = banks.corr(method="pearson")
corr.style.background_gradient()

Unnamed: 0,HDFC,AXIS,ICICI,KOTAK,SBI,UBI,PNB,INDUSIND,BANDHAN,RBL,IDBI,CANARA,BOI,INDIAN,UCO
HDFC,1.0,0.658573,0.53645,0.721264,0.494879,0.350826,0.578677,0.773089,0.031208,0.265863,0.552917,0.245987,-0.156385,0.51431,0.605476
AXIS,0.658573,1.0,0.404942,0.351749,0.421538,0.086015,0.521124,0.699112,0.212026,-0.249887,0.219923,0.162935,0.001876,0.406291,0.536327
ICICI,0.53645,0.404942,1.0,0.59682,0.940852,0.774752,0.386994,0.197896,-0.392143,0.186292,0.795671,0.830503,-0.636169,0.755134,0.434425
KOTAK,0.721264,0.351749,0.59682,1.0,0.585587,0.620103,0.479713,0.55967,0.010172,0.228371,0.739383,0.516022,-0.411962,0.629588,0.394808
SBI,0.494879,0.421538,0.940852,0.585587,1.0,0.826957,0.51852,0.243944,-0.257841,0.145475,0.829111,0.890228,-0.565379,0.868741,0.531423
UBI,0.350826,0.086015,0.774752,0.620103,0.826957,1.0,0.481911,0.066828,-0.135339,0.25851,0.892343,0.876062,-0.478338,0.836327,0.401083
PNB,0.578677,0.521124,0.386994,0.479713,0.51852,0.481911,1.0,0.605835,0.286788,0.295265,0.479424,0.280958,0.27189,0.727703,0.847078
INDUSIND,0.773089,0.699112,0.197896,0.55967,0.243944,0.066828,0.605835,1.0,0.213326,0.06996,0.272978,-0.062336,0.165999,0.429573,0.634731
BANDHAN,0.031208,0.212026,-0.392143,0.010172,-0.257841,-0.135339,0.286788,0.213326,1.0,-0.29056,-0.160275,-0.196012,0.50029,-0.086572,0.044044
RBL,0.265863,-0.249887,0.186292,0.228371,0.145475,0.25851,0.295265,0.06996,-0.29056,1.0,0.190263,0.016703,0.101519,0.216216,0.369324


## Data Preparation

In [6]:
# Target is the HDFC column; every remaining column is a predictor.
Y = banks["HDFC"]
X = banks.drop(columns=["HDFC"])

## Train-test split

In [7]:
# Hold out half of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.5,
    random_state=1,
)

# Ridge Regression

In [8]:
# Ridge regression at a single, fixed penalty strength
# (scikit-learn calls the penalty `alpha`; textbooks usually write lambda).
# CASE 1: a small penalty.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2, so this call only works on older versions. Confirm the
# pinned scikit-learn version before re-running.
ridge1 = Ridge(alpha=2, normalize=True, fit_intercept=True).fit(X_train, y_train)

# Report the fitted intercept, then one coefficient per predictor column.
print('intercept :- ', ridge1.intercept_)
print(pd.Series(data=ridge1.coef_, index=X.columns))

intercept :-  848.9209776305045
AXIS        0.200160
ICICI       0.055929
KOTAK       0.071558
SBI         0.047756
UBI         0.046564
PNB         1.778129
INDUSIND    0.111831
BANDHAN     0.040602
RBL         0.156361
IDBI        0.871565
CANARA     -0.020876
BOI        -0.186874
INDIAN      0.153318
UCO         5.199469
dtype: float64


In [9]:
# Mean squared error of the small-penalty model on the held-out rows.
pred1 = ridge1.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=pred1)

1809.2379800894676

In [10]:
# Ridge regression at a single, fixed penalty strength
# (scikit-learn calls the penalty `alpha`; textbooks usually write lambda).
# CASE 2: a very large penalty (1e11, i.e. the same value as 10e10), which
# shrinks every coefficient towards zero.
ridge2 = Ridge(alpha=1e11, normalize=True, fit_intercept=True).fit(X_train, y_train)

# Report the fitted intercept, then one coefficient per predictor column.
print('intercept :- ', ridge2.intercept_)
print(pd.Series(data=ridge2.coef_, index=X.columns))

intercept :-  1511.7623028210005
AXIS        1.161651e-11
ICICI       5.165839e-12
KOTAK       4.257674e-12
SBI         6.825697e-12
UBI         5.001916e-11
PNB         1.681534e-10
INDUSIND    5.723939e-12
BANDHAN     1.830370e-12
RBL         7.971310e-12
IDBI        6.706497e-11
CANARA      5.825068e-12
BOI        -1.416236e-11
INDIAN      2.316079e-11
UCO         4.221405e-10
dtype: float64


In [11]:
# Mean squared error of the large-penalty model on the held-out rows.
pred2 = ridge2.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=pred2)

4559.431450719523

## Choosing the optimal alpha/lambda

In [12]:
# Grid of candidate penalties: 100 values spaced evenly on a log10 scale,
# running from 1e10 down to 1e-2.
alphas = np.logspace(10, -2, num=100)
alphas

array([1.00000000e+10, 7.56463328e+09, 5.72236766e+09, 4.32876128e+09,
       3.27454916e+09, 2.47707636e+09, 1.87381742e+09, 1.41747416e+09,
       1.07226722e+09, 8.11130831e+08, 6.13590727e+08, 4.64158883e+08,
       3.51119173e+08, 2.65608778e+08, 2.00923300e+08, 1.51991108e+08,
       1.14975700e+08, 8.69749003e+07, 6.57933225e+07, 4.97702356e+07,
       3.76493581e+07, 2.84803587e+07, 2.15443469e+07, 1.62975083e+07,
       1.23284674e+07, 9.32603347e+06, 7.05480231e+06, 5.33669923e+06,
       4.03701726e+06, 3.05385551e+06, 2.31012970e+06, 1.74752840e+06,
       1.32194115e+06, 1.00000000e+06, 7.56463328e+05, 5.72236766e+05,
       4.32876128e+05, 3.27454916e+05, 2.47707636e+05, 1.87381742e+05,
       1.41747416e+05, 1.07226722e+05, 8.11130831e+04, 6.13590727e+04,
       4.64158883e+04, 3.51119173e+04, 2.65608778e+04, 2.00923300e+04,
       1.51991108e+04, 1.14975700e+04, 8.69749003e+03, 6.57933225e+03,
       4.97702356e+03, 3.76493581e+03, 2.84803587e+03, 2.15443469e+03,
      

In [13]:
# Trace the coefficient path: reuse a single estimator, re-fit it at every
# candidate alpha, and keep the resulting coefficient vector (one per alpha).
# set_params() and fit() both return the estimator, so the calls chain.
ridge = Ridge(normalize=True, fit_intercept=True)
coefs = [
    ridge.set_params(alpha=penalty).fit(X_train, y_train).coef_
    for penalty in alphas
]

In [14]:
# Shape of the collected coefficient path: (number of alphas, number of predictors).
np.asarray(coefs).shape

(100, 14)

In [15]:
# Spot check: the penalty value at grid position 28.
alphas[28]

4037017.2585965497

In [16]:
# Spot check: the coefficient vector fitted at grid position 28.
coefs[28]

array([ 2.87749403e-07,  1.27961547e-07,  1.05465695e-07,  1.69077377e-07,
        1.23900957e-06,  4.16528136e-06,  1.41786191e-07,  4.53396690e-08,
        1.97455144e-07,  1.66124777e-06,  1.44290958e-07, -3.50811615e-07,
        5.73709209e-07,  1.04567261e-05])

### Choosing the optimal alpha for ridge regression

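Choosing lambda

1. Consider a grid of candidate values for lambda: $(\lambda_1, \lambda_2, \dots, \lambda_k)$.
2. For each $\lambda_i$ in the grid:
    1. fit a ridge regression model with that penalty;
    2. estimate the model's validation error using cross-validation (CV).
3. Choose the $\lambda_i$ for which the validation error is the smallest.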

In [17]:
# Let RidgeCV select the best penalty from the candidate grid by
# cross-validating on the training split only; the winner is exposed as alpha_.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2, so this call only works on older versions.
ridgecv = RidgeCV(alphas=alphas, normalize=True, fit_intercept=True).fit(X_train, y_train)
ridgecv.alpha_

0.017475284000076828

In [18]:
# Fit a ridge model on the training split at the cross-validated alpha and
# report its mean squared error on the held-out rows.
ridge4 = Ridge(alpha=ridgecv.alpha_, normalize=True, fit_intercept=True).fit(X_train, y_train)

mean_squared_error(y_true=y_test, y_pred=ridge4.predict(X_test))

664.177910042078

In [19]:
# Re-fit the chosen model on every row (train and test together) and show the
# final coefficient for each predictor. fit() returns the estimator itself,
# so ridge4 is updated in place exactly as before.
# NOTE(review): after this cell ridge4 has seen the test rows, so any later
# score computed on X_test with it would be optimistic.
pd.Series(data=ridge4.fit(X, Y).coef_, index=X.columns)

AXIS        0.595938
ICICI       0.267448
KOTAK       0.099346
SBI        -0.152092
UBI        -0.322681
PNB        -0.150249
INDUSIND    0.256057
BANDHAN     0.189765
RBL         1.033953
IDBI        4.560812
CANARA     -0.275945
BOI        -0.407420
INDIAN     -1.210915
UCO         4.845295
dtype: float64