# Ridge Regression from scratch

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Load the teams dataset from a relative CSV path.
teams = pd.read_csv("teams.csv")

In [3]:
# Preview the loaded table (pandas truncates the display of large frames).
teams

Unnamed: 0,team,year,athletes,events,age,height,weight,prev_medals,medals
0,AFG,1964,8,8,22.0,161.0,64.2,0.0,0
1,AFG,1968,5,5,23.2,170.2,70.0,0.0,0
2,AFG,1972,8,8,29.0,168.3,63.8,0.0,0
3,AFG,1980,11,11,23.6,168.4,63.2,0.0,0
4,AFG,2004,5,5,18.6,170.8,64.8,0.0,0
...,...,...,...,...,...,...,...,...,...
2009,ZIM,2000,26,19,25.0,179.0,71.1,0.0,0
2010,ZIM,2004,14,11,25.1,177.8,70.5,0.0,3
2011,ZIM,2008,16,15,26.1,171.9,63.7,3.0,4
2012,ZIM,2012,9,8,27.3,174.4,65.2,4.0,0


In [4]:
# Hold out 20% of the rows as a test set. Fixing random_state makes the
# shuffle deterministic, so every run produces the same split.
train, test = train_test_split(
    teams,
    test_size=0.2,
    random_state=1,
)

In [7]:
# Columns used as model inputs.
predictors = ["athletes", "events"]
# A plain column name (not a one-element list) so that train[target] is a
# Series; the Series/1-D outputs shown for y, the coefficients and the
# predictions in this notebook rely on that.
target = "medals"

In [8]:
# Independent copy of the training predictors, so later edits never touch `train`.
X = train.loc[:, predictors].copy()

In [9]:
# Independent copy of the training target.
y = train.loc[:, target].copy()

In [10]:
# Inspect the raw (unscaled) training predictors.
X

Unnamed: 0,athletes,events
1322,6,6
1872,119,80
953,4,4
1117,2,2
1993,43,25
...,...,...
1791,40,25
1096,36,23
1932,719,245
235,13,11


In [11]:
# Inspect the training target values.
y

1322      0
1872      5
953       0
1117      0
1993      0
       ... 
1791      1
1096      1
1932    264
235       0
1061      3
Name: medals, Length: 1611, dtype: int64

In [12]:
# Standardize the predictors: compute each column's mean and standard deviation
# from the raw training data. Reading them from train[predictors] rather than
# from X keeps this cell safe to re-run after X has been rescaled or extended.
x_mean = train[predictors].mean()
x_std = train[predictors].std()

In [16]:
# Per-column means of the training predictors, used for standardization.
x_mean

athletes    74.409063
events      35.990068
dtype: float64

In [17]:
# Per-column standard deviations of the training predictors, used for standardization.
x_std

athletes    127.250043
events       48.978737
dtype: float64

In [18]:
# Standardize from the raw training columns instead of from X itself, so that
# re-running this cell never standardizes already-standardized values.
X = (train[predictors] - x_mean) / x_std

In [19]:
# Sanity check: after standardization each column should have mean ~0 and std 1.
X.describe()

Unnamed: 0,athletes,events
count,1611.0,1611.0
mean,-8.269818e-18,-9.923781e-18
std,1.0,1.0
min,-0.5768883,-0.714393
25%,-0.5297371,-0.6123079
50%,-0.4197174,-0.4489717
75%,-0.02679027,0.183956
max,6.008571,4.634867


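After standardization, each column has a mean of approximately 0 (the values of order 1e-18 are floating-point rounding error) and a standard deviation of 1.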

In [20]:
# Constant column of ones so the model can learn an intercept term.
X = X.assign(intercept=1)

In [23]:
# Put the intercept first so the column order lines up with the coefficient
# order (B0, B1, B2) in the matrix multiplication below.
X = X.loc[:, ["intercept", *predictors]]

In [24]:
# Confirm the design-matrix layout: intercept first, then the predictors.
X

Unnamed: 0,intercept,athletes,events
1322,1,-0.537596,-0.612308
1872,1,0.350420,0.898552
953,1,-0.553313,-0.653142
1117,1,-0.569030,-0.693976
1993,1,-0.246829,-0.224384
...,...,...,...
1791,1,-0.270405,-0.224384
1096,1,-0.301839,-0.265219
1932,1,5.065546,4.267361
235,1,-0.482586,-0.510223


In [25]:
# Transpose of the design matrix (one column per training row); it is used in
# the closed-form solution below.
X.T

Unnamed: 0,1322,1872,953,1117,1993,385,1287,1831,0,1159,...,960,847,1669,715,905,1791,1096,1932,235,1061
intercept,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
athletes,-0.537596,0.35042,-0.553313,-0.56903,-0.246829,-0.482586,-0.537596,0.138239,-0.521879,-0.152527,...,-0.199678,-0.160386,-0.529737,-0.529737,-0.341132,-0.270405,-0.301839,5.065546,-0.482586,-0.19182
events,-0.612308,0.898552,-0.653142,-0.693976,-0.224384,-0.571474,-0.612308,0.102288,-0.571474,-0.163133,...,-0.285636,-0.101882,-0.612308,-0.591891,-0.367304,-0.224384,-0.265219,4.267361,-0.510223,0.041037


In [26]:
# Ridge penalty strength. It is called alpha here because `lambda` is a
# reserved Python keyword (it introduces an anonymous function).
alpha = 2
# Identity matrix with one row/column per design-matrix column.
I = np.identity(len(X.columns))

In [27]:
# Identity matrix sized to the number of design-matrix columns.
I

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [28]:
# Zero the top-left entry, which corresponds to the intercept column,
# so that the intercept is not penalised.
I[0, 0] = 0

In [29]:
# The same matrix after zeroing the intercept entry.
I

array([[0., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [31]:
# Penalty matrix: alpha on the diagonal for each predictor, 0 for the intercept.
penalty = I * alpha

In [32]:
# Inspect the penalty matrix that is added to X^T X.
penalty

array([[0., 0., 0.],
       [0., 2., 0.],
       [0., 0., 2.]])

In [43]:
# Closed-form ridge solution B = (X^T X + penalty)^-1 X^T y, obtained by
# solving the linear system (X^T X + penalty) B = X^T y instead of forming the
# inverse explicitly, which is the numerically preferred way to apply an inverse.
gram = (X.T @ X).to_numpy() + penalty
xty = X.T @ y  # pandas object labelled by the columns of X
B = xty.copy()
B.iloc[:] = np.linalg.solve(gram, xty.to_numpy())  # keep the labels, replace the values

In [51]:
# Fitted ridge coefficients.
B

intercept    10.691496
athletes     61.857734
events      -34.632920
dtype: float64

In [52]:
# Label the coefficients with the design-matrix column names instead of a
# hand-typed list, so the labels cannot drift out of sync with X.
B.index = list(X.columns)

The values above are the fitted coefficients: B0 is the intercept, and B1 and B2 are the slopes for `athletes` and `events`, respectively.

In [53]:
# Standardize the test predictors with the mean and standard deviation of the
# TRAINING set, since the test set's statistics are usually not known in advance.
test_X = (test[predictors] - x_mean) / x_std

In [54]:
# Add the constant intercept column and move it to the front, mirroring the
# layout of the training design matrix.
test_X = test_X.assign(intercept=1)
test_X = test_X.loc[:, ["intercept", *predictors]]

In [55]:
# Inspect the standardized test design matrix.
test_X

Unnamed: 0,intercept,athletes,events
309,1,-0.553313,-0.653142
285,1,0.594035,1.000637
919,1,-0.144668,0.102288
120,1,0.146098,0.531045
585,1,-0.301839,-0.122299
...,...,...,...
541,1,-0.380425,-0.408138
1863,1,-0.191820,0.143122
622,1,-0.058224,0.388126
1070,1,-0.569030,-0.693976


In [56]:
# Predicted medals: each test row's dot product with the fitted coefficients.
predictions = test_X.dot(B)

In [57]:
# Inspect the from-scratch predictions for the test set.
predictions

309     -0.914959
285     12.782156
919     -1.799893
120      1.337116
585     -3.744014
          ...    
541      1.294285
1863    -6.130765
622     -6.352080
1070    -0.472980
1196    -0.914959
Length: 403, dtype: float64

### Comparison with sklearn

In [58]:
from sklearn.linear_model import Ridge

In [59]:
# Use the same penalty strength as the manual fit so the results are directly comparable.
ridge = Ridge(alpha=alpha)

In [60]:
# Fit on the standardized predictors only; the manual intercept column is left
# out, and sklearn reports its own intercept via ridge.intercept_.
ridge.fit(X[predictors], y)

In [61]:
# sklearn's slope coefficients, to compare with the athletes/events entries of B.
ridge.coef_

array([ 61.85773366, -34.63292036])

In [62]:
# sklearn's intercept, to compare with the intercept entry of B.
ridge.intercept_

10.691495965238982

In [63]:
# Predict with sklearn on the same standardized test predictors (no intercept column).
sklearn_predictions = ridge.predict(test_X.loc[:, predictors])

In [65]:
# Element-wise gap between the from-scratch and sklearn predictions.
predictions.sub(sklearn_predictions)

309     9.947598e-14
285    -5.009326e-13
919    -3.232969e-13
120    -4.867218e-13
585    -2.433609e-13
            ...     
541     1.598721e-14
1863   -4.369838e-13
622    -5.719869e-13
1070    1.385558e-13
1196    9.947598e-14
Length: 403, dtype: float64

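As we can see, the differences are on the order of 1e-13, i.e. floating-point rounding error, so our from-scratch implementation reproduces scikit-learn's `Ridge` results. Note that this confirms the implementation is correct; it does not by itself tell us how accurate the predictions are.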