In [27]:
import numpy as np
from sklearn.metrics import r2_score
from sklearn.datasets import load_boston
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import StandardScaler

## Preparing and Analyzing Data

In [28]:
X,y =load_boston(return_X_y=True)

df = pd.DataFrame(X,columns = load_boston().feature_names)
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [29]:
df.isna().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
dtype: int64

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
dtypes: float64(13)
memory usage: 51.5 KB


In [31]:
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97


In [32]:
z = np.abs(stats.zscore(df))
z

array([[0.41978194, 0.28482986, 1.2879095 , ..., 1.45900038, 0.44105193,
        1.0755623 ],
       [0.41733926, 0.48772236, 0.59338101, ..., 0.30309415, 0.44105193,
        0.49243937],
       [0.41734159, 0.48772236, 0.59338101, ..., 0.30309415, 0.39642699,
        1.2087274 ],
       ...,
       [0.41344658, 0.48772236, 0.11573841, ..., 1.17646583, 0.44105193,
        0.98304761],
       [0.40776407, 0.48772236, 0.11573841, ..., 1.17646583, 0.4032249 ,
        0.86530163],
       [0.41500016, 0.48772236, 0.11573841, ..., 1.17646583, 0.44105193,
        0.66905833]])

In [33]:
outliers = list(set(np.where(z > 3)[0]))

len(outliers)

91

In [34]:
new_df = df.drop(outliers,axis = 0).reset_index(drop = False)
new_df

Unnamed: 0,index,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98
1,1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14
2,2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
410,501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67
411,502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08
412,503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64
413,504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48


In [35]:
y_new = y[list(new_df["index"])].reshape(-1, 1)
X_new = new_df.drop('index', axis = 1)
best = {}
X_new

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
410,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67
411,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08
412,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64
413,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48


## Training and Scaling Data With 0.7 for train and 0.3 for test

In [36]:
ss = StandardScaler()
X_new = ss.fit_transform(X_new)
ss1 = StandardScaler()
y_new = ss1.fit_transform(y_new)

X_train, X_test, y_train, y_test = train_test_split(X_new, y_new, test_size=0.3, random_state=50)
X_train


array([[-0.47469258, -0.48858069, -0.12071229, ..., -0.312853  ,
        -0.69109006,  0.47844775],
       [-0.32954618, -0.48858069, -0.39610039, ...,  1.18595212,
        -1.37450603,  0.65473288],
       [-0.45812656, -0.48858069, -0.33424851, ...,  1.13911446,
         0.39274828,  0.18000012],
       ...,
       [-0.45597786, -0.48858069, -0.12071229, ..., -0.312853  ,
         0.43991388,  0.74442181],
       [ 0.66225011, -0.48858069,  1.07067259, ...,  0.81125084,
         0.35800489,  0.41813968],
       [-0.47567187,  1.80106659, -1.08825229, ..., -1.53063215,
         0.16701333, -1.19316825]])

## Trying Ridge and Lasso when alpha = 10

In [37]:
rr = Ridge(alpha=10)
lr = Lasso(alpha=10)
rr.fit(X_train, y_train)
lr.fit(X_train, y_train)
print(f'Ridge Coefs When Alpha is 10 = {rr.coef_}')
print("\n")
print(f'Lasso Coefs When Alpha is 10 = {lr.coef_}')

Ridge Coefs When Alpha is 10 = [[-0.02799425  0.037122   -0.00327619  0.         -0.18194989  0.34867
  -0.04613831 -0.29158584  0.17902969 -0.18638039 -0.2353079   0.03404087
  -0.36256829]]


Lasso Coefs When Alpha is 10 = [-0.  0. -0.  0. -0.  0. -0.  0. -0. -0. -0.  0. -0.]


## Scoring when alpha = 10

In [38]:
print("*****************************************************")
print(f'Score for Ridge Train is = {rr.score(X_train, y_train)}')
print(f'Score for Ridge Test is = {rr.score(X_test, y_test)}')
print("*****************************************************")
print(f'Score for Lasso Train is = {lr.score(X_train, y_train)}')
print(f'Score for Lasso Test is = {lr.score(X_test, y_test)}')
best[101] = rr.score(X_test, y_test)
best[102] =  lr.score(X_test, y_test)
best

*****************************************************
Score for Ridge Train is = 0.7248646301954165
Score for Ridge Test is = 0.7781417577661975
*****************************************************
Score for Lasso Train is = 0.0
Score for Lasso Test is = -0.0008355246529270222


{101: 0.7781417577661975, 102: -0.0008355246529270222}

### Inserted the test results to the best dict to select best performing model. 101 is when alpha is 10. if it's Ridge then key finishes with 1, if it's Lasso then key finishes with 2

## Trying Ridge and Lasso when alpha = 0.8

In [39]:
rr1 = Ridge(alpha=0.8)
lr1 = Lasso(alpha=0.8)
rr1.fit(X_train, y_train)
lr1.fit(X_train, y_train)
print(f'Ridge Coefs When Alpha is 0.8 = {rr1.coef_}')
print("\n")
print(f'Lasso Coefs When Alpha is 0.8 = {lr1.coef_}')

Ridge Coefs When Alpha is 0.8 = [[-0.0405517   0.04663998  0.02184457  0.         -0.22207503  0.34621886
  -0.04281225 -0.32944684  0.24910734 -0.24762037 -0.24913127  0.02997429
  -0.37204364]]


Lasso Coefs When Alpha is 0.8 = [-0.  0. -0.  0. -0.  0. -0.  0. -0. -0. -0.  0. -0.]


## Scoring when alpha = 0.8

In [40]:
print("*****************************************************")
print(f'Score for Ridge Train is = {rr1.score(X_train, y_train)}')
print(f'Score for Ridge Test is = {rr1.score(X_test, y_test)}')
print("*****************************************************")
print(f'Score for Lasso Train is = {lr1.score(X_train, y_train)}')
print(f'Score for Lasso Test is = {lr1.score(X_test, y_test)}')
best[0.81] = rr1.score(X_test, y_test)
best[0.82] =  lr1.score(X_test, y_test)
best

*****************************************************
Score for Ridge Train is = 0.7266739561065114
Score for Ridge Test is = 0.7753942795925424
*****************************************************
Score for Lasso Train is = 0.0
Score for Lasso Test is = -0.0008355246529270222


{101: 0.7781417577661975,
 102: -0.0008355246529270222,
 0.81: 0.7753942795925424,
 0.82: -0.0008355246529270222}

## Trying Ridge and Lasso When alpha = 0.1

In [41]:
rr2 = Ridge(alpha=0.1)
lr2 = Lasso(alpha=0.1)
rr2.fit(X_train, y_train)
lr2.fit(X_train, y_train)
print(f'Ridge Coefs When Alpha is 0.1 = {rr2.coef_}')
print("\n")
print(f'Lasso Coefs When Alpha is 0.1 = {lr2.coef_}')

Ridge Coefs When Alpha is 0.1 = [[-0.04211104  0.04770185  0.02454212  0.         -0.22599133  0.34578512
  -0.04239688 -0.33284546  0.25680415 -0.25440831 -0.25042252  0.02955332
  -0.37268599]]


Lasso Coefs When Alpha is 0.1 = [-0.          0.         -0.          0.         -0.          0.32679515
 -0.         -0.         -0.         -0.00124485 -0.15087986  0.
 -0.37689669]


## Scoring when alpha = 0.1

In [42]:
print("*****************************************************")
print(f'Score for Ridge Train is = {rr2.score(X_train, y_train)}')
print(f'Score for Ridge Test is = {rr2.score(X_test, y_test)}')
print("*****************************************************")
print(f'Score for Lasso Train is = {lr2.score(X_train, y_train)}')
print(f'Score for Lasso Test is = {lr2.score(X_test, y_test)}')
best[0.11] = rr2.score(X_test, y_test)
best[0.12] =  lr2.score(X_test, y_test)
best

*****************************************************
Score for Ridge Train is = 0.7266927361493105
Score for Ridge Test is = 0.7749808044133962
*****************************************************
Score for Lasso Train is = 0.6646347982182961
Score for Lasso Test is = 0.7363859820456125


{101: 0.7781417577661975,
 102: -0.0008355246529270222,
 0.81: 0.7753942795925424,
 0.82: -0.0008355246529270222,
 0.11: 0.7749808044133962,
 0.12: 0.7363859820456125}

## Trying Ridge and Lasso When alpha = 0.01

In [43]:
rr3 = Ridge(alpha=0.01)
lr3 = Lasso(alpha=0.01)
rr3.fit(X_train, y_train)
lr3.fit(X_train, y_train)
print(f'Ridge Coefs When Alpha is 0.01 = {rr3.coef_}')
print("\n")
print(f'Lasso Coefs When Alpha is 0.01 = {lr3.coef_}')

Ridge Coefs When Alpha is 0.01 = [[-0.04232063  0.04784328  0.02490055  0.         -0.2265065   0.34572585
  -0.04234107 -0.33328897  0.25782895 -0.25531293 -0.25059186  0.02949764
  -0.37276721]]


Lasso Coefs When Alpha is 0.01 = [-0.          0.01896854 -0.          0.         -0.17493158  0.35367807
 -0.02590195 -0.26304515  0.12230728 -0.14880992 -0.23427992  0.02717521
 -0.3864512 ]


## Scoring when alpha = 0.01

In [44]:
print("*****************************************************")
print(f'Score for Ridge Train is = {rr3.score(X_train, y_train)}')
print(f'Score for Ridge Test is = {rr3.score(X_test, y_test)}')
print("*****************************************************")
print(f'Score for Lasso Train is = {lr3.score(X_train, y_train)}')
print(f'Score for Lasso Test is = {lr3.score(X_test, y_test)}')
best[0.011] = rr3.score(X_test, y_test)
best[0.012] =  lr3.score(X_test, y_test)
best

*****************************************************
Score for Ridge Train is = 0.7266930456977404
Score for Ridge Test is = 0.774924541109321
*****************************************************
Score for Lasso Train is = 0.7226188329924625
Score for Lasso Test is = 0.7770907585569744


{101: 0.7781417577661975,
 102: -0.0008355246529270222,
 0.81: 0.7753942795925424,
 0.82: -0.0008355246529270222,
 0.11: 0.7749808044133962,
 0.12: 0.7363859820456125,
 0.011: 0.774924541109321,
 0.012: 0.7770907585569744}

## Trying Ridge and Lasso When alpha = 0.001

In [45]:
rr4 = Ridge(alpha=0.001)
lr4 = Lasso(alpha=0.001)
rr4.fit(X_train, y_train)
lr4.fit(X_train, y_train)
print(f'Ridge Coefs When Alpha is 0.001 = {rr4.coef_}')
print("\n")
print(f'Lasso Coefs When Alpha is 0.001 = {lr4.coef_}')

Ridge Coefs When Alpha is 0.001 = [[-0.04234171  0.04785748  0.02493655  0.         -0.22655817  0.34571987
  -0.04233546 -0.33333341  0.25793189 -0.25540381 -0.25060884  0.02949205
  -0.37277532]]


Lasso Coefs When Alpha is 0.001 = [-0.03687154  0.04436282  0.01719586  0.         -0.21991734  0.34624075
 -0.04088659 -0.32712321  0.24166504 -0.24135221 -0.24844521  0.02928402
 -0.37393591]


## Scoring when alpha = 0.001

In [47]:
print("*****************************************************")
print(f'Score for Ridge Train is = {rr4.score(X_train, y_train)}')
print(f'Score for Ridge Test is = {rr4.score(X_test, y_test)}')
print("*****************************************************")
print(f'Score for Lasso Train is = {lr4.score(X_train, y_train)}')
print(f'Score for Lasso Test is = {lr4.score(X_test, y_test)}')
best[0.0011] = rr4.score(X_test, y_test)
best[0.0012] =  lr4.score(X_test, y_test)
best

*****************************************************
Score for Ridge Train is = 0.726693048812595
Score for Ridge Test is = 0.7749188743363429
*****************************************************
Score for Lasso Train is = 0.7266317856443787
Score for Lasso Test is = 0.7753785776074973


{101: 0.7781417577661975,
 102: -0.0008355246529270222,
 0.81: 0.7753942795925424,
 0.82: -0.0008355246529270222,
 0.11: 0.7749808044133962,
 0.12: 0.7363859820456125,
 0.011: 0.774924541109321,
 0.012: 0.7770907585569744,
 0.0011: 0.7749188743363429,
 0.0012: 0.7753785776074973}

In [48]:
print(f'Best Performing Model without Outliers is = {max(best, key=best.get)} with the value of {max(best.values())}')

Best Performing Model without Outliers is = 101 with the value of 0.7781417577661975


### So the best performing model is when alpha = 10 using Ridge Regression
