In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
hitters_path = "../../Data/Hitters.csv"
hitters = pd.read_csv(hitters_path)
hitters.head()

Unnamed: 0.1,Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,...,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
0,-Andy Allanson,293,66,1,30,29,14,1,293,66,...,30,29,14,A,E,446,33,20,,A
1,-Alan Ashby,315,81,7,24,38,39,14,3449,835,...,321,414,375,N,W,632,43,10,475.0,N
2,-Alvin Davis,479,130,18,66,72,76,3,1624,457,...,224,266,263,A,W,880,82,14,480.0,A
3,-Andre Dawson,496,141,20,65,78,37,11,5628,1575,...,828,838,354,N,E,200,11,3,500.0,N
4,-Andres Galarraga,321,87,10,39,42,30,2,396,101,...,48,46,33,N,E,805,40,4,91.5,N


In [3]:
hitters.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 322 entries, 0 to 321
Data columns (total 21 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  322 non-null    object 
 1   AtBat       322 non-null    int64  
 2   Hits        322 non-null    int64  
 3   HmRun       322 non-null    int64  
 4   Runs        322 non-null    int64  
 5   RBI         322 non-null    int64  
 6   Walks       322 non-null    int64  
 7   Years       322 non-null    int64  
 8   CAtBat      322 non-null    int64  
 9   CHits       322 non-null    int64  
 10  CHmRun      322 non-null    int64  
 11  CRuns       322 non-null    int64  
 12  CRBI        322 non-null    int64  
 13  CWalks      322 non-null    int64  
 14  League      322 non-null    object 
 15  Division    322 non-null    object 
 16  PutOuts     322 non-null    int64  
 17  Assists     322 non-null    int64  
 18  Errors      322 non-null    int64  
 19  Salary      263 non-null    f

In [4]:
hitters = hitters.dropna(subset=["Salary"])
hitters.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 263 entries, 1 to 321
Data columns (total 21 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  263 non-null    object 
 1   AtBat       263 non-null    int64  
 2   Hits        263 non-null    int64  
 3   HmRun       263 non-null    int64  
 4   Runs        263 non-null    int64  
 5   RBI         263 non-null    int64  
 6   Walks       263 non-null    int64  
 7   Years       263 non-null    int64  
 8   CAtBat      263 non-null    int64  
 9   CHits       263 non-null    int64  
 10  CHmRun      263 non-null    int64  
 11  CRuns       263 non-null    int64  
 12  CRBI        263 non-null    int64  
 13  CWalks      263 non-null    int64  
 14  League      263 non-null    object 
 15  Division    263 non-null    object 
 16  PutOuts     263 non-null    int64  
 17  Assists     263 non-null    int64  
 18  Errors      263 non-null    int64  
 19  Salary      263 non-null    f

In [5]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

numerical_pipeline = Pipeline([("standardization",StandardScaler())])
numerical_attributes = hitters.drop(["Unnamed: 0","Salary","League","Division","NewLeague"],axis=1).columns

categorical_pipeline = Pipeline([("encoding",OneHotEncoder())])
categorical_attributes = ["League","Division","NewLeague"]

transformer = ColumnTransformer([("numerical_pipeline",numerical_pipeline,numerical_attributes),
                                 ("categorical_pipeline",categorical_pipeline,categorical_attributes)])

X = pd.DataFrame(transformer.fit_transform(hitters))
Y = hitters["Salary"]

### Ridge Regression

In [6]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

rr = Ridge(alpha=1)
rr.fit(X,Y)
predictions = rr.predict(X)
MSE = mean_squared_error(Y,predictions)

print(rr.coef_,rr.intercept_)
print(np.sqrt(MSE))

[-270.6864407   296.64505003   18.10059158  -29.33940613   -9.11329453
  124.40717273  -38.66774782 -225.40654798  126.65960655   39.07092364
  320.41216891  160.38678418 -184.4236106    78.62365619   47.46259711
  -23.72419031  -30.43885531   30.43885531   60.01559493  -60.01559493
   13.11128155  -13.11128155] 537.8557165586524
304.1134680271562


In [7]:
rr = Ridge(alpha=100000)
rr.fit(X,Y)
predictions = rr.predict(X)
MSE = mean_squared_error(Y,predictions)

print(rr.coef_,rr.intercept_)
print(np.sqrt(MSE))

[ 0.45891368  0.5110038   0.39793418  0.48866341  0.52250579  0.51692086
  0.46434919  0.61088812  0.63780925  0.60979162  0.6538173   0.65880874
  0.56838978  0.35201964  0.02975743 -0.00658569  0.00757828 -0.00757828
  0.1133501  -0.1133501   0.00107361 -0.00107361] 535.9275272918461
446.7417261255457


In [8]:
from sklearn.model_selection import GridSearchCV

param_grid=[{"alpha":[0,1,5,10,100,1000,10000]}]

grid_search_rr=GridSearchCV(rr,param_grid,cv=5,scoring="neg_mean_squared_error",return_train_score=True)
grid_search_rr.fit(X,Y)

GridSearchCV(cv=5, estimator=Ridge(alpha=100000),
             param_grid=[{'alpha': [0, 1, 5, 10, 100, 1000, 10000]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [9]:
cvres=grid_search_rr.cv_results_
for mean_score,params in zip(cvres["mean_test_score"],cvres["params"]):
    print(np.sqrt(-mean_score),params)

354.3490022444641 {'alpha': 0}
345.1971917486754 {'alpha': 1}
344.9896460955026 {'alpha': 5}
345.39609937710446 {'alpha': 10}
347.0201295880294 {'alpha': 100}
367.5440471737516 {'alpha': 1000}
430.437037444695 {'alpha': 10000}


In [10]:
grid_search_rr.best_estimator_

Ridge(alpha=5)

### Lasso Regression

In [11]:
from sklearn.linear_model import Lasso

lr = Lasso(alpha=1)
lr.fit(X,Y)
predictions = lr.predict(X)
MSE = mean_squared_error(Y,predictions)

print(lr.coef_,lr.intercept_)
print(np.sqrt(MSE))

[-2.82369623e+02  3.04358267e+02  1.11271457e+01 -2.49660478e+01
 -0.00000000e+00  1.20694778e+02 -3.49475072e+01 -1.62644108e+02
  0.00000000e+00  1.42228553e+01  3.75565034e+02  1.92616442e+02
 -1.89643123e+02  7.87602599e+01  4.19966607e+01 -1.84794214e+01
 -3.58260791e+01  1.73472026e-14  1.14412953e+02 -2.17328477e-11
  0.00000000e+00 -0.00000000e+00] 498.74167724144104
304.35888022005037


In [12]:
lr = Lasso(alpha=100000)
lr.fit(X,Y)
predictions = lr.predict(X)
MSE = mean_squared_error(Y,predictions)

print(lr.coef_,lr.intercept_)
print(np.sqrt(MSE))

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. -0.  0. -0.
  0. -0.  0. -0.] 535.9258821292775
450.26022382434286


In [13]:
param_grid=[{"alpha":[0,1,5,10,100,1000,10000]}]

grid_search_lr=GridSearchCV(lr,param_grid,cv=5,scoring="neg_mean_squared_error",return_train_score=True)
grid_search_lr.fit(X,Y)

GridSearchCV(cv=5, estimator=Lasso(alpha=100000),
             param_grid=[{'alpha': [0, 1, 5, 10, 100, 1000, 10000]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [14]:
cvres=grid_search_lr.cv_results_
for mean_score,params in zip(cvres["mean_test_score"],cvres["params"]):
    print(np.sqrt(-mean_score),params)

347.84689608521967 {'alpha': 0}
346.0546151737132 {'alpha': 1}
346.67308944078513 {'alpha': 5}
348.91254391834786 {'alpha': 10}
378.77826258891315 {'alpha': 100}
453.4833738957413 {'alpha': 1000}
453.4833738957413 {'alpha': 10000}


In [15]:
grid_search_rr.best_estimator_

Ridge(alpha=5)