In [None]:
%matplotlib inline

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge,Lasso
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline

In [None]:
# Load the CSV, then keep only the rows that have no missing values.
hitters_csv = 'Hitters_Data.csv'
df = pd.read_csv(hitters_csv)
df = df.dropna()
# Column names, dtypes and non-null counts of the cleaned frame.
df.info()

In [None]:
# Preview the first rows only; df.info() above already reports the overall
# shape and dtypes, so there is no need to render the whole frame.
df.head()

In [None]:
# Distribution of the raw Salary column.
sns.histplot(data=df, x='Salary')

In [None]:
# Distribution of log(Salary), the quantity modelled below.
# np.log keeps the Series name ("Salary"), so without an explicit label the
# x-axis would read as if it showed raw salaries.
ax = sns.histplot(np.log(df.Salary))
ax.set_xlabel('log(Salary)')

In [None]:
# One-hot encode the three categorical columns (one indicator column per level).
categorical_cols = ['League', 'Division', 'NewLeague']
dummies = pd.get_dummies(df[categorical_cols])

In [None]:
# Preview the indicator columns produced by get_dummies (first rows only).
dummies.head()

In [None]:
# Response (dependent variable): the natural log of Salary.
y = np.log(df['Salary'])

# Numeric predictors: drop the response column (Salary) and the three
# categorical columns, which enter the model through dummy variables instead.
X_ = df.drop(columns=['Salary', 'League', 'Division', 'NewLeague']).astype('float64')

# Feature matrix: numeric predictors plus one indicator per categorical column
# (the complementary level of each pair is left out).
indicator_cols = ['League_N', 'Division_W', 'NewLeague_N']
X = pd.concat([X_, dummies[indicator_cols]], axis=1)

X.info()

In [None]:
# Preview the first values of the log-salary response.
y.head()

In [None]:
# Preview the first rows of the feature matrix; X.info() above lists every column.
X.head()

In [None]:
# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [None]:
# Standardise the predictors.
# The scaler is fitted on the training split ONLY; the test split is then
# transformed with those same training statistics. Fitting a second scaler on
# the test split would put the two sets on different scales and leak test
# information into preprocessing.
scaler = preprocessing.StandardScaler()
X_trainStandard = scaler.fit_transform(X_train)
X_testStandard = scaler.transform(X_test)

In [None]:
# scikit-learn exposes the ridge penalty weight (usually written λ) as `alpha`;
# it is the knob used to tune the model.
ridge2 = Ridge(alpha=4).fit(X_trainStandard, y_train)   # fit() returns the fitted estimator
pred2 = ridge2.predict(X_testStandard)                  # predictions for the held-out rows
ridge2_coefs = pd.Series(ridge2.coef_, index=X.columns)
ridge2_test_mse = mean_squared_error(y_test, pred2)
print(ridge2_coefs)       # coefficients labelled by feature
print(ridge2_test_mse)    # test MSE

In [None]:
# Same experiment with a very large penalty (alpha = 10**10).
huge_alpha = 10**10
ridge3 = Ridge(alpha=huge_alpha)
ridge3.fit(X_trainStandard, y_train)
pred3 = ridge3.predict(X_testStandard)
ridge3_coefs = pd.Series(ridge3.coef_, index=X.columns)
print(ridge3_coefs)                          # coefficients labelled by feature
print(mean_squared_error(y_test, pred3))     # test MSE

In [None]:
# With alpha = 0 the penalty term vanishes, i.e. the unpenalised end of the ridge path.
ridge4 = Ridge(alpha=0).fit(X_trainStandard, y_train)
pred = ridge4.predict(X_testStandard)
ridge4_coefs = pd.Series(ridge4.coef_, index=X.columns)
ridge4_test_mse = mean_squared_error(y_test, pred)
print(ridge4_coefs)       # coefficients labelled by feature
print(ridge4_test_mse)    # test MSE

In [None]:
# Penalty grid: 100 log-spaced values running from 0.5 * 10**5 down to
# 0.5 * 10**-2, i.e. from a very heavy penalty (close to the intercept-only
# null model) to a very light one (close to the least squares fit).
alphas = 0.5 * np.logspace(5, -2, num=100)
alphas

In [None]:
# Refit one ridge estimator at every penalty in `alphas` and record the
# coefficient vector of each fit (one row per alpha, one column per feature).
ridge = Ridge()
coefs = [
    # set_params() and fit() both return the estimator, so the calls chain.
    ridge.set_params(alpha=a).fit(X_trainStandard, y_train).coef_
    for a in alphas
]
np.shape(coefs)

In [None]:
# Ridge coefficient paths: one line per feature, penalty on a log-scaled x-axis.
fig, ax = plt.subplots()
ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.axis('tight')
ax.set_xlabel('alpha')
ax.set_ylabel('weights')

In [None]:
# 10-fold cross-validation of ridge regression over a log-spaced penalty grid.
# The scaler lives inside the pipeline, so within every fold it is fitted on
# that fold's training portion only.
alphas = 10**np.linspace(5,-2,100)*0.5

# One splitter shared by every candidate: with shuffle=True and a fixed integer
# random_state each alpha is scored on exactly the same folds.
cv_folds = KFold(n_splits=10, shuffle=True, random_state=1)

scoresCV = []
for alpha in alphas:
    RidgeReg = make_pipeline(preprocessing.StandardScaler(), Ridge(alpha=alpha))
    scoreCV = cross_val_score(RidgeReg, X_train, y_train,
                              scoring='neg_mean_squared_error', cv=cv_folds)
    # The scorer returns negated MSEs; flip the sign to store the mean validation MSE.
    scoresCV.append([alpha, -1*np.mean(scoreCV)])

# NOTE(review): this rebinds `df`, which until now held the data loaded from
# Hitters_Data.csv. Earlier cells that read `df` (e.g. df.Salary) will fail if
# re-run after this one; later cells expect the CV table under this name.
df = pd.DataFrame(scoresCV,columns=['Lambda','Validation Error'])

In [None]:
# CV table ordered from lowest to highest validation error (best penalty first).
print(df.sort_values(by='Validation Error'))

In [None]:
# Refit ridge on the full training split with the penalty that minimised the
# cross-validation error. `df` is the Lambda / Validation Error table built by
# the ridge CV cell; reading the minimiser from it avoids a hand-copied
# constant (previously hard-coded as 121.006413, presumably taken from that
# table) going stale when the data or the split changes.
best_ridge_alpha = df.loc[df['Validation Error'].idxmin(), 'Lambda']
ridge5 = Ridge(alpha = best_ridge_alpha)
ridge5.fit(X_trainStandard, y_train)
# Test MSE of the CV-selected model.
mean_squared_error(y_test, ridge5.predict(X_testStandard))

In [None]:
# Coefficients of the ridge5 model, labelled by feature name.
pd.Series(data=ridge5.coef_, index=X.columns)

In [None]:
# Lasso with a heavy penalty (alpha = 100); max_iter is raised to 10000 for the solver.
lasso2 = Lasso(alpha=100, max_iter=10000).fit(X_trainStandard, y_train)
pred2 = lasso2.predict(X_testStandard)                   # predictions for the held-out rows
lasso2_coefs = pd.Series(lasso2.coef_, index=X.columns)
lasso2_test_mse = mean_squared_error(y_test, pred2)
print(lasso2_coefs)       # coefficients labelled by feature
print(lasso2_test_mse)    # test MSE

In [None]:
# Lasso with a much lighter penalty (alpha = 0.1).
light_alpha = 0.1
lasso3 = Lasso(alpha=light_alpha, max_iter=10000)
lasso3.fit(X_trainStandard, y_train)
pred3 = lasso3.predict(X_testStandard)
lasso3_coefs = pd.Series(lasso3.coef_, index=X.columns)
print(lasso3_coefs)                          # coefficients labelled by feature
print(mean_squared_error(y_test, pred3))     # test MSE

In [None]:
# Lasso coefficient paths over a log-spaced penalty grid
# (100 values from 0.5 * 10**1 down to 0.5 * 10**-2).
alphas = 10**np.linspace(1,-2,100)*0.5
lasso = Lasso(max_iter = 10000)
coefs = []
for a in alphas:
    lasso.set_params(alpha=a)
    lasso.fit(X_trainStandard, y_train)
    coefs.append(lasso.coef_)

ax = plt.gca()
# Plot against the penalties that were actually fitted. The x-values used to
# be alphas*2, which shifted every curve by a factor of two relative to the
# 'alpha' axis label and to the ridge path plot.
ax.plot(alphas, coefs)
ax.set_xscale('log')
plt.axis('tight')
plt.xlabel('alpha')
plt.ylabel('weights')

In [None]:
# 10-fold cross-validation of the lasso over a log-spaced penalty grid.
# The scaler lives inside the pipeline, so within every fold it is fitted on
# that fold's training portion only.
alphas = 10**np.linspace(1,-2,100)*0.5

# One splitter shared by every candidate: with shuffle=True and a fixed integer
# random_state each alpha is scored on exactly the same folds.
cv_folds = KFold(n_splits=10, shuffle=True, random_state=1)

scoresCV = []
for alpha in alphas:
    lassoReg = make_pipeline(preprocessing.StandardScaler(), Lasso(alpha=alpha,max_iter=10000))
    scoreCV = cross_val_score(lassoReg, X_train, y_train,
                              scoring='neg_mean_squared_error', cv=cv_folds)
    # The scorer returns negated MSEs; flip the sign to store the mean validation MSE.
    scoresCV.append([alpha, -1*np.mean(scoreCV)])

# NOTE(review): `df` is rebound to the lasso CV table here; whatever it held
# before this cell is no longer reachable under that name.
df = pd.DataFrame(scoresCV,columns=['Lambda','Validation Error'])
# Best penalty first.
print(df.sort_values(by='Validation Error'))

In [None]:
# Validation error as a function of the penalty, read from the CV table in `df`.
# The grid is log-spaced, so a log x-axis spreads the points evenly instead of
# squeezing most of them against the left edge.
fig, ax = plt.subplots()
ax.plot(df['Lambda'], df['Validation Error'])
ax.set_xscale('log')
ax.set_xlabel('alpha (Lambda)')
ax.set_ylabel('Validation Error (mean CV MSE)')
ax.set_title('Cross-validation error vs. penalty')

In [None]:
# Refit the lasso on the full training split with the penalty that minimised
# the cross-validation error. `df` is the Lambda / Validation Error table built
# by the lasso CV cell; reading the minimiser from it avoids a hand-copied
# constant (previously hard-coded as 0.017556, presumably taken from that
# table) going stale when the data or the split changes.
best_lasso_alpha = df.loc[df['Validation Error'].idxmin(), 'Lambda']
lasso = Lasso(alpha = best_lasso_alpha,max_iter=10000)
lasso.fit(X_trainStandard, y_train)
# Test MSE of the CV-selected model.
mean_squared_error(y_test, lasso.predict(X_testStandard))

In [None]:
# Coefficients of the final lasso model, labelled by feature name.
pd.Series(data=lasso.coef_, index=X.columns)