In [46]:
import sklearn
import numpy as np
import pandas as pd

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)
pd.set_option("display.precision", 3)

import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.metrics import r2_score

In [12]:
diabetes = datasets.load_diabetes()

In [17]:
diabetes

{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
          0.01990842, -0.01764613],
        [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
         -0.06832974, -0.09220405],
        [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
          0.00286377, -0.02593034],
        ...,
        [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
         -0.04687948,  0.01549073],
        [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
          0.04452837, -0.02593034],
        [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
         -0.00421986,  0.00306441]]),
 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
         69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
         68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
         87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
        259.,  53., 190., 142.,  75., 142., 155., 225.,  59

In [18]:
diabetes["feature_names"]

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [26]:
columns = diabetes["feature_names"]

In [27]:
columns

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [23]:
diabetes["data"]

array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
         0.01990842, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
        -0.06832974, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
         0.00286377, -0.02593034],
       ...,
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04687948,  0.01549073],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
         0.04452837, -0.02593034],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
        -0.00421986,  0.00306441]])

In [24]:
df = pd.DataFrame.from_records(diabetes["data"])

In [29]:
df.columns = columns

In [30]:
df

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038,0.051,0.062,0.022,-0.044,-0.035,-0.043,-0.003,0.020,-0.018
1,-0.002,-0.045,-0.051,-0.026,-0.008,-0.019,0.074,-0.039,-0.068,-0.092
2,0.085,0.051,0.044,-0.006,-0.046,-0.034,-0.032,-0.003,0.003,-0.026
3,-0.089,-0.045,-0.012,-0.037,0.012,0.025,-0.036,0.034,0.023,-0.009
4,0.005,-0.045,-0.036,0.022,0.004,0.016,0.008,-0.003,-0.032,-0.047
...,...,...,...,...,...,...,...,...,...,...
437,0.042,0.051,0.020,0.060,-0.006,-0.003,-0.029,-0.003,0.031,0.007
438,-0.006,0.051,-0.016,-0.068,0.049,0.079,-0.029,0.034,-0.018,0.044
439,0.042,0.051,-0.016,0.017,-0.037,-0.014,-0.025,-0.011,-0.047,0.015
440,-0.045,-0.045,0.039,0.001,0.016,0.015,-0.029,0.027,0.045,-0.026


In [31]:
df["target"] = diabetes["target"]

In [32]:
df

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038,0.051,0.062,0.022,-0.044,-0.035,-0.043,-0.003,0.020,-0.018,151.0
1,-0.002,-0.045,-0.051,-0.026,-0.008,-0.019,0.074,-0.039,-0.068,-0.092,75.0
2,0.085,0.051,0.044,-0.006,-0.046,-0.034,-0.032,-0.003,0.003,-0.026,141.0
3,-0.089,-0.045,-0.012,-0.037,0.012,0.025,-0.036,0.034,0.023,-0.009,206.0
4,0.005,-0.045,-0.036,0.022,0.004,0.016,0.008,-0.003,-0.032,-0.047,135.0
...,...,...,...,...,...,...,...,...,...,...,...
437,0.042,0.051,0.020,0.060,-0.006,-0.003,-0.029,-0.003,0.031,0.007,178.0
438,-0.006,0.051,-0.016,-0.068,0.049,0.079,-0.029,0.034,-0.018,0.044,104.0
439,0.042,0.051,-0.016,0.017,-0.037,-0.014,-0.025,-0.011,-0.047,0.015,132.0
440,-0.045,-0.045,0.039,0.001,0.016,0.015,-0.029,0.027,0.045,-0.026,220.0


In [35]:
# define model
lm1 = smf.ols("target ~ age + sex + bmi + bp + s1 + s2 + s3 + s4 + s5 + s6", data=df)

# fit the model 
fit1 = lm1.fit()

# print stats
fit1.summary()

0,1,2,3
Dep. Variable:,target,R-squared:,0.518
Model:,OLS,Adj. R-squared:,0.507
Method:,Least Squares,F-statistic:,46.27
Date:,"Wed, 15 Jul 2020",Prob (F-statistic):,3.8299999999999998e-62
Time:,09:24:38,Log-Likelihood:,-2386.0
No. Observations:,442,AIC:,4794.0
Df Residuals:,431,BIC:,4839.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,152.1335,2.576,59.061,0.000,147.071,157.196
age,-10.0122,59.749,-0.168,0.867,-127.448,107.424
sex,-239.8191,61.222,-3.917,0.000,-360.151,-119.488
bmi,519.8398,66.534,7.813,0.000,389.069,650.610
bp,324.3904,65.422,4.958,0.000,195.805,452.976
s1,-792.1842,416.684,-1.901,0.058,-1611.169,26.801
s2,476.7458,339.035,1.406,0.160,-189.621,1143.113
s3,101.0446,212.533,0.475,0.635,-316.685,518.774
s4,177.0642,161.476,1.097,0.273,-140.313,494.442

0,1,2,3
Omnibus:,1.506,Durbin-Watson:,2.029
Prob(Omnibus):,0.471,Jarque-Bera (JB):,1.404
Skew:,0.017,Prob(JB):,0.496
Kurtosis:,2.726,Cond. No.,227.0


In [94]:
X = df.iloc[:, 0:9]
y = df.iloc[:, -1:]

X, y = np.array(X), np.array(y)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_lasso_r2s = []

In [97]:
for alpha in (1, 50, 100, 100, .005):
    cv_lasso_r2s = []
    
    for train_ind, val_ind in kf.split(X,y):

        X_train, y_train = X[train_ind], y[train_ind]
        X_val, y_val = X[val_ind], y[val_ind]

        lasso_model = Lasso(alpha = alpha)
        lasso_model.fit(X_train, y_train)
        cv_lasso_r2s.append(lasso_model.score(X_val, y_val))

    print("alpha: ", alpha, ", simple regression scores: ", cv_lasso_r2s)



alpha:  1 , simple regression scores:  [0.3575929204237066, 0.34424228519388844, 0.28902373012684723, 0.3964900248762879, 0.2753795502486067]
alpha:  50 , simple regression scores:  [-0.011962984778542074, -0.026977525334391794, -0.015303973538954851, -0.00021232908102475356, -0.04231597698524037]
alpha:  100 , simple regression scores:  [-0.011962984778542074, -0.026977525334391794, -0.015303973538954851, -0.00021232908102475356, -0.04231597698524037]
alpha:  100 , simple regression scores:  [-0.011962984778542074, -0.026977525334391794, -0.015303973538954851, -0.00021232908102475356, -0.04231597698524037]
alpha:  0.005 , simple regression scores:  [0.45253659345332264, 0.5669695614024161, 0.3897935394321954, 0.584164223622277, 0.3968508714781406]


In [78]:
from sklearn.model_selection import cross_val_score
lm = LinearRegression()

cross_val_score(lm, X, y, cv=5, scoring="r2")

array([0.4409735 , 0.5248217 , 0.47986219, 0.44452649, 0.55229566])

In [59]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cross_val_score(lm, X, y, cv=kf, scoring="r2")

array([0.44938019, 0.56995183, 0.38849081, 0.58604807, 0.39226456])