Loading Bibs

In [2]:
import numpy as np
from sklearn import datasets
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import statsmodels.api as sm
from sklearn.feature_selection import RFE
from sklearn.model_selection import RandomizedSearchCV


Loading Dataset

In [3]:
X, Y = datasets.load_diabetes(return_X_y=True)
full_database = pd.DataFrame(X,columns=['age','sex','bmi','bp','s1_tc','s2_ldl','s3_hdl','s4_tch','s5_ltg','s6_glu'])
full_database['disease_progression'] = Y

x = full_database.drop('disease_progression',axis=1)
y = full_database['disease_progression']

X_train,X_test,Y_train,Y_test = train_test_split(x,y,test_size=0.05,random_state=100)


In [4]:
def score_model(model,X_test,Y_test):
    Y_predict = model.predict(X_test)
    R2_score = metrics.r2_score(Y_test,Y_predict)
    print("R2_score:",R2_score)
    MSE = metrics.mean_squared_error(Y_test,Y_predict)
    MSE_sqrt = np.sqrt(MSE)
    print("MSE:",MSE_sqrt)
    return R2_score, MSE_sqrt

def train_model(model,X_train,Y_train):
    model.fit(X_train,Y_train)
    return model

LINEAR REGRESSION

In [5]:
model_LR = LinearRegression()
model_LR = train_model(model_LR,X_train,Y_train)
R2_LR,MSE_LR = score_model(model_LR,X_test,Y_test)

R2_score: 0.6492086119720775
MSE: 50.051544786457605


In [7]:
select = RFE(estimator = LinearRegression(), n_features_to_select=7,step=1)
select.fit(X_train,Y_train)
X_train_transform = select.transform(X_train)
X_test_transform = select.transform(X_test)

print(select.support_)

[False  True  True  True  True  True False  True  True False]


In [9]:
model_LR_2 = LinearRegression()
model_LR_2 = train_model(model_LR_2,X_train_transform,Y_train)
R2_LR_2,MSE_LR_2 = score_model(model_LR_2,X_test_transform,Y_test)

R2_score: 0.6459888861108414
MSE: 50.28071825460392


Ordinary Least Squares (OLS)

In [11]:
X_train_plus_constant = sm.add_constant(X_train)
X_test_plus_constant = sm.add_constant(X_test)
model_OLS = sm.OLS(Y_train,X_train_plus_constant,hasconst=True).fit()

R2_OLS, MSE_OLS = score_model(model_OLS,X_test_plus_constant,Y_test)
# Y_pred = model_OLS.predict(X_train_plus_constant)

R2_score: 0.6492086119720772
MSE: 50.05154478645762


In [13]:
X_train_plus_constant = sm.add_constant(X_train_transform)
X_test_plus_constant = sm.add_constant(X_test_transform)
model_OLS_2 = sm.OLS(Y_train,X_train_plus_constant,hasconst=True).fit()

R2_OLS_2, MSE_OLS_2 = score_model(model_OLS_2,X_test_plus_constant,Y_test)
# Y_pred = model_OLS.predict(X_train_plus_constant)

R2_score: 0.6459888861108662
MSE: 50.28071825460214


Random Forest Regressor

In [15]:
model_forest = RandomForestRegressor(random_state=100)
model_forest = train_model(model_forest,X_train,Y_train)
R2_forest,MSE_forest = score_model(model_forest,X_test,Y_test)

R2_score: 0.5363200063316871
MSE: 57.54428570580492


In [17]:
n_estimators = np.arange(20, 500, step=10)
criterion = ["squared_error", "absolute_error", "poisson"]
max_features = ["auto", "sqrt", "log2"]
max_depth = list(np.arange(2, 10, step=1))
min_samples_split = np.arange(2, 10, step=2)
min_samples_leaf = [1, 2, 4]
bootstrap = [True,False]

param_grid = {
    "n_estimators": n_estimators,
    "criterion": criterion,
    "max_features": max_features,
    "max_depth": max_depth,
    "min_samples_split": min_samples_split,
    "min_samples_leaf": min_samples_leaf,
    "bootstrap": bootstrap,
}

model_forest = RandomForestRegressor(random_state=100)
random_cv = RandomizedSearchCV(model_forest, param_grid, n_iter=100, cv=5,scoring="neg_root_mean_squared_error", n_jobs=-1, random_state = 100)
rcv = random_cv.fit(X_train, Y_train)

145 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
137 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/envs/testenv/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/envs/testenv/lib/python3.8/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/opt/conda/envs/testenv/lib/python3.8/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/opt/conda/envs/testenv/lib/python3.8/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter

In [18]:
rcv.best_params_

{'n_estimators': 160,
 'min_samples_split': 8,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 6,
 'criterion': 'poisson',
 'bootstrap': True}

In [20]:
rfn = RandomForestRegressor(**rcv.best_params_, random_state = 100)
model_forest = train_model(rfn,X_train,Y_train)
R2_forest,MSE_forest = score_model(model_forest,X_test,Y_test)

R2_score: 0.5471973193948223
MSE: 56.86532416799156


In [22]:
data = [['Linear Regression',R2_LR,MSE_LR],['Linear Regression 2',R2_LR_2,MSE_LR_2],['OLS',R2_OLS, MSE_OLS],['OLS_2',R2_OLS_2, MSE_OLS_2],['Random Forest Regressor',R2_forest,MSE_forest]]

print(pd.DataFrame(data,columns=['Model','R2','MSE']))

                     Model        R2        MSE
0        Linear Regression  0.649209  50.051545
1      Linear Regression 2  0.645989  50.280718
2                      OLS  0.649209  50.051545
3                    OLS_2  0.645989  50.280718
4  Random Forest Regressor  0.547197  56.865324
