In [None]:
from traitlets.config.manager import BaseJSONConfigManager
from pathlib import Path
# Persist RISE slideshow settings in the per-user nbconfig directory
# (~/.jupyter/nbconfig), under the "rise" config section.
path = Path.home().joinpath(".jupyter", "nbconfig")
cm = BaseJSONConfigManager(config_dir=str(path))

rise_settings = dict(
    theme="black",
    # NOTE(review): the JSON config manager's update appears to treat a None
    # value as "remove this key", which would fall back to RISE's default
    # transition rather than disabling transitions -- confirm; the string
    # "none" may be what was intended.
    transition=None,
    start_slideshow_at="selected",
    enable_chalkboard=True,
    # Two pen colors made available on the chalkboard.
    chalkboard=dict(color=["rgb(225, 193, 7)", "rgb(30, 136, 229)"]),
)
cm.update("rise", rise_settings)

In [None]:
# edit train_linear_model to train ridge models as well
def train_linear_model(X_train, y_train, model_type):
    """Fit a linear regression model of the requested type and return it.

    Parameters
    ----------
    X_train
        Predictor matrix, passed unchanged to the estimator's ``fit``.
    y_train
        Target values, passed unchanged to the estimator's ``fit``.
    model_type : str
        ``"unregularized"`` (LinearRegression), ``"ridge"`` (RidgeCV),
        ``"lasso"`` (LassoCV) or ``"elastic"`` (ElasticNetCV).

    Returns
    -------
    The fitted scikit-learn estimator.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the names listed above (including
        non-string values).
    """
    if model_type == "unregularized":
        reg = LinearRegression().fit(X_train, y_train)
    elif model_type == 'ridge':
        import inspect

        ridge_alphas = [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]
        # Newer scikit-learn renamed store_cv_values/cv_values_ to
        # store_cv_results/cv_results_; use whichever this install supports.
        if 'store_cv_results' in inspect.signature(RidgeCV).parameters:
            reg = RidgeCV(alphas=ridge_alphas, store_cv_results=True).fit(X_train, y_train)
            cv_errors = reg.cv_results_
        else:
            reg = RidgeCV(alphas=ridge_alphas, store_cv_values=True).fit(X_train, y_train)
            cv_errors = reg.cv_values_
        print(cv_errors.shape)  # num_datapoints x num_alphas
        print(np.mean(cv_errors, axis=0))
        print('alpha:', reg.alpha_)
    elif model_type == 'lasso':
        reg = LassoCV(random_state=0, alphas=[1e-3, 1e-2, 1e-1, 1, 10, 100, 1000],
                      max_iter=100000, tol=1e-3).fit(X_train, y_train)
        print('alpha:', reg.alpha_)
        print('alphas:', reg.alphas_)
    elif model_type == 'elastic':
        reg = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
                           alphas=[1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10]).fit(X_train, y_train)
        print('alpha:', reg.alpha_)
        print('l1_ratio:', reg.l1_ratio_)
    else:
        # f-string (not "+") so a non-string model_type still yields the
        # intended ValueError instead of a TypeError from str concatenation.
        raise ValueError(f'Unexpected model_type encountered; model_type = {model_type}')

    # print number of estimated model coefficients. Need to add one to account for y-intercept (not included in reg.coef_ call)
    print('# model coefs = ' + str(len(reg.coef_)+1))

    return reg



- What is the model's train and test error? How does this compare to the unregularized model we fit using all predictor variables? How does this model compare to the best univariate model we fit?
  - The ridge model does much better (i.e., in terms of test RMSE) than the unregularized model that uses all predictor variables, and it also outperforms the best univariate model (39004 vs. 48243).
  - Unregularized_all_predictors_testRMSE: 3562241001
  - Unregularized_best_univariate_testRMSE: 48243
  - Regularized_all_predictors_testRMSE: 39004

- What alpha value was selected using RidgeCV? Is it a lower or higher value? What does this value tell you about the model?
  - RidgeCV selected a large alpha value, which means this model is highly regularized/penalized: the coefficients are shrunk strongly toward zero.



In [None]:
# from sklearn.linear_model import RidgeCV

# # trained_model = RidgeCV(alphas=alphas, cv=10)
# trained_model = LassoCV(alphas=alphas, cv=5, max_iter=10000)

# trained_model = trained_model.fit(X_train_z, y_train)
# y_pred_train = trained_model.predict(X_train_z)
# y_pred_test = trained_model.predict(X_test_z)

# from regression_predict_sklearn import measure_model_err
# error_df = measure_model_err(y=y, baseline_pred=y.mean(),
#                               y_train=y_train, y_pred_train=y_pred_train,
#                               y_test=y_test, y_pred_test=y_pred_test,
#                               metric='RMSE', y_log_scaled=True)


# error_df.head()

In [None]:
### try pca before LASSO
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd

# PCA cannot extract more components than min(n_samples, n_features),
# so cap the requested 80 components by both dimensions of the training data.
N_PC = min(80, *X_train_z.shape)
pca = PCA(n_components=N_PC)
pca.fit(X_train_z)

# Project train and test data onto the same N_PC components, which were
# learned from the training set only.
X_train_pca = pca.transform(X_train_z)
X_test_pca = pca.transform(X_test_z)

# Wrap the projections in DataFrames with PC1..PC{N_PC} column names.
# Keep the original row index so the rows stay aligned with y_train / y_test.
pc_names = [f'PC{i+1}' for i in range(N_PC)]
X_train_pca_df = pd.DataFrame(X_train_pca, columns=pc_names, index=X_train_z.index)
X_test_pca_df = pd.DataFrame(X_test_pca, columns=pc_names, index=X_test_z.index)

# Unregularized fit on all N_PC components.
# Note: trained_model / error_df are overwritten by the LASSO fit below.
trained_model, error_df = fit_eval_model(y=y, baseline_pred=y.mean(),
               X_train=X_train_pca_df, y_train=y_train,
               X_test=X_test_pca_df, y_test=y_test,
               predictors=X_train_pca_df.columns,
               metric='RMSE',
               y_log_scaled=True,
               model_type='unregularized',
               include_plots=True, plot_raw=True, verbose=True)

# LASSO on the same components, with alpha chosen by 3-fold CV over 500
# log-spaced candidates between 1e-5 and 1e1.
# NOTE(review): train_linear_model in this notebook dispatches on 'lasso',
# not 'LassoCV' -- confirm which label fit_eval_model expects.
alphas = np.logspace(-5, 1, 500)
trained_model, error_df = fit_eval_model(y=y, baseline_pred=y.mean(),
                                         X_train=X_train_pca_df, y_train=y_train,
                                         X_test=X_test_pca_df, y_test=y_test,
                                         predictors=X_train_pca_df.columns,
                                         metric='RMSE',
                                         y_log_scaled=True,
                                         model_type='LassoCV', alphas=alphas, cv=3, max_iter=100000,
                                         include_plots=True, plot_raw=True, verbose=True)

In [None]:

##### CODE_START
# Build the modelling inputs: the log-transformed target `y_log` and the
# predictor frame `X`, restricted to the six columns listed below.
predictors = [
    'LotArea',
    'YearBuilt',
    'YearRemodAdd',
    'GarageArea',
    'GarageCars',
    'Neighborhood',
]
X = housing['data'][predictors]
y_log = np.log(housing['target'])
X.head()
##### CODE_END


##### CODE_START
# Preprocess the data: encode the selected predictors, then filter columns.
from preprocessing import encode_predictors_housing_data, remove_bad_cols

X_enc = encode_predictors_housing_data(X)
X_enc.head()

# 95 is the threshold handed to remove_bad_cols
# (presumably a percentage cutoff -- TODO confirm against the helper).
X_good = remove_bad_cols(X_enc, 95)
##### CODE_END


##### CODE_START
# Run the multicollinearity check on the cleaned predictors, then repeat it
# after dropping GarageCars and YearBuilt.
multicollinearity_test(X_good);
X_better = X_good.drop(columns=['GarageCars', 'YearBuilt'])
multicollinearity_test(X_better);
##### CODE_END


##### CODE_START
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_better,
    y_log,
    test_size=0.33,
    random_state=0,
)
##### CODE_END


##### CODE_START
from preprocessing import zscore

# Standardize both splits with statistics computed from the training set only,
# so nothing from the test set influences the scaling.
train_means = X_train.mean()
train_stds = X_train.std()
X_train_z = zscore(df=X_train, train_means=train_means, train_stds=train_stds)
X_test_z = zscore(df=X_test, train_means=train_means, train_stds=train_stds)
X_train_z.head()
##### CODE_END


##### CODE_START
# sm.OLS does not add an intercept on its own, so give the training
# predictors an explicit constant column first.
X_train_z = sm.add_constant(X_train_z)
print(X_train_z.head())

# The test predictors need the same constant column so that the fitted model
# can later be used to predict on them.
X_test_z = sm.add_constant(X_test_z)
print(X_test_z.head())

# Fit the multivariate regression model on the training data.
model = sm.OLS(endog=y_train, exog=X_train_z)
trained_model = model.fit()


##### CODE_END


##### CODE_START
from regression_predict_sklearn import measure_model_err

# Model predictions on both splits are needed before errors can be computed.
y_pred_train = trained_model.predict(X_train_z)
y_pred_test = trained_model.predict(X_test_z)

# NOTE(review): `y` is not defined in this cell (the target extracted here is
# `y_log`) -- confirm that `y` from earlier in the notebook is the intended
# input for the baseline.
errors_df = measure_model_err(
    y=y,
    baseline_pred=np.mean(y),
    y_train=y_train,
    y_pred_train=y_pred_train,
    y_test=y_test,
    y_pred_test=y_pred_test,
    metric='RMSE',
    y_log_scaled=True,
)

errors_df.head()
##### CODE_END


##### CODE_START
# Evaluate the regression assumptions for the fitted OLS model on the training
# split; the trailing semicolon suppresses the notebook's echo of the return value.
eval_regression_assumptions(
    trained_model=trained_model,
    X=X_train_z,
    y=y_train,
    y_pred=y_pred_train,
    y_log_scaled=True,
    plot_raw=False,
    threshold_p_value=0.05,
);
##### CODE_END

#### EXERCISE_END