In [1]:
# Importing necessary packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
%matplotlib inline
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
import sklearn.metrics as metrics
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.datasets import make_regression

import os

  data = yaml.load(f.read()) or {}
  defaults = yaml.load(f)


In [2]:
data = pd.read_csv("housing_data_clean.csv")

In [3]:
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
# Dealing with Categorical values by One-Hot Encoding

dummy = pd.get_dummies(data["ocean_proximity"])
dummy.head()

Unnamed: 0,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,0,0,0,1,0
1,0,0,0,1,0
2,0,0,0,1,0
3,0,0,0,1,0
4,0,0,0,1,0


In [5]:
df = pd.concat([data, dummy], axis=1)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,0,0,0,1,0


In [6]:
df = df.drop('ocean_proximity', axis=1)
df = df.rename(columns={"<1H OCEAN":"1H OCEAN"})
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,0,1,0


In [7]:
# Spliting target variable and independent variables
X = df.drop(['median_house_value'], axis = 1)
y = df['median_house_value']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state=10)

In [None]:
# The following function is for counting unique values
def unique_value_counts(d):
    dict = {}
    for i in list(d.columns):
        dict[i] = d[i].value_counts().shape[0]
    return pd.DataFrame(dict,index=["unique count"]).transpose()


# The following function is for creating heatmap figure
def heat_map(d):
    plt.figure(figsize=(12,8))
    _ = sns.heatmap(d.corr(), annot=True)
    
    
# The following function is for fitting Linear Regression Model
def lin_reg_model(x_trn, x_tst, y_trn, y_tst):
    lm = LinearRegression()
    lm.fit(x_trn, y_trn)
    return (lm.predict(x_trn), lm.predict(x_tst));


# The following function is for fitting Ridge Regression Model
def ridge_reg_model(x_trn, x_tst, y_trn, y_tst, alpha):
    scaler = StandardScaler()
    X_train_std = scaler.fit_transform(x_trn)
    X_test_std = scaler.transform(x_tst)

    # Fit a Ridge regression model with regularization parameter alpha=0.1
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train_std, y_trn)

    # Evaluate the model on the test set
    return (ridge.predict(X_train_std), ridge.predict(X_test_std));


# The following function is for fitting Lasso Regression Model
def lasso_reg_model(x_trn, x_tst, y_trn, y_tst, alpha):
    scaler = StandardScaler()
    X_train_std = scaler.fit_transform(x_trn)
    X_test_std = scaler.transform(x_tst)

    # Fit a Ridge regression model with regularization parameter alpha=0.1
    lasso = Lasso(alpha=alpha)
    lasso.fit(X_train_std, y_trn)

    # Evaluate the model on the test set
    return (lasso.predict(X_train_std), lasso.predict(X_test_std));


# The following function is for fitting Random Forest Regression Model
def rand_reg_model(x_trn, x_tst, y_trn, y_tst):
    rfm = RandomForestRegressor()
    rfm.fit(x_trn, y_trn)
    return (rfm.predict(x_trn), rfm.predict(x_tst));


# The following function is for fitting XGBRegression Model
def xgb_reg_model(x_trn, x_tst, y_trn, y_tst):
    xgb = XGBRegressor()
    xgb.fit(x_trn, y_trn)
    return (xgb.predict(x_trn), xgb.predict(x_tst));

In [15]:
for alpha in range(1,100,10):
    (y_trn_pred_ridge, y_tst_pred_ridge) = ridge_reg_model(X_train, X_test, y_train, y_test, alpha)
    rmse_test_ridge = np.sqrt(metrics.mean_squared_error(y_test, y_tst_pred_ridge))
    rmse_train_ridge = np.sqrt(metrics.mean_squared_error(y_train, y_trn_pred_ridge))
    print(f"alpha={alpha}, {rmse_train_ridge}, {rmse_test_ridge}")


alpha=1, 68855.35566764213, 67391.81859717317
alpha=11, 68855.94982233206, 67392.68055031117
alpha=21, 68857.46145959804, 67394.72063207836
alpha=31, 68859.79991310702, 67397.81895317644
alpha=41, 68862.88453025333, 67401.86890534798
alpha=51, 68866.64340342589, 67406.77548798357
alpha=61, 68871.01228371462, 67412.45387310901
alpha=71, 68875.93364765923, 67418.82817078664
alpha=81, 68881.35589289412, 67425.83036365567
alpha=91, 68887.23264276366, 67433.39938473416


In [17]:
for alpha in range(1,100,10):
    (y_trn_pred_lasso, y_tst_pred_lasso) = lasso_reg_model(X_train, X_test, y_train, y_test, 0.0001)
    rmse_test_lasso = np.sqrt(metrics.mean_squared_error(y_test, y_tst_pred_lasso))
    rmse_train_lasso = np.sqrt(metrics.mean_squared_error(y_train, y_trn_pred_lasso))
    print(f"alpha={alpha}, {rmse_train_lasso}, {rmse_test_lasso}")


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


alpha=1, 68855.35053862493, 67391.80226603009
alpha=11, 68855.35053862493, 67391.80226603009
alpha=21, 68855.35053862493, 67391.80226603009


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


alpha=31, 68855.35053862493, 67391.80226603009
alpha=41, 68855.35053862493, 67391.80226603009
alpha=51, 68855.35053862493, 67391.80226603009


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


alpha=61, 68855.35053862493, 67391.80226603009
alpha=71, 68855.35053862493, 67391.80226603009
alpha=81, 68855.35053862493, 67391.80226603009
alpha=91, 68855.35053862493, 67391.80226603009


  model = cd_fast.enet_coordinate_descent(


**Comments:** Increasing alpha does not improve the model performance for Ridge and Lasso. We will choose alpha=0.1 therefore.

In [9]:
# Create a list of models
models = [
    LinearRegression(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    XGBRegressor(),
    Ridge(alpha=0.1),
    Lasso(alpha=0.1)
]

# Fit each model to the training data and make predictions on the validation data
for model in models:
    model.fit(X_train, y_train)
    #y_pred = model.predict(X_test)
    #rmse = metrics.mean_squared_error(y_test, y_pred, squared=False)
    #print(f"{type(model).__name__}: RMSE={rmse:.5f}")

# Create a table of RMSE values
rmse_values_test = [metrics.mean_squared_error(y_test, model.predict(X_test), squared=False) for model in models]
rmse_values_train = [metrics.mean_squared_error(y_train, model.predict(X_train), squared=False) for model in models]
model_names = [type(model).__name__ for model in models]
data = {'Model': model_names, 'RMSE_train': rmse_values_train, 'RMSE_test': rmse_values_test}
df = pd.DataFrame(data)
df

  model = cd_fast.enet_coordinate_descent(


Unnamed: 0,Model,RMSE_train,RMSE_test
0,LinearRegression,68855.350539,67391.802258
1,DecisionTreeRegressor,0.0,64616.800399
2,RandomForestRegressor,18051.948574,47487.658015
3,XGBRegressor,29571.483003,45632.474641
4,Ridge,68855.363823,67391.84202
5,Lasso,68855.350811,67391.811623
