In [1]:
# imports
import os
import tarfile
import urllib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pandas.plotting import scatter_matrix

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error

In [2]:
# load data into dataframe
df = pd.read_csv("datasets/housing/housing.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [3]:
df = df.drop("ocean_proximity", axis=1)

In [4]:
if "median_income_cat" not in list(df):
    df.insert(9, "median_income_cat", 
          pd.cut(df["median_income"],bins=[0., 1.5, 3.0, 4.5, 6., np.inf], labels=[1,2,3,4,5]))

In [5]:
type(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   longitude           20640 non-null  float64 
 1   latitude            20640 non-null  float64 
 2   housing_median_age  20640 non-null  float64 
 3   total_rooms         20640 non-null  float64 
 4   total_bedrooms      20433 non-null  float64 
 5   population          20640 non-null  float64 
 6   households          20640 non-null  float64 
 7   median_income       20640 non-null  float64 
 8   median_house_value  20640 non-null  float64 
 9   median_income_cat   20640 non-null  category
dtypes: category(1), float64(9)
memory usage: 1.4 MB


NoneType

In [6]:
# perform stratified training/test splitting base on income_cat

cv1, cv2, cv1_splits, cv2_splits = (
    StratifiedShuffleSplit(n_splits = 5, test_size = .2, random_state = 42),
    StratifiedShuffleSplit(n_splits = 5, test_size = .2, random_state = 43),
    [],
    []
)

for splitter, splits in [(cv1, cv1_splits),(cv2, cv2_splits)]:
    for train_index, test_index in splitter.split(df, df["median_income_cat"]):
        splits.append((train_index, test_index))

# set initial dataset
X = df.drop("median_house_value", axis=1)
y = df["median_house_value"].copy()

In [7]:
preprocess_pipe = Pipeline(
    [
        (
            "ctransformer", ColumnTransformer([
                ( 
                    "num_pipe", 
                    Pipeline([
                        ("imputer", SimpleImputer(strategy="median")),
                        ("scaler", StandardScaler())
                    ]), 
                    list(X.select_dtypes(include=[np.number])),
                )
            ],
            remainder = "drop")
        )
    ]
)

In [8]:
final_pipe = Pipeline([
    ("preprocess", preprocess_pipe),
    ("model", DecisionTreeRegressor())
])

In [9]:
param_grid = [
    {
        "preprocess__ctransformer__num_pipe__imputer__strategy": ["mean","median"],
        "model__max_depth": range(1, 21), "model__max_features": np.linspace(0.1,1,19)
    }
]

In [10]:
grid_search = GridSearchCV(
    final_pipe, param_grid, cv = cv1, scoring = "neg_mean_squared_error"
)

mdl = grid_search.fit(X, y)

generalization_error = cross_val_score(
    mdl, X = X, y = y, cv = cv2, scoring = "neg_mean_squared_error"
)

TypeError: cannot pickle 'generator' object

In [None]:
np.sqrt(-generalization_error).mean()