In [73]:
import sklearn
import sklearn.preprocessing as preprocessing
import sklearn.impute as impute
import sklearn.compose as compose
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np

from pathlib import Path
import tarfile
import requests
from typing import Tuple
import matplotlib.pyplot as plt

# Widen the text repr so printed DataFrames are not wrapped across lines.
pd.options.display.width = 1000

# Make every scikit-learn transformer return a pandas DataFrame (instead of a
# NumPy array) so column names survive the preprocessing steps below.
sklearn.set_config(transform_output="pandas")

In [81]:
def load_housing_data():
  """Return the California housing dataset as a DataFrame.

  The data is cached under ``datasets/`` relative to the current working
  directory:

  * ``datasets/housing.tgz``          - the downloaded archive
  * ``datasets/housing/housing.csv``  - the extracted CSV that is read

  The archive is downloaded only when it is missing, and it is extracted
  whenever the CSV is missing. Keying the work on the CSV (not on the archive)
  lets a run that was interrupted between download and extraction recover on
  the next call instead of failing in ``read_csv``.

  Returns:
    pandas.DataFrame: contents of ``datasets/housing/housing.csv``.

  Raises:
    AssertionError: if the download does not answer with HTTP 200.
    requests.RequestException: on network errors or timeout.
    tarfile.TarError: if the archive cannot be read.
  """
  data_dir = Path("datasets")
  tar_path = data_dir / "housing.tgz"
  csv_path = data_dir / "housing" / "housing.csv"

  if not csv_path.is_file():
    data_dir.mkdir(parents=True, exist_ok=True)

    # download .tgz (skipped when a previous run already fetched it)
    if not tar_path.is_file():
      url = "https://github.com/ageron/data/raw/main/housing.tgz"
      # A timeout keeps the cell from hanging forever on a stalled connection.
      response = requests.get(url, timeout=30)
      # Explicit raise instead of `assert`: asserts are stripped under
      # `python -O`, which would let an error page be saved as the archive.
      # The exception type and message are unchanged.
      if response.status_code != 200:
        raise AssertionError("failed to download the data file")
      tar_path.write_bytes(response.content)

    # extract .tgz
    with tarfile.open(tar_path) as archive:
      if hasattr(tarfile, "data_filter"):
        # Extraction filters exist on this interpreter: refuse members that
        # would escape the target directory or carry unsafe metadata.
        archive.extractall(data_dir, filter="data")
      else:
        archive.extractall(data_dir)

  # read as DataFrame
  return pd.read_csv(csv_path)

# Load the dataset and show three views of it, separated by rulers:
# the raw frame, the column dtypes / non-null counts, and summary statistics.
housing = load_housing_data()
print(housing)
print("*" * 150)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
housing.info()
print("*" * 150)
print(housing.describe())


       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  population  households  median_income  median_house_value ocean_proximity
0        -122.23     37.88                41.0        880.0           129.0       322.0       126.0         8.3252            452600.0        NEAR BAY
1        -122.22     37.86                21.0       7099.0          1106.0      2401.0      1138.0         8.3014            358500.0        NEAR BAY
2        -122.24     37.85                52.0       1467.0           190.0       496.0       177.0         7.2574            352100.0        NEAR BAY
3        -122.25     37.85                52.0       1274.0           235.0       558.0       219.0         5.6431            341300.0        NEAR BAY
4        -122.25     37.85                52.0       1627.0           280.0       565.0       259.0         3.8462            342200.0        NEAR BAY
...          ...       ...                 ...          ...             ...         ...       

In [102]:
# Preprocessing [DataFrame]: one-hot encode the categorical column.
# First show how many rows fall into each ocean_proximity category ...
proximity_counts = housing["ocean_proximity"].value_counts()
print(proximity_counts)

# ... then expand that column into one indicator column per category.
# The result is the cell's displayed value; `housing` itself is not modified.
pd.get_dummies(housing, columns=["ocean_proximity"])

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,0,1,0,0,0
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,0,1,0,0,0
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,0,1,0,0,0
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,0,1,0,0,0


In [104]:
# Sklearn-Pipeline: combine an encoder and an imputer in one pipeline.
#   OrdinalEncoder: categories -> 0, 1, 2, 3, ... (one numeric code per category)
#   SimpleImputer:  fills missing total_bedrooms values with the column mean
# ColumnTransformer is left at its default remainder="drop", so the transformed
# frame contains only the two columns listed here; every other column of
# `housing` is discarded from `housing_tr`.
transformers = [
    ("encoder", preprocessing.OrdinalEncoder(), ["ocean_proximity"]),
    ("imputer", impute.SimpleImputer(strategy="mean"), ["total_bedrooms"])
]
column_transformer = compose.ColumnTransformer(transformers)

pipeline = Pipeline(steps=[
    ("transformers", column_transformer),
])
housing_tr = pipeline.fit_transform(housing)

print(housing_tr)
print("*" * 150)
# DataFrame.info() prints its own report and returns None; wrapping it in
# print() would emit an extra "None" line after the report.
housing_tr.info()
print("*" * 150)
print(housing_tr.describe())

       encoder__ocean_proximity  imputer__total_bedrooms
0                           3.0                    129.0
1                           3.0                   1106.0
2                           3.0                    190.0
3                           3.0                    235.0
4                           3.0                    280.0
...                         ...                      ...
20635                       1.0                    374.0
20636                       1.0                    150.0
20637                       1.0                    485.0
20638                       1.0                    409.0
20639                       1.0                    616.0

[20640 rows x 2 columns]
******************************************************************************************************************************************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 2 columns):
 #   Column                    