# Loading the data
The code below downloads the data from a URL, so we will not dwell on it. First, we imported the os module for interacting with the operating system. After that, we imported the tarfile module for accessing and manipulating tar files. Lastly, we imported urllib (through six.moves) for its URL-handling functions.

Then, we set our paths appropriately. In the get_data() function, we made a directory for our data, retrieved the archive from the URL, and then extracted it into that directory.

So, in your working directory, you will notice a directory called datasets created. On opening it, you will get another directory called housing with a file named housing.csv in it. We will use this file.

In [1]:
import os
import tarfile
from six.moves import urllib

# Base location of the raw files in the handson-ml GitHub repository.
OUR_ROOT_URL = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
# Relative path of the housing data; it is part of the download URL and is
# also used as the local directory name.
OUR_PATH = "datasets/housing"
# Full URL of the compressed housing archive.
OUR_DATA_URL = "".join((OUR_ROOT_URL, OUR_PATH, "/housing.tgz"))

def get_data(our_data_url=OUR_DATA_URL, our_path=OUR_PATH):
    """Download the housing archive and unpack it into ``our_path``.

    Args:
        our_data_url: URL of the ``.tgz`` archive to fetch.
        our_path: Local directory that receives both the downloaded
            archive and its extracted contents; created if missing.
    """
    if not os.path.isdir(our_path):
        os.makedirs(our_path)
    # The archive is saved in the same directory it is extracted into.
    archive_path = os.path.join(our_path, "housing.tgz")
    urllib.request.urlretrieve(our_data_url, archive_path)
    # The context manager closes the archive even if extraction fails.
    # NOTE(review): extractall() trusts the member paths stored in the
    # archive, so only point this at a source you trust.
    with tarfile.open(archive_path) as archive:
        archive.extractall(path=our_path)

get_data()

In [2]:
import pandas as pd

def load_our_data(our_path=OUR_PATH):
    """Read ``housing.csv`` from ``our_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(our_path, "housing.csv"))

our_dataset = load_our_data()

In [3]:
our_dataset.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
our_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


# Cleaning data
The cleaning operation we will do here is filling missing numeric values with the median of their column. We will use SimpleImputer, an estimator, to do that. First, we set its strategy to median so that it computes the median of each column and uses it to fill that column’s missing values.

In [5]:
from sklearn.impute import SimpleImputer
# Use the "median" strategy so that each column's missing values are filled
# with that column's median.
imputer = SimpleImputer(strategy="median")
# The median is only defined for numbers, so drop the textual column first.
our_dataset_num = our_dataset.drop("ocean_proximity", axis=1)
# Learn the per-column medians.
imputer.fit(our_dataset_num)
# Fill the missing values using the learned medians.
X = imputer.transform(our_dataset_num)
# Wrap the result in a DataFrame again, restoring both the column names and
# the original row index so rows stay aligned with our_dataset.
our_dataset_numeric = pd.DataFrame(
    X, columns=our_dataset_num.columns, index=our_dataset_num.index
)

We dropped the ocean_proximity attribute because it’s a text attribute that we will handle in the next section.

# Handling text and categorical attributes
We cannot handle text and numerical attributes in the same way. For example, we cannot compute the median of text.

We will use a transformer for this called the OrdinalEncoder. It is chosen because it is more pipeline friendly. Moreover, it replaces each text category with an integer code, counting from 0, e.g., 0 for NEAR and 1 for FAR.

In [6]:
from sklearn.preprocessing import OrdinalEncoder
# Build the encoder that turns each category into a numeric code.
our_encoder = OrdinalEncoder()
# Double brackets keep the selection two-dimensional (a one-column DataFrame).
our_text_cats = our_dataset[['ocean_proximity']]
# Learn the categories and encode the column in a single call.
our_encoded_dataset = our_encoder.fit_transform(our_text_cats)

# Data Transformer
This is where we will create the custom transformer. We will be adding these three attributes:

* Rooms per household.
* Population per household.
* Bedrooms per room.

In [7]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
# Positional indices of the columns that the transformer reads from its
# input array.
rooms = 3
bedrooms = 4
population = 5
household = 6

class CustomTransformer(BaseEstimator, TransformerMixin):
    """Append ratio attributes as extra columns of a 2-D array.

    Two columns are always appended: rooms per household and population per
    household. A third, bedrooms per room, is appended when
    ``add_bedrooms_per_room`` is true. Exposing that switch as a
    hyperparameter makes it easy to check whether the extra attribute helps.

    The source columns are located through the module-level indices
    ``rooms``, ``bedrooms``, ``population`` and ``household``.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right."""
        households = X[:, household]
        total_rooms = X[:, rooms]
        extras = [
            total_rooms / households,       # rooms per household
            X[:, population] / households,  # population per household
        ]
        if self.add_bedrooms_per_room:
            extras.append(X[:, bedrooms] / total_rooms)
        # np.c_ places each 1-D ratio next to X as one more column.
        return np.c_[tuple([X] + extras)]

# Create the transformer with its default setting (add_bedrooms_per_room=True).
attrib_adder = CustomTransformer()
# Apply it to the raw values of the whole dataset, text column included.
our_extra_attributes = attrib_adder.transform(our_dataset.values)

In [8]:
our_extra_attributes

array([[-122.23, 37.88, 41.0, ..., 6.984126984126984, 2.5555555555555554,
        0.14659090909090908],
       [-122.22, 37.86, 21.0, ..., 6.238137082601054, 2.109841827768014,
        0.15579659106916466],
       [-122.24, 37.85, 52.0, ..., 8.288135593220339, 2.8022598870056497,
        0.12951601908657123],
       ...,
       [-121.22, 39.43, 17.0, ..., 5.20554272517321, 2.325635103926097,
        0.21517302573203195],
       [-121.32, 39.43, 18.0, ..., 5.329512893982808, 2.1232091690544412,
        0.21989247311827956],
       [-121.24, 39.37, 16.0, ..., 5.254716981132075, 2.616981132075472,
        0.22118491921005387]], dtype=object)

# Full pipeline

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# Pipeline for the numeric columns: fill missing values with the column
# median, then append the ratio attributes.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CustomTransformer()),
])
# Names of the numeric columns (iterating a DataFrame yields its column labels).
numeric_attribs = list(our_dataset_numeric)
# Name of the textual column.
text_attribs = ["ocean_proximity"]
# Route each group of columns to its own transformer; the outputs are
# concatenated in the order listed here (numeric first, then text).
our_full_pipeline = ColumnTransformer([
    ("numeric", numeric_pipeline, numeric_attribs),
    ("text", OrdinalEncoder(), text_attribs),
])
# Fit every step on the dataset and transform it in one call.
# NOTE: this pipeline contains no scaling step.
our_dataset_prepared = our_full_pipeline.fit_transform(our_dataset)

In [12]:
our_dataset_prepared

array([[-122.23      ,   37.88      ,   41.        , ...,    2.55555556,
           0.14659091,    3.        ],
       [-122.22      ,   37.86      ,   21.        , ...,    2.10984183,
           0.15579659,    3.        ],
       [-122.24      ,   37.85      ,   52.        , ...,    2.80225989,
           0.12951602,    3.        ],
       ...,
       [-121.22      ,   39.43      ,   17.        , ...,    2.3256351 ,
           0.21517303,    1.        ],
       [-121.32      ,   39.43      ,   18.        , ...,    2.12320917,
           0.21989247,    1.        ],
       [-121.24      ,   39.37      ,   16.        , ...,    2.61698113,
           0.22118492,    1.        ]])

In [13]:
our_dataset_prepared.shape

(20640, 13)

# References
* https://github.com/carloslme/handson-ml2-book/blob/main/02_end_to_end_machine_learning_project.ipynb
* https://www.section.io/engineering-education/custom-transformer
* https://www.oreilly.com/library/view/hands-on-machine-learning/9781492032632/