### Ch2 - Geron ML book - Notes

#### Carlos J. Arguello, Ph.D.

Analyzing housing data

#### Sequence:

1. Create directory, download tar data, untar
2. Import pandas, import data to dataframe, use info and describe methods
3. Take a look at the distribution of numerical data. Use histograms 
4. Create train and test sets from the data. Stratify if needed. 
   Nice example here (not stratified) where the train and test sets are generated using a hash function (md5)
5. Leave alone test set. Visualize train set.
6. Determine correlations of features with feature to be predicted. Create new features if needed. 
7. Prepare data for training. Imputer for numerical values, encoder for categorical variables. 
8. Imputer is an "estimator". So it has a fit, and a transform method. Same with encoder.
9. It is possible to define custom estimators/transformers. Use BaseEstimator and TransformerMixin as base classes, define fit and transform methods as desired.
10. 

In [None]:
import os
import tarfile
from six.moves import urllib

In [None]:
# Get data files

# Remote location of the housing archive and the local directory it is unpacked into.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters:
        housing_url: URL of the ``.tgz`` archive to retrieve.
        housing_path: directory that receives ``housing.tgz`` and its
            extracted contents; created if it does not exist.

    Returns:
        None. The archive and its members are written to disk.
    """
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # Interpreters with extraction filters: refuse members that would
            # land outside housing_path (absolute paths, "..", unsafe links).
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            # Older interpreters: no filter available, member paths are
            # trusted as-is, so only use this with a trusted archive.
            housing_tgz.extractall(path=housing_path)

In [None]:
# fetch data:
fetch_housing_data()

In [None]:
os.listdir("datasets/housing")

In [None]:
import pandas as pd

In [None]:
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [None]:
housing = load_housing_data()

In [None]:
housing.head()

In [None]:
housing.info()

In [None]:
housing["ocean_proximity"].value_counts()

In [None]:
housing.describe()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20,15))

In [None]:
import numpy as np
np.random.seed(42)

In [None]:
# Create a test set
def split_train_test(data, test_ratio):
    """Randomly partition ``data`` into a test part and a train part.

    Row positions are shuffled with NumPy's global random state; the first
    ``int(test_ratio * len(data))`` shuffled rows form the test part and the
    remainder the train part.

    Returns:
        A ``(test, train)`` tuple -- note that the test part comes first.
    """
    n_rows = len(data)
    order = np.random.permutation(n_rows)
    cut = int(test_ratio * n_rows)
    test_part = data.iloc[order[:cut]]
    train_part = data.iloc[order[cut:]]
    return test_part, train_part

In [None]:
test_set, train_set = split_train_test(housing, 0.2)

In [None]:
print(len(test_set), len(train_set))

In [None]:
# Use a hash of an instance identifier to split data in training and test sets:
import hashlib

def test_set_check(identifier, test_ratio, hash):
    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio

def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
    """Split ``data`` into train and test parts based on a hash of each row's id.

    Each value in ``data[id_column]`` is passed to ``test_set_check``; rows for
    which it returns True go to the test part, all others to the train part.

    Returns:
        A ``(train, test)`` tuple.
    """
    def _belongs_to_test(identifier):
        return test_set_check(identifier, test_ratio, hash)

    test_mask = data[id_column].apply(_belongs_to_test)
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part

In [None]:
# Create test train with hash
housing_with_id = housing.reset_index()
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")

In [None]:
# Create a unique id, and then use a hash on that id
housing_with_id["id"] = housing["longitude"] * 1000 + housing["latitude"]
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")

In [None]:
# Or... just use scikit-learn:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

In [None]:
# Stratified sampling: Use median income as example.
# Define categoric variable for median income:

housing["income_cat"] = np.ceil(housing["median_income"]/1.5)

In [None]:
housing["median_income"].hist()

In [None]:
housing["income_cat"].hist()

In [None]:
# Cap the category at 5.0. The result is assigned back to the column rather
# than using where(..., inplace=True) on the selected Series: that in-place
# form operates on an intermediate object, so pandas warns about it and, under
# Copy-on-Write, leaves `housing` unchanged.
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)

In [None]:
housing["income_cat"].hist()

In [None]:
# Stratified train/test split per income category:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified split holding out 20% of the rows, keyed on income_cat.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split() yields positional row numbers, so rows are selected with iloc.
# loc would treat them as index labels, which only coincides with positions
# when the frame has a default 0..n-1 index.
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

In [None]:
strat_train_set.income_cat.value_counts()/len(strat_train_set)

In [None]:
housing.income_cat.value_counts()/len(housing)

In [None]:
# Remove added stratification category:
# Both splits carry the helper column; strip it from each one in place.
for set_ in [strat_test_set, strat_train_set]:
    set_.drop(labels="income_cat", axis="columns", inplace=True)

In [None]:
strat_test_set.info()

## Visualize:

In [None]:
housing = strat_train_set.copy()

In [None]:
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)

In [None]:
# Geographic scatter plot: marker size is population/100 and marker colour
# encodes median_house_value on the "jet" colormap (with a colorbar).
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1,
            s=housing.population/100, label="population", figsize=(10,7),
            c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True)
plt.legend()

In [None]:
# Correlations
# Correlate numeric columns only: the frame still contains the text column
# "ocean_proximity", which recent pandas versions refuse to correlate.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()

In [None]:
corr_matrix.median_house_value.sort_values(ascending=False)

In [None]:
import matplotlib.pyplot as plt

plt.matshow(corr_matrix)
plt.xticks(range(len(corr_matrix.columns)), corr_matrix.columns);
plt.yticks(range(len(corr_matrix.columns)), corr_matrix.columns);

In [None]:
from pandas.plotting import scatter_matrix

In [None]:
attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12,8))

## Define other attributes

In [None]:
# Derive ratio features from the raw totals: three are normalised by the
# number of households, one is the share of bedrooms among all rooms.
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_household"] = housing["total_bedrooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"] = housing["population"]/housing["households"]

In [None]:
# Recompute the correlations on numeric columns only (the text column
# "ocean_proximity" cannot be correlated), then rank every attribute by its
# correlation with the target.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

## Prepare data for ML

In [None]:
# Clean copies of training instances and labels:
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

In [None]:
# Imputer for total bedrooms:
from sklearn.impute import SimpleImputer as Imputer

# Imputer configured to replace missing values with each column's median.
imputer = Imputer(strategy="median")

# Use only numerical columns for imputer:
housing_num = housing.drop("ocean_proximity", axis=1)

# Fitting computes the per-column statistics (exposed as imputer.statistics_).
imputer.fit(housing_num)

In [None]:
imputer.statistics_

In [None]:
housing_num.median().values

In [None]:
# Use imputer to transform the training set 
X = imputer.transform(housing_num)

In [None]:
# Rebuild a DataFrame from the imputed array. The original row index is passed
# through explicitly; otherwise the frame gets a fresh 0..n-1 index and no
# longer lines up with `housing` / `housing_labels`.
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

In [None]:
housing_tr.head()

In [None]:
# Dealing with categorical variables:
# Encode categorical variables:

housing_cat = housing.ocean_proximity
# factorize() returns one integer code per row plus the distinct category
# values those codes index into.
housing_cat_encoded, housing_categories = housing_cat.factorize()

In [None]:
print(housing_cat_encoded)
print(housing_categories)

In [None]:
# Get rid of ordinal problem for categories:

from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
# Two-step equivalent of the fit_transform call below, kept for reference:
# encoder.fit(housing_cat_encoded.reshape(-1,1))
# housing_cat_1hot = encoder.transform(housing_cat_encoded.reshape(-1,1))
# reshape(-1,1) turns the 1-D code array into a single-column 2-D array.
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))

In [None]:
housing_cat_1hot

In [None]:
housing_cat_1hot.toarray()

In [None]:
# Or, use CategoricalEncoder (not available as of sklearn 0.19.1):

# from sklearn.preprocessing import CategoricalEncoder

# cat_encoder = CategoricalEncoder()
# housing_cat_reshaped = housing_cat.values.reshape(-1,1)
# housing_cat_1hot = cat_encoder.fit_transform(housing_cat_reshaped)
# housing_cat_1hot

## Custom Transformers

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
rooms_ix, bedrooms_ix, population_ix, household_ix = 3,4,5,6

In [None]:
# Duck typing (not inheritance). If it walks like a duck and talks like a duck... so just define whatever class and include the methods fit, transform and fit_transform:
# BaseEstimator as base class to get methods get_params and set_params
# TransformerMixin as base class to get method fit_transform

# This is a custom transformer class that adds the features from before:
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio columns computed from existing columns of a 2-D array.

    Column positions are taken from the module-level ``rooms_ix``,
    ``bedrooms_ix``, ``population_ix`` and ``household_ix``.  Rooms per
    household and population per household are always appended;
    bedrooms per room is appended last when ``add_bedrooms_per_room`` is true.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        # Stateless transformer: there is nothing to learn from the data.
        return self

    def transform(self, X, y=None):
        rooms = X[:, rooms_ix]
        households = X[:, household_ix]
        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        # np.c_ stacks the original array and each new 1-D array column-wise.
        return np.c_[tuple([X] + extra_columns)]

In [None]:
# Now, use the class to add attributes to the training set:

attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.fit_transform(housing.values)

## Feature Scaling and Pipelines

In [None]:
# Standardization and min-max scaling:
from sklearn.preprocessing import MinMaxScaler # Transformer for this
from sklearn.preprocessing import StandardScaler # Ditto

In [None]:
# sklearn module to define transformation pipelines:
from sklearn.pipeline import Pipeline

In [None]:
# Define pipeline as list of tuples for numerical features:

# Each step is a (name, transformer) pair; steps run in the order listed.
num_pipeline = Pipeline([
    ('imputer', Imputer(strategy="median")),  # fill missing values with the column median
    ('attrbs_adder', CombinedAttributesAdder()),  # append the ratio attributes
    ('std_scalar', StandardScaler())  # standardize the resulting features
])

In [None]:
housing_num_tr = num_pipeline.fit_transform(housing_num)

In [None]:
# Now to also deal with categorical variables:
# Column selector transformer: picks the named DataFrame columns and returns them as a NumPy array,
# because scikit-learn doesn't play nicely with pandas DataFrames

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pick the columns named in ``attribute_names`` and return their ``.values``."""

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        # Nothing is learned; selection depends only on the configured names.
        return self

    def transform(self, X, y=None):
        selected = X[self.attribute_names]
        return selected.values

In [None]:
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

In [None]:
# Numeric pipeline that starts by selecting the numeric columns, so it can be
# given a DataFrame that also contains non-numeric columns.
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),  # keep only the columns in num_attribs
    ('imputer', Imputer(strategy="median")),  # fill missing values with the column median
    ('attrbs_adder', CombinedAttributesAdder()),  # append the ratio attributes
    ('std_scalar', StandardScaler())  # standardize the resulting features
])

In [None]:
# Now for categorical
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode a single categorical column.

    The mapping from category value to output column is learned once in
    ``fit`` and reused by ``transform``, so data transformed later (for
    example a test set) is encoded with the same columns as the data seen in
    ``fit``.  Learning the mapping inside ``transform`` would make the column
    order and count depend on whichever batch is being transformed.

    Attributes set by ``fit``:
        categories_: distinct values of the fitted data, in order of first
            appearance.
        encoder_: ``OneHotEncoder`` fitted on the integer codes of those
            values.
    """

    def fit(self, X, y=None):
        """Learn the categories present in ``X`` and return ``self``."""
        codes, self.categories_ = pd.factorize(X.flatten())
        # handle_unknown="ignore" asks the encoder not to raise on codes it
        # did not see while fitting.
        self.encoder_ = OneHotEncoder(handle_unknown="ignore")
        self.encoder_.fit(codes.reshape(-1, 1))
        return self

    def transform(self, X, y=None):
        """Encode ``X`` with the mapping learned in ``fit``.

        Raises:
            AttributeError: if called before ``fit``.
        """
        # Look every value up in the fitted categories; values that were not
        # present during fit get the code -1.
        codes = pd.Index(self.categories_).get_indexer(X.flatten())
        return self.encoder_.transform(codes.reshape(-1, 1))

In [None]:
# Categorical pipeline: select the columns in cat_attribs, then encode them.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('encoder', CategoricalEncoder())
])

In [None]:
from sklearn.pipeline import FeatureUnion

# FeatureUnion applies both pipelines to the same input and concatenates
# their outputs side by side into one feature matrix.
full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline)
])

In [None]:
housing_prepared = full_pipeline.fit_transform(housing)

In [None]:
# Train test model

from sklearn.linear_model import LinearRegression

In [None]:
lin_reg = LinearRegression()

In [None]:
lin_reg.fit(housing_prepared, housing_labels)