In [1]:
import os
import tarfile
from six.moves import urllib

# Remote repository root, the dataset directory relative to that root, and the
# full URL of the compressed housing archive derived from the two.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = "datasets/housing"
HOUSING_URL = "{}{}/housing.tgz".format(DOWNLOAD_ROOT, HOUSING_PATH)
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``housing.tgz`` archive to download.
    housing_path : str
        Local directory that receives both the downloaded archive and its
        extracted contents. It is created when it does not exist yet.
    """
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction raises.
    # NOTE(review): extractall() trusts the member paths stored in the
    # archive; only point this function at sources you trust.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

In [2]:
# fetch_housing_data()

In [3]:
import pandas as pd
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in ``housing_path`` into a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [4]:
# Load housing.csv from the default HOUSING_PATH directory into a DataFrame.
housing = load_housing_data()
# housing.head()

In [5]:
# housing.info()

In [6]:
# %matplotlib inline
# import matplotlib.pyplot as plt
# housing.hist(bins=50, figsize=(20,15))
# plt.show()

In [7]:
import numpy as np
# Bucket median income: divide by 1.5 and round up to get a category number.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Keep categories below 5 and replace everything else with 5.0. The result is
# assigned back explicitly: calling .where(..., inplace=True) on
# housing["income_cat"] only mutates a temporary column selection, which under
# pandas Copy-on-Write leaves the frame itself unchanged.
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)
type(housing)

pandas.core.frame.DataFrame

In [8]:
# %matplotlib inline
# import matplotlib.pyplot as plt
# housing["income_cat"].value_counts().sort_index().plot(kind='bar')

In [9]:
from sklearn.model_selection import StratifiedShuffleSplit
# One stratified split holding out 20% of the rows; the fixed random_state
# makes the split reproducible across runs.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so this loop body runs exactly once.
# NOTE(review): the indices yielded by split() are used as row labels via
# .loc — this presumes housing still has its default integer index; confirm.
for train_index, test_index in split.split(housing, housing["income_cat"]):
    # income_cat exists only to drive the stratification, so it is removed
    # from both sets as they are built. Dropping without inplace avoids
    # mutating frames derived from a selection, and no loop variable named
    # `set` is left behind to shadow the builtin.
    strat_train_set = housing.loc[train_index].drop(["income_cat"], axis='columns')
    strat_test_set = housing.loc[test_index].drop(["income_cat"], axis='columns')

In [10]:
# Rebind `housing` to a copy of the training set so that later changes to
# `housing` do not modify strat_train_set.
housing = strat_train_set.copy()

In [11]:
# strat_test_set.hist(bins=50, figsize=(20,15))
# plt.show()

In [12]:
# plt.rcParams["figure.figsize"] = [12, 10]
# housing.plot(kind="scatter", x="longitude", y="latitude", alpha = 0.1)

In [13]:
# housing.plot(
#     kind="scatter",
#     x="longitude",
#     y="latitude",
#     alpha=0.4,
#     s=housing["population"] / 100,
#     label="population",
#     c="median_house_value",
#     colormap=plt.get_cmap("jet"),
#     colorbar=True,
# )
# plt.legend()

In [14]:
# corr_matrix = housing.corr()
# corr_matrix["median_house_value"].sort_values(ascending=False)

In [15]:
from pandas.plotting import scatter_matrix
# Columns to plot against each other in the scatter matrix.
attributes = [
    "median_house_value",
    "median_income",
    "total_rooms",
    "housing_median_age",
]
scatter_matrix(housing[attributes], figsize=(12, 8))

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x0D43D350>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0E482290>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0E4A12B0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0E4B9410>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0E4D5570>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0E4D5590>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0E4EE6F0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0E5259D0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0E53EB30>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0E558C90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0E575DB0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0E58EF10>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0E5ABFB0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0E5CF130>,
        <matplotl

In [16]:
# housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)


In [17]:
# Derived ratio features, each stored as a new column of `housing`.
housing["rooms_per_household"] = housing["total_rooms"].div(housing["households"])
housing["bedrooms_per_room"] = housing["total_bedrooms"].div(housing["total_rooms"])
housing["population_per_household"] = housing["population"].div(housing["households"])

In [18]:
# Restrict the frame to numeric columns before correlating: `housing` still
# holds the categorical ocean_proximity column, and DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0 instead of silently skipping it.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
# Correlation of every numeric attribute with the target, strongest first.
corr_matrix["median_house_value"].sort_values(ascending=False)

median_house_value          1.000000
median_income               0.687160
rooms_per_household         0.146285
total_rooms                 0.135097
housing_median_age          0.114110
households                  0.064506
total_bedrooms              0.047689
population_per_household   -0.021985
population                 -0.026920
longitude                  -0.047432
latitude                   -0.142724
bedrooms_per_room          -0.259984
Name: median_house_value, dtype: float64

In [19]:
# Separate the target column (labels) from the predictors; strat_train_set
# itself is left untouched because drop() returns a new frame.
housing_labels = strat_train_set["median_house_value"].copy()
housing = strat_train_set.drop(["median_house_value"], axis="columns")

In [20]:
try:
    # scikit-learn >= 0.20 provides SimpleImputer; the old Imputer class was
    # removed from sklearn.preprocessing in 0.22. Keep the name `Imputer` so
    # the rest of the notebook works with either version.
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

imputer = Imputer(strategy="median")
# The median strategy needs numeric input, so leave out the categorical column.
housing_num = housing.drop("ocean_proximity", axis=1)
X = imputer.fit_transform(housing_num)
# fit_transform returns a plain array; restore the column labels and the
# original row index so housing_tr stays aligned with housing.
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)
# housing_tr

In [21]:
# housing_cat = housing["ocean_proximity"]

# from sklearn.preprocessing import LabelBinarizer
# encoder = LabelBinarizer()
# housing_cat_1hot = encoder.fit_transform(housing_cat)
# housing_cat_1hot

In [22]:
from sklearn.base import BaseEstimator, TransformerMixin

# Column positions that CombinedAttributesAdder.transform uses to index its
# input array (rooms, bedrooms, population, households).
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features computed from existing columns of a 2-D array.

    ``rooms_per_household`` and ``population_per_household`` are always
    appended; ``bedrooms_per_room`` is appended after them when
    ``add_bedrooms_per_room`` is true. Column positions are taken from the
    module-level ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and
    ``household_ix`` constants.
    """

    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return self unchanged."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the derived ratio columns appended on the right."""
        households = X[:, household_ix]
        rooms = X[:, rooms_ix]
        # Start with the input itself, then the two ratios that are always added.
        pieces = [X, rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            pieces.append(X[:, bedrooms_ix] / rooms)
        # Indexing np.c_ with a tuple is the same as writing np.c_[a, b, ...].
        return np.c_[tuple(pieces)]
        
# Apply the adder directly to the raw values of `housing`, with the
# bedrooms_per_room column switched off.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

In [23]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pick ``attribute_names`` out of a DataFrame and return the raw values.

    When ``factorize`` is true, the selection is first passed through
    ``pd.factorize`` (via ``apply``) and the resulting integer codes are
    shifted up by one.
    """

    def __init__(self, attribute_names, factorize=False):
        self.attribute_names = attribute_names
        self.factorize = factorize

    def fit(self, X, y=None):
        """Nothing to fit; return self."""
        return self

    def transform(self, X):
        """Return the selected data as the array behind ``.values``."""
        chosen = X[self.attribute_names]
        if not self.factorize:
            return chosen.values
        # Replace the values with the codes from pd.factorize, offset by one.
        return chosen.apply(lambda col: pd.factorize(col)[0] + 1).values

In [24]:
from sklearn.preprocessing import LabelBinarizer
class LabelBinarizerPipelineFriendly(LabelBinarizer):
    """LabelBinarizer whose methods accept the ``(X, y=None)`` call signature.

    Every method takes an optional ``y`` that is ignored and forwards only
    ``X`` to the corresponding ``LabelBinarizer`` method.
    """

    def fit(self, X, y=None):
        """Fit the binarizer on ``X`` (``y`` is ignored) and return self.

        Returning self follows the estimator convention, so calls such as
        ``est.fit(X).transform(X)`` work.
        """
        super(LabelBinarizerPipelineFriendly, self).fit(X)
        return self

    def transform(self, X, y=None):
        """Binarize ``X`` with the already-fitted classes (``y`` is ignored)."""
        return super(LabelBinarizerPipelineFriendly, self).transform(X)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its binarized form (``y`` is ignored)."""
        return super(LabelBinarizerPipelineFriendly, self).fit(X).transform(X)

In [25]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
# Column names handled by each branch of the preprocessing pipeline.
num_attribs = housing_num.columns.tolist()
cat_attribs = ["ocean_proximity"]

# Numeric branch: select columns, impute with the median, append the combined
# attributes, then standardize.
num_pipeline = Pipeline(steps=[
    ("selector", DataFrameSelector(num_attribs)),
    ("imputer", Imputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

# Categorical branch: select the column and binarize its labels.
cat_pipeline = Pipeline(steps=[
    ("selector", DataFrameSelector(cat_attribs)),
    ("label_binarizer", LabelBinarizerPipelineFriendly()),
])

# Run both branches on the same input and join their outputs.
full_pipeline = FeatureUnion([
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

housing_prepared = full_pipeline.fit_transform(housing)
print(housing_prepared)

[[-1.15604281  0.77194962  0.74333089 ...  0.          0.
   0.        ]
 [-1.17602483  0.6596948  -1.1653172  ...  0.          0.
   0.        ]
 [ 1.18684903 -1.34218285  0.18664186 ...  0.          0.
   1.        ]
 ...
 [ 1.58648943 -0.72478134 -1.56295222 ...  0.          0.
   0.        ]
 [ 0.78221312 -0.85106801  0.18664186 ...  0.          0.
   0.        ]
 [-1.43579109  0.99645926  1.85670895 ...  0.          1.
   0.        ]]
