In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

import urllib.request
import tarfile

from pathlib import Path

### Fetch Data
California Housing Prices dataset from the StatLib repository

In [None]:
# Source archive for the California Housing Prices dataset.
dataUrl = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.tgz"

# Local paths: the downloaded archive and the CSV expected after extraction.
dataTgzPath = Path("housing.tgz")
dataCsvPath = Path("housing.csv")

# Download and unpack only when the CSV is not already present, so re-running
# this cell does not hit the network again.
if not dataCsvPath.exists():
    try:
        urllib.request.urlretrieve(dataUrl, dataTgzPath)
        # The context manager closes the archive even if extraction raises.
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only use it with archives from a trusted source.
        with tarfile.open(dataTgzPath) as dataTgzFile:
            dataTgzFile.extractall()
    finally:
        # Remove the archive on success and on failure, so a partial download
        # is never left behind.
        if dataTgzPath.exists():
            dataTgzPath.unlink()

In [None]:
# Load the CSV at dataCsvPath into a DataFrame used by the rest of the notebook.
housingData = pd.read_csv(dataCsvPath)

---

###### Show the first 5 rows of the dataframe

In [None]:
# head() without an argument returns the first five rows.
housingData.head()

---

###### Show column names and data types

In [None]:
# Per-column summary: non-null count and dtype.
housingData.info()

Notice ```total_bedrooms``` has only 20433 non-null values, meaning 207 districts are missing that feature.

---

###### Show column count, mean, std, min, max, etc.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numerical column.
housingData.describe()

In [None]:
# One 50-bin histogram per numerical column, drawn on a single large figure.
housingData.hist(bins=50, figsize=(20,15))
plt.show()

Some of these histograms are tail-heavy: they extend much farther to the right of the median than to the left. This may make it harder for some ML algorithms to detect patterns.

In [None]:
# Closer look at the distribution of a single column, median_income.
housingData["median_income"].hist(bins=50, figsize=(15,5))
plt.show()

The data has been scaled and capped at 15

15  - Higher Median Incomes

0.5 - Lower Median Incomes

This number also represents roughly tens of thousands of dollars (e.g. 3 is about $30,000)

In [None]:
# Histograms of the two columns discussed below as capped.
housingData[["housing_median_age", "median_house_value"]].hist(figsize=(15,5))
plt.show()

Both of these attributes are capped. Since ```median_house_value``` is the value to predict, you can either collect proper labels for the capped districts or remove those districts from the dataset.

---

Find correlations

In [None]:
# Pairwise scatter plots of the columns of housingData.
pd.plotting.scatter_matrix(housingData, figsize=(25, 15))
# Restrict to numeric columns before correlating: `ocean_proximity` holds
# strings, and DataFrame.corr() raises on non-numeric data in pandas >= 2.0
# (older versions silently dropped such columns, so the result is unchanged there).
housingData.select_dtypes(include="number").corr()

### Data Cleaning

#### Missing Features

Since ```total_bedrooms``` is missing 207 values, the options are
1. Get rid of the corresponding districts:

```python
housingData.dropna(subset=["total_bedrooms"])
```
2. Get rid of the whole attribute:

```python
housingData.drop("total_bedrooms", axis=1)
```
3. Set the values to some value:

```python
median = housingData["total_bedrooms"].median()
housingData["total_bedrooms"] = housingData["total_bedrooms"].fillna(median)
```

or use `sklearn.impute.SimpleImputer`

In [None]:
from sklearn.impute import SimpleImputer
# Imputer that replaces missing values with the per-column median.
inputer = SimpleImputer(strategy="median")

# ocean_proximity contains text values, and a median cannot be computed on text,
# so create another dataframe based on the original
# with only the numerical attributes.

housingDataNumericalOnly = housingData.drop("ocean_proximity", axis=1)

# Fitting computes the median of every numerical column,
# not only of total_bedrooms (the column with missing values).
inputer.fit(housingDataNumericalOnly)
# statistics_ holds the values learned by fit(), one per column.
print(inputer.statistics_)

# transform() returns a plain array with the missing values filled in.
transformedFeatures = inputer.transform(housingDataNumericalOnly)

# Put back into a pandas dataframe, restoring the column names and index
housingDataFull = pd.DataFrame(transformedFeatures, 
                           columns=housingDataNumericalOnly.columns, 
                           index=housingDataNumericalOnly.index)

# Check the non-null counts of the imputed frame.
housingDataFull.info()

#### Text and Categorical Attributes
```ocean_proximity``` contains text attributes

In [None]:
# List the distinct values that appear in ocean_proximity.
print(housingData["ocean_proximity"].unique())

There are a limited number of possible values, each of which represents a category, so this attribute is a categorical attribute.

```sklearn.preprocessing.OrdinalEncoder``` can convert text to numbers

In [None]:
from sklearn.preprocessing import OrdinalEncoder
ordEnc = OrdinalEncoder()
# Double brackets select a one-column DataFrame (2-D input) rather than a Series.
housingDataEncoded = ordEnc.fit_transform(housingData[["ocean_proximity"]])
# Show the distinct numeric codes produced by the encoder.
np.unique(housingDataEncoded)

This can create a problem: some ML algorithms will assume that two nearby values are more similar than two distant values. This may be fine in some cases (e.g. for ordered categories like "bad", "average", "good" and "excellent"), but it is not the case here. To fix this issue, a common solution is to create one binary attribute per category.

In [None]:
from sklearn.preprocessing import OneHotEncoder
categoriesEncoder = OneHotEncoder()
# Double brackets select a one-column DataFrame (2-D input) rather than a Series.
housingCategories1H = categoriesEncoder.fit_transform(housingData[["ocean_proximity"]])
# Display the encoded result (a later cell calls .toarray() to view it densely).
housingCategories1H

In [None]:
# Categories learned for the first (and only) encoded column.
list(categoriesEncoder.categories_[0])

In [None]:
# Dense view of the one-hot row for the first district.
print(housingCategories1H[0, :].toarray())

#### Custom Transformers
It is a good idea to try attribute combinations:

```python
housingData["rooms_per_household"] = housingData["total_rooms"] / housingData["households"] 
housingData["bedrooms_per_room"] = housingData["total_bedrooms"] / housingData["total_rooms"] 
housingData["population_per_household"] = housingData["population"] / housingData["households"] 
```

The following class is a Custom Transformer that adds attribute combinations.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# Column positions read by CombinedAttributesAdder.transform().
# NOTE(review): presumably these match the column order of the numerical
# housing frame (total_rooms, total_bedrooms, population, households) — verify
# if the input columns ever change.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
households_ix = 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio features to a 2-D feature array.

    Always appends rooms-per-household and population-per-household; when
    ``add_bedrooms_per_room`` is true it also appends bedrooms-per-room.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right.

        ``X`` must support ``X[:, i]`` indexing (e.g. a NumPy array).
        """
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]

        rooms_per_household = rooms / households
        population_per_household = X[:, population_ix] / households

        if not self.add_bedrooms_per_room:
            return np.c_[X, rooms_per_household, population_per_household]

        bedrooms_per_room = X[:, bedrooms_ix] / rooms
        return np.c_[X, rooms_per_household, population_per_household,
                     bedrooms_per_room]

# Transformer that picks a fixed set of columns (e.g. the numerical or the
# categorical ones) out of a DataFrame.
class OldDataFrameSelector(BaseEstimator, TransformerMixin):
    """Select the columns named in ``attribute_names`` and return their values."""

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[self.attribute_names].values``."""
        selected = X[self.attribute_names]
        return selected.values

#### Feature Scaling
ML algorithms don't perform well when the input numerical attributes have very different scales

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler 

# Create a train set and a test set, the test set is usually 20% of the data
# (random_state is fixed so the split is the same on every run)
HousingDataTrainSet, housingDataTestSet = train_test_split(housingData, test_size=0.2, random_state=42)

# Separate the predictors from the label column (median_house_value).
HousingDataTrainSetToFit = HousingDataTrainSet.drop("median_house_value", axis=1)
HousingDataTrainSetLabels = HousingDataTrainSet["median_house_value"].copy()

# NOTE: this rebinds housingDataNumericalOnly; from here on it holds the
# training predictors without the text column, not the full-data frame
# built in the imputation cell.
housingDataNumericalOnly = HousingDataTrainSetToFit.drop("ocean_proximity", axis=1)

# Column names routed to each branch of the ColumnTransformer below.
numericalAttributes = list(housingDataNumericalOnly)
categoricalAttributes = ["ocean_proximity"]

# Numerical branch: fill missing values, add ratio features, then standardize.
numericalPipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('attributes_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# Apply the numerical pipeline and the one-hot encoder to their respective
# columns and concatenate the results.
fullPipeline = ColumnTransformer([
    ('numerical', numericalPipeline, numericalAttributes),
    ('categorical', OneHotEncoder(), categoricalAttributes)
])

# Fit every step on the training predictors and transform them in one call.
preparedHousingData = fullPipeline.fit_transform(HousingDataTrainSetToFit)

In [None]:
# (rows, columns) of the prepared training matrix.
preparedHousingData.shape

In [None]:
# Display the prepared training matrix.
preparedHousingData

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression model on the prepared training data and its labels.
linReg = LinearRegression()
linReg.fit(preparedHousingData, HousingDataTrainSetLabels)

In [None]:
# Let's try the full preprocessing pipeline on a few training instances.
some_data = HousingDataTrainSetToFit.iloc[:5]
some_labels = HousingDataTrainSetLabels.iloc[:5]
# transform(), not fit_transform(): reuse what the pipeline learned when fitted.
some_data_prepared = fullPipeline.transform(some_data)

# Predict once and reuse the result for both the printout and the plot.
some_predictions = linReg.predict(some_data_prepared)
print("Predictions:", some_predictions)
plt.plot(some_predictions)

In [None]:
# Actual labels for the same five instances, for comparison with the predictions.
print("Labels:", list(some_labels))
plt.plot(list(some_labels))

In [None]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the linear model on the training set.
# Uses the names defined earlier in this notebook (linReg,
# preparedHousingData, HousingDataTrainSetLabels); the previous names
# (lin_reg, housing_prepared, housing_labels) were never defined here and
# raised NameError on a fresh kernel.
housing_predictions = linReg.predict(preparedHousingData)
lin_mse = mean_squared_error(HousingDataTrainSetLabels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse