In [1]:
import os
import tarfile
from six.moves import urllib
import pandas as pd
import numpy as np

In [2]:
# Relative directory ("datasets/housing" on POSIX) used as the default location of housing.csv.
HOUSING_PATH = os.path.join("datasets", "housing")

In [3]:
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from a directory into a DataFrame.

    Parameters
    ----------
    housing_path : str
        Directory that contains ``housing.csv``; defaults to ``HOUSING_PATH``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [30]:
# Load the raw housing table from the default path and print its schema summary
# (column names, non-null counts, dtypes, memory usage).
housing = load_housing_data()
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


---
### Data Exploration And Sampling (Test and Train Split)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Histograms of the housing columns: 50 bins each, drawn on a 20x15 figure.
housing.hist(bins=50, figsize=(20,15))
plt.show()

In [31]:
from sklearn.model_selection import StratifiedShuffleSplit

# Stratified sampling based on income category.
# Bucket median_income into five categories; the column is only a stratification
# key and is removed from the resulting train/test sets below.
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6.0, np.inf],
                               labels=[1, 2, 3, 4, 5])

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so take the single (train, test) index pair directly instead of looping.
train_index, test_index = next(split.split(housing, housing["income_cat"].cat.codes))

# The splitter yields *positional* indices, so select with .iloc (with .loc this
# would only be correct while the frame has a default RangeIndex).
# drop() without inplace returns fresh frames, which avoids mutating slices of
# `housing` and the SettingWithCopyWarning that can come with it.
strat_train_set = housing.iloc[train_index].drop(columns="income_cat")
strat_test_set = housing.iloc[test_index].drop(columns="income_cat")

In [32]:
# Separate the training set into the prediction target and the feature frame.
housing_labels = strat_train_set["median_house_value"].copy()
housing = strat_train_set.drop(columns=["median_house_value"])

In [None]:
from pandas.plotting import scatter_matrix
# Exploring the train data
#
# Work on a private copy under its own name. Reassigning `housing` here would
# replace the label-free feature frame that the preprocessing cells consume with
# one that still contains `median_house_value` plus the exploratory columns
# added below, leaking the target into the features on a top-to-bottom run.
housing_explore = strat_train_set.copy()

# Geographic scatter; the low alpha makes densely populated areas stand out.
housing_explore.plot(kind="scatter", x='longitude', y='latitude', alpha=0.1)

# Same map with marker size scaled by population and colour by median house value.
housing_explore.plot(kind="scatter", x='longitude', y='latitude', alpha=0.4,
                     s=housing_explore['population']/100, label='population',
                     figsize=(10,7), c='median_house_value',
                     cmap=plt.get_cmap("jet"), colorbar=True)

# correlation between different features
# Only numeric columns can be correlated; ocean_proximity is text, so restrict
# the frame explicitly instead of relying on a column having been popped earlier.
corr_matrix = housing_explore.select_dtypes(include=[np.number]).corr()

attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(housing_explore[attributes], figsize=(12, 8))

# exploring new derived features; combined attributes
# rooms_per_household is total_rooms / households (not total_bedrooms), matching
# the ratio computed by CombinedAttributesAdder.
housing_explore["rooms_per_household"] = housing_explore["total_rooms"]/housing_explore["households"]
housing_explore["bedrooms_per_room"] = housing_explore["total_bedrooms"]/housing_explore["total_rooms"]
housing_explore["population_per_household"] = housing_explore["population"]/housing_explore["households"]

# Recompute the correlations now that the derived columns exist and rank every
# numeric column by its correlation with the target.
corr_matrix = housing_explore.select_dtypes(include=[np.number]).corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

In [35]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# taking care of missing values: each missing numeric entry is replaced by the
# median of its column
imputer = SimpleImputer(strategy="median")

# The median strategy only works on numeric data, so set the text column aside.
housing_num = housing.drop("ocean_proximity", axis=1)

# fit_transform learns the medians and fills the gaps in one pass; a separate
# fit() call beforehand would just repeat the same work.
X = imputer.fit_transform(housing_num)

# Wrap the ndarray back into a DataFrame, keeping both the column names and the
# original row index so the result still aligns with `housing`.
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

# categorical data transformation

cat_encoder = OneHotEncoder()
# Double brackets keep a 2-D DataFrame, which is what the encoder expects.
housing_cat = housing[["ocean_proximity"]]

housing_cat_one_hot = cat_encoder.fit_transform(housing_cat)
housing_cat_one_hot

<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>

---
### Data Processing Pipeline
Up to this point, everything was done in separate chunks for the sake of understanding.
The following code is a complete pipeline that takes the `housing` data and performs all the necessary preprocessing.

In [33]:
from sklearn.base import BaseEstimator, TransformerMixin

# Custom Transformers for combined attributes
# Column positions read by CombinedAttributesAdder from its 2-D input array.
# NOTE(review): presumably these follow the order of the numeric housing columns
# (total_rooms, total_bedrooms, population, households) — confirm against the input.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
households_ix = 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features to a 2-D array.

    Two columns are always appended, in this order: rooms per household and
    population per household. When ``add_bedrooms_per_room`` is true, a third
    column, bedrooms per room, is appended after them.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns concatenated on the right."""
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]
        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        # Indexing np.c_ with a tuple is the same as writing the items out by hand.
        return np.c_[tuple([X] + extra_columns)]


In [36]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
# Using sklearn pipelines for all transformations

# Numeric branch: fill gaps with the column median, append the ratio features,
# then standardise every column.
numerical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

# Every column except the text one goes through the numeric branch.
categorical_attribs = ["ocean_proximity"]
housing_num = housing.drop(columns=categorical_attribs)
numerical_attribs = housing_num.columns.tolist()

# Route each group of columns to its own transformer and concatenate the results.
end2end_pipeline = ColumnTransformer(transformers=[
    ("numerical", numerical_pipeline, numerical_attribs),
    ("categorical", OneHotEncoder(), categorical_attribs),
])

housing_prepared = end2end_pipeline.fit_transform(housing)

In [37]:
# Dimensions of the matrix produced by the preprocessing pipeline.
housing_prepared.shape

(16512, 16)

---
### Model Selection and Training

Linear Regression

In [38]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression model on the prepared training features and their labels.
linear_reg = LinearRegression()
linear_reg.fit(housing_prepared, housing_labels)

In [42]:
# Spot-check: push five training rows (positions 5-9) through the already-fitted
# pipeline (transform only, no refit) and compare predictions with the true labels.
sample = housing.iloc[5:10]
sample_labels = housing_labels.iloc[5:10]
sample_prepared = end2end_pipeline.transform(sample)
print("Predictions:", linear_reg.predict(sample_prepared))
print("Labels:", list(sample_labels))

In [45]:
from sklearn.metrics import mean_squared_error
# RMSE of the linear model measured on the same data it was trained on
# (a training error, not an estimate of generalisation error).
housing_predictions = linear_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

68627.87390018745

Decision Tree

In [46]:
from sklearn.tree import DecisionTreeRegressor
# Decision tree with a fixed seed so the fitted tree, and every score derived
# from it, is reproducible across runs (matches the seed used for the split).
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)

# RMSE on the training data itself. A value of (near) zero here signals that the
# tree has memorised the training set, not that it is a perfect model; use
# cross-validation to judge it.
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse


0.0

In [47]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the decision tree. The scorer returns negated MSE
# values, so flip the sign before taking the square root to obtain RMSE.
scores = cross_val_score(
    estimator=tree_reg,
    X=housing_prepared,
    y=housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
tree_rmse_scores = np.sqrt(np.negative(scores))

def display_scores(scores):
    """Print an array of scores followed by its mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods (e.g. a NumPy array).
    Nothing is returned.
    """
    # Each value is computed lazily, right before it is printed.
    report = (
        ("Scores:", lambda: scores),
        ("Mean:", lambda: scores.mean()),
        ("Standard deviation:", lambda: scores.std()),
    )
    for label, compute in report:
        print(label, compute())

# Print the per-fold RMSE values of the decision tree with their mean and standard deviation.
display_scores(tree_rmse_scores)

Scores: [72125.60240522 70473.2190477  68234.47347643 72456.00143424
 71349.78108396 79167.30312141 71057.35619355 73284.91710376
 68693.34948182 70187.11162972]
Mean: 71702.91149778356
Standard deviation: 2905.2445239862996
