In [None]:
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Load the raw housing table from the project-relative datasets folder and
# preview its first rows.
data = pd.read_csv('./datasets/housing.csv')

data.head()

In [None]:
# Column dtypes and non-null counts: a quick way to spot columns with missing values.
data.info()

In [None]:
# Number of rows per distinct 'ocean_proximity' value.
data['ocean_proximity'].value_counts()

In [None]:
# Summary statistics of the loaded frame.
data.describe()

In [None]:
# One histogram per column, 50 bins each, drawn on a 20x15 figure.
data.hist(bins=50, figsize=(20,15))

In [None]:
def split_train_test(data, test_ratio=0.2, random_state=None):
    """Randomly split ``data`` row-wise into a training part and a test part.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Object supporting ``len()`` and positional ``.iloc`` selection.
    test_ratio : float, default 0.2
        Fraction of rows assigned to the test part; must lie in ``[0, 1]``.
        The test size is ``int(len(data) * test_ratio)`` (rounded down).
    random_state : None, int or numpy.random.Generator, default None
        Seed (or generator) for a reproducible shuffle. With ``None`` the
        global NumPy random state is used, so results differ between runs
        unless ``np.random.seed`` was called beforehand.

    Returns
    -------
    tuple
        ``(train, test)`` selected from ``data`` with ``.iloc``.

    Raises
    ------
    ValueError
        If ``test_ratio`` is outside ``[0, 1]``.
    """
    # A negative ratio would make the slices below wrap around and silently
    # return a wrong split, so reject out-of-range values up front.
    if not 0 <= test_ratio <= 1:
        raise ValueError(f'test_ratio must be within [0, 1], got {test_ratio!r}')

    n_rows = len(data)
    if random_state is None:
        shuffled_indices = np.random.permutation(n_rows)
    else:
        shuffled_indices = np.random.default_rng(random_state).permutation(n_rows)

    test_set_size = int(n_rows * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]

    return data.iloc[train_indices], data.iloc[test_indices]

# Purely random split with the default test_ratio. The later cells visible in this
# notebook work with the stratified split instead and do not read these two frames.
train_set, test_set = split_train_test(data)


In [None]:
# Work on a copy: a plain alias would let the columns added to `housing` in later
# cells leak into the raw `data` frame, leaving the previews shown earlier stale.
housing = data.copy()

In [None]:
# Bucket the continuous median income into five labelled categories (1-5);
# the last bucket is open-ended. The category is used for stratified sampling.
housing['income_cat'] = pd.cut(
    x=housing['median_income'],
    bins=[0.0, 1.5, 3.0, 4.5, 6, np.inf],
    labels=[1, 2, 3, 4, 5],
)

In [None]:
# How the rows are distributed over the income categories.
housing['income_cat'].hist()

In [None]:
shuffle_split = StratifiedShuffleSplit(n_splits=1, test_size=.2, random_state=42)

# split() yields *positional* row indices, so rows must be selected with .iloc.
# .loc only gives the same rows while the frame carries a default 0..n-1 index.
# n_splits=1 means there is exactly one (train, test) pair to take.
train_index, test_index = next(shuffle_split.split(housing, housing['income_cat']))
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]

# check income category proportions
train_proportions = strat_train_set['income_cat'].value_counts() / len(strat_train_set)
test_proportions = strat_test_set['income_cat'].value_counts() / len(strat_test_set)
init_data_proportions = housing['income_cat'].value_counts() / len(housing)

print('train_proportions: ', train_proportions)
print('-' * 100)
print('test_proportions: ', test_proportions)
print('-' * 100)
print('init data proportions: ', init_data_proportions)


In [None]:
# remove 'income_cat'
# The helper column is dropped from both splits. Rebinding instead of
# inplace=True, together with errors='ignore', keeps this cell safe to re-run
# once the column is already gone.
strat_train_set = strat_train_set.drop(columns='income_cat', errors='ignore')
strat_test_set = strat_test_set.drop(columns='income_cat', errors='ignore')

In [None]:
# Explore a copy so that nothing done to it can modify strat_train_set.
exploring_housing = strat_train_set.copy()

In [None]:
# Geographic scatter plot; the low alpha lets overlapping points build up darker areas.
exploring_housing.plot(kind='scatter', x='longitude', y='latitude', alpha=.1)

In [None]:
# Same map with two more variables encoded:
#   marker size   -> population (scaled down by 100)
#   marker colour -> median_house_value, on the 'jet' colour map
exploring_housing.plot.scatter(
    x='longitude',
    y='latitude',
    s=exploring_housing['population'] / 100,
    c='median_house_value',
    cmap=plt.get_cmap('jet'),
    colorbar=True,
    alpha=0.1,
    label='population',
    figsize=(15, 11),
)

plt.legend()

In [None]:
# standard correlation coefficient (called Pearson's r)
# Only numeric columns are correlated: with the text column 'ocean_proximity'
# present, DataFrame.corr() raises on pandas >= 2.0. The exploration copy of the
# training set is used so that test rows do not influence the analysis.
corr_matrix = exploring_housing.select_dtypes(include=np.number).corr()
corr_matrix

In [None]:
# Correlation of each column with the target, largest value first.
corr_matrix['median_house_value'].sort_values(ascending=False)

In [None]:
# Pairwise scatter plots for a few selected columns. Plotted from the exploration
# copy of the training set so that test rows stay unseen.
attributes = ['median_house_value', 'median_income', 'total_rooms', 'housing_median_age']

scatter_matrix(exploring_housing[attributes], figsize=(25, 18))

In [None]:
# The most promising attribute to predict the median house value is the median income
# (plotted from the exploration copy of the training set, not the full dataset).

exploring_housing.plot(kind='scatter', x='median_income', y='median_house_value', alpha=.1, figsize=(25,18))

In [None]:
# try out various attribute combinations
# The ratio columns are added to the exploration copy, so test rows are not
# inspected.

exploring_housing['rooms_per_households'] = exploring_housing['total_rooms'] / exploring_housing['households']
exploring_housing['bedrooms_per_rooms'] = exploring_housing['total_bedrooms'] / exploring_housing['total_rooms']
exploring_housing['population_per_households'] = exploring_housing['population'] / exploring_housing['households']

# Numeric columns only: DataFrame.corr() raises on the text column with pandas >= 2.0.
corr_matrix = exploring_housing.select_dtypes(include=np.number).corr()

corr_matrix['median_house_value'].sort_values(ascending=False)

In [None]:
# Prepare the Data for Machine Learning Algorithms
# The target column becomes `housing_labels`; every other column of the
# stratified training set becomes the predictor frame `housing`.

housing_labels = strat_train_set['median_house_value'].copy()
housing = strat_train_set.drop(columns='median_house_value')

In [None]:
# clean null values
#
# Manual alternatives for the missing 'total_bedrooms' values:
#   * drop the affected rows:  housing.dropna(subset=['total_bedrooms'])
#   * drop the whole column:   housing.drop('total_bedrooms', axis=1)
#   * fill with a statistic:   housing['total_bedrooms'].fillna(median)  (or zero, mean, ...)
#
# Used here instead: Scikit-Learn's SimpleImputer, applied to every numeric column.
from sklearn.impute import SimpleImputer

# The imputer is fitted on the frame without the text column 'ocean_proximity'.
housing_with_nums = housing.drop(columns='ocean_proximity')

imputer = SimpleImputer(strategy='median')
# fit() returns the imputer itself; the learned values are in imputer.statistics_.
X = imputer.fit(housing_with_nums).transform(housing_with_nums)  # return numpy array

# Wrap the array in a DataFrame again, restoring the original row and column labels.
housing_without_missing_values = pd.DataFrame(
    data=X,
    index=housing_with_nums.index,
    columns=housing_with_nums.columns,
)

housing_without_missing_values.info()




In [None]:
# Handling Text and Categorical Attributes
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Double brackets keep a one-column DataFrame rather than a Series.
housing_cat = housing[['ocean_proximity']]
print(housing_cat.head(10))

# Ordinal encoding of the category column.
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)  # returns numpy array
print(housing_cat_encoded[:10])  # compare this with 'housing_cat.head(10)'
ordinal_encoder.categories_  # not rendered: only the cell's last expression is displayed

# One-hot encoding of the same column; the result is densified below with toarray().
one_hot_encoder = OneHotEncoder()
housing_cat_1hot = one_hot_encoder.fit_transform(housing_cat)

print('----------------------')
housing_cat_1hot.toarray()

In [None]:
# todo:
# you could replace each category with a learnable, low-dimensional vector called an embedding.

In [None]:
# todo: 99, Custom Transformers

from sklearn.base import BaseEstimator, TransformerMixin

# Positional column indices that CombinedAttributesAdder.transform uses to slice X.
# NOTE(review): presumably these are the positions of total_rooms, total_bedrooms,
# population and households in the housing frame; confirm against the CSV column order.
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio features to a 2-D array.

    The source columns are located through the module-level positional indices
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``households_ix``.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        When true, a bedrooms/rooms ratio is appended after the two
        per-household ratios.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns stacked on its right-hand side.

        Appended in order: rooms per household, population per household and,
        if ``add_bedrooms_per_room`` is set, bedrooms per room.
        """
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]

        stacked = [
            X,
            rooms / households,
            X[:, population_ix] / households,
        ]
        if self.add_bedrooms_per_room:
            stacked.append(X[:, bedrooms_ix] / rooms)

        # np.c_[a, b, c] receives the tuple (a, b, c) as its key, so passing an
        # explicit tuple is the same call.
        return np.c_[tuple(stacked)]

# Try the transformer on the raw training values, without the bedrooms-per-room ratio.
# No fit() call is needed first: fit() learns nothing and just returns the transformer.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing chain: median imputation -> ratio features -> standardisation.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

housing_num_tr = num_pipeline.fit_transform(housing_with_nums)

housing_num_tr

In [None]:
from sklearn.compose import ColumnTransformer

# Column names routed to each branch of the combined transformer.
cat_attribs = ['ocean_proximity']
num_attribs = list(housing_with_nums)

# Numeric columns go through num_pipeline, the text column is one-hot encoded.
full_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline model: ordinary linear regression on the prepared training data.
lin_reg = LinearRegression()
lin_reg.fit(X=housing_prepared, y=housing_labels)

In [None]:
# Spot check: compare predictions with the true labels on the first five training rows.
some_data, some_labels = housing.iloc[:5], housing_labels.iloc[:5]

# transform() only: the pipeline was already fitted on the full training set.
some_data_prepared = full_pipeline.transform(some_data)
prediction = lin_reg.predict(some_data_prepared)

print('Prediction: ', list(prediction))
print('Labels: ', list(some_labels))

In [None]:
from sklearn.metrics import mean_squared_error

# RMSE of the linear model on the same data it was fitted on.
housing_prediction = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_prediction)
lin_rmse = np.sqrt(lin_mse)

lin_rmse

In [None]:
from sklearn.tree import DecisionTreeRegressor

# A fixed random_state makes the fitted tree, and every score derived from it,
# reproducible across kernel restarts.
decision_tree = DecisionTreeRegressor(random_state=42)
decision_tree.fit(housing_prepared, housing_labels)

In [None]:
# Spot check of the decision tree on the first five training rows.
some_data, some_labels = housing.iloc[:5], housing_labels.iloc[:5]

some_data_prepared = full_pipeline.transform(some_data)
prediction = decision_tree.predict(some_data_prepared)

print('Prediction: ', list(prediction))
print('Labels: ', list(some_labels))

In [None]:
# RMSE of the decision tree on the data it was trained on.
housing_prediction = decision_tree.predict(housing_prepared)
decision_tree_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_prediction)
decision_tree_rmse = np.sqrt(decision_tree_mse)

decision_tree_rmse

In [None]:
# Better evaluation using cross-validation

from sklearn.model_selection import cross_val_score

# The scorer reports *negative* MSE, so the sign is flipped before the square root.
scores = cross_val_score(
    decision_tree,
    housing_prepared,
    housing_labels,
    scoring='neg_mean_squared_error',
    cv=10,
)

decision_tree_rmse_scores = np.sqrt(-scores)
# decision_tree_rmse_scores

def display_scores(scores):
    """Print a set of scores together with their mean and standard deviation.

    Parameters
    ----------
    scores : numpy.ndarray
        Any object exposing ``mean()`` and ``std()`` works; the notebook passes
        the per-fold RMSE array.
    """
    print('Scores: ', scores)
    print('Mean: ', scores.mean())
    # Label spelling corrected (previously printed as "Standart deviation").
    print('Standard deviation: ', scores.std())

# Print the per-fold RMSE values of the decision tree and their summary.
display_scores(decision_tree_rmse_scores)


In [None]:
# The same 10-fold evaluation for the linear model.
lin_reg_scores = cross_val_score(
    lin_reg,
    housing_prepared,
    housing_labels,
    scoring='neg_mean_squared_error',
    cv=10,
)
lin_reg_rmse_scores = np.sqrt(-lin_reg_scores)

display_scores(lin_reg_rmse_scores)

In [None]:
# Random forest model

from sklearn.ensemble import RandomForestRegressor

# A fixed random_state makes the forest, and its cross-validation scores,
# reproducible across kernel restarts.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(housing_prepared, housing_labels)

In [None]:
# 10-fold cross-validated RMSE of the random forest.
forest_reg_scores = cross_val_score(
    forest_reg,
    housing_prepared,
    housing_labels,
    scoring='neg_mean_squared_error',
    cv=10,
)
forest_reg_rmse_scores = np.sqrt(-forest_reg_scores)

display_scores(forest_reg_rmse_scores)

In [None]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids: 3 x 4 combinations with the default bootstrap setting, then
# 2 x 3 combinations with bootstrap switched off.
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}
]

# Seed the forest: otherwise the best parameters and scores can change from run
# to run, and the hyper-parameter comparison would include random noise.
forest_reg = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)

grid_search.fit(housing_prepared, housing_labels)


In [None]:
# Show the estimator selected by the grid search
# (grid_search.best_params_ would show only the winning parameter values).
grid_search.best_estimator_

In [None]:
cvres = grid_search.cv_results_

# One line per parameter combination: RMSE (sign-flipped, square-rooted mean test
# score) followed by the parameters that produced it.
for params, mean_score in zip(cvres['params'], cvres['mean_test_score']):
    print(np.sqrt(-mean_score), params)

In [None]:
# Importance scores of the best estimator's input features.
feature_importance = grid_search.best_estimator_.feature_importances_
feature_importance

In [None]:
# Rebuild the feature names in the order the pipeline emits its columns:
# numeric columns, the ratio features appended by CombinedAttributesAdder, then
# the one-hot categories.
cat_encoder = full_pipeline.named_transformers_['cat']
cat_one_hot_attribs = list(cat_encoder.categories_[0])
extra_attribs = ['rooms_per_household', 'population_per_household', 'bedrooms_per_room']
attributes = num_attribs + extra_attribs + cat_one_hot_attribs

sorted(zip(feature_importance, attributes), reverse=True)

In [None]:
# Evaluate the tuned model on the held-out test set.
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop('median_house_value', axis=1)
y_test = strat_test_set['median_house_value'].copy()

# transform() only, never fit_transform(): the preprocessing must keep the
# statistics learned from the training set.
X_test_prepared = full_pipeline.transform(X_test)

final_predictions = final_model.predict(X_test_prepared)

# Arguments follow the (y_true, y_pred) order used by the other
# mean_squared_error calls in this notebook; MSE is symmetric, so the value is
# unchanged.
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

print(final_rmse)

In [None]:
from scipy import stats

confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2

# t-interval around the mean squared error; the square root of its two bounds
# gives the interval for the RMSE.
np.sqrt(
    stats.t.interval(
        confidence,
        df=len(squared_errors) - 1,
        loc=squared_errors.mean(),
        scale=stats.sem(squared_errors),
    )
)
