In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# Any results you write to the current directory are saved as output.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
import pandas as pd
import numpy as np

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from pandas.plotting import scatter_matrix

The first task is to build a model of housing prices in California using the California census data. This data has metrics such as the population, median income, median housing price, and so on for each block group in California. Block groups are the smallest geographical unit for which the US Census Bureau publishes sample data (a block group typically has a population of 600 to 3,000 people).

A sequence of data processing components is called a data pipeline. Pipelines are very common in Machine Learning systems, since there is a lot of data to manipulate and many data transformations to apply.

Frame the problem: is it supervised, unsupervised, or Reinforcement Learning?

### Get the Data

In [None]:
# Location of the California housing CSV inside the Kaggle input directory.
csvPath = "/kaggle/input/california-housing-prices/housing.csv"
# Load the file into a DataFrame and preview its first rows.
housing = pd.read_csv(csvPath)
housing.head()

In [None]:
# Print a per-column summary of the frame (dtypes and non-null counts).
housing.info()

In [None]:
# Number of rows in each ocean_proximity category.
housing["ocean_proximity"].value_counts()

In [None]:
# Summary statistics for the columns of housing.
housing.describe()

In [None]:
# One histogram per numeric column, 150 bins each.
housing.hist(bins=150, figsize=(20,15))
plt.show()

The median income is a very important attribute for predicting median housing prices. We may want to ensure that the test set is representative of the various categories of incomes in the whole dataset. Since the median income is a continuous numerical attribute, we first need to create an income category attribute.

In [None]:
# Distribution of housing_median_age, using 10 bins.
housing["housing_median_age"].hist(bins=10)
plt.show()

In [None]:
# Distribution of households, using 150 bins.
housing["households"].hist(bins=150)
plt.show()

In [None]:
# Bucket the continuous median_income into five labelled categories (1-5);
# the last bin is open-ended so every income above 6 lands in category 5.
housing["income_cat"] = pd.cut(
    x=housing["median_income"],
    bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
    labels=list(range(1, 6)),
)
# Show how many rows fall into each income category.
housing["income_cat"].hist()

### Create a Test Set

In [None]:
# Purely random 80/20 split with a fixed seed.
trainSet, testSet = train_test_split(housing, test_size=0.2, random_state=42)

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
# Stratified 80/20 split keyed on income_cat, with a fixed seed.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) pair of index arrays.
trainIndex, testIndex = next(split.split(housing, housing["income_cat"]))
stratTrainSet = housing.loc[trainIndex]
stratTestSet = housing.loc[testIndex]

In [None]:
# Income-category proportions in the purely random test set.
# Normalise by the size of testSet itself (not stratTestSet) so the ratios
# stay correct even if the two splits ever differ in size.
testSet["income_cat"].value_counts() / len(testSet)

In [None]:
# Income-category proportions in the stratified test set.
stratTestSet["income_cat"].value_counts() / len(stratTestSet)

In [None]:
# Income-category proportions in the full dataset.
housing["income_cat"].value_counts() / len(housing)

### Visualize the Data to Gain Insights

#### Visualizing Geographical Data

In [None]:
## Put the test set aside and only explore the training set.
# Work on a copy so that changes made to housing do not modify stratTrainSet.
housing = stratTrainSet.copy()

In [None]:
# Scatter plot of the districts by longitude and latitude.
housing.plot(kind="scatter", x="longitude", y="latitude")

In [None]:
# Same plot with mostly transparent markers (alpha=0.1).
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)

In [None]:
# Geographical scatter plot: marker size is population / 100 and marker
# colour is median_house_value on the "jet" colour map.
housing.plot.scatter(
    x="longitude",
    y="latitude",
    s=housing["population"].div(100),
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
    alpha=0.4,
    label="population",
    figsize=(10, 7),
)
plt.legend()

In [None]:
# Pairwise correlations between the numeric attributes.
# housing still contains the text column ocean_proximity (and the categorical
# income_cat); DataFrame.corr() raises on non-numeric columns in pandas >= 2.0,
# so restrict the frame to numeric dtypes first.
corrMatrix = housing.select_dtypes(include="number").corr()

In [None]:
# Correlation of every attribute with median_house_value, largest first.
corrMatrix["median_house_value"].sort_values(ascending=False)

In [None]:
# Pairwise scatter plots for four selected attributes.
attributes = [
    "median_house_value",
    "median_income",
    "total_rooms",
    "housing_median_age",
]
scatter_matrix(housing.loc[:, attributes], figsize=(12, 8))

In [None]:
## The most promising attribute to predict the median house value is the median income
# Mostly transparent markers (alpha=0.1) so dense regions stand out.
housing.plot.scatter(
    x="median_income",
    y="median_house_value",
    alpha=0.1,
)

- First, the correlation is very strong
- Second, the price cap is clearly visible as a horizontal line at 500000   
- Further horizontal lines are visible around 450000 and around 350000


In [None]:
## Try out various attribute combinations
# Ratios derived from the raw counts, added as new columns on housing.
housing["rooms_per_household"] = housing["total_rooms"].div(housing["households"])
housing["bedrooms_per_room"] = housing["total_bedrooms"].div(housing["total_rooms"])
housing["population_per_household"] = housing["population"].div(housing["households"])

In [None]:
# Recompute the correlations now that the ratio attributes have been added.
# Restrict to numeric dtypes: housing still holds the text column
# ocean_proximity, and DataFrame.corr() raises on it in pandas >= 2.0.
corrMatrix = housing.select_dtypes(include="number").corr()
corrMatrix["median_house_value"].sort_values(ascending=False)

In [None]:
# Separate the predictors from the target for the stratified training set.
# income_cat is dropped too: it was only created to drive the stratified
# split, it is derived from median_income, and as a categorical column it can
# break numeric reductions such as DataFrame.median() on recent pandas.
housing = stratTrainSet.drop(["median_house_value", "income_cat"], axis=1)
housingLabels = stratTrainSet["median_house_value"].copy()

### Data Cleaning

In [None]:
# Three ways to handle the missing total_bedrooms values.
# Options 1 and 2 return new DataFrames that are not kept here; only
# option 3 is actually applied to housing.
housing.dropna(subset=["total_bedrooms"]) # option 1: drop the affected rows
housing.drop("total_bedrooms", axis=1) # option 2: drop the whole column
median = housing["total_bedrooms"].median() # option 3: fill with the median
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated and does not
# update housing when pandas copy-on-write is active.
housing["total_bedrooms"] = housing["total_bedrooms"].fillna(median)

Scikit-Learn provides a handy class to take care of missing values: _SimpleImputer_

In [None]:
from sklearn.impute import SimpleImputer 
# Imputer configured with the "median" strategy.
imputer = SimpleImputer(strategy="median")

In [None]:
## The median can only be computed on numerical attributes, so
## create a copy of the data without the text attribute ocean_proximity:
housingNum = housing.drop("ocean_proximity", axis=1)

In [None]:
# Fit the median imputer on the columns of housingNum.
imputer.fit(housingNum)

In [None]:
# Per-column medians of housingNum, as a NumPy array.
housingNum.median().values

In [None]:
## Use this imputer to transform the training set by replacing missing values by the learned medians
# X holds the values returned by the imputer for housingNum.
X = imputer.transform(housingNum)

### Handling Text and Categorical Attributes

In [None]:
# Select ocean_proximity with a list of column names so the result stays a
# DataFrame, then preview its first 10 rows.
housingCat = housing[["ocean_proximity"]]
housingCat.head(10)

In [None]:
# Convert these categories from text to numbers
from sklearn.preprocessing import OrdinalEncoder
# Encoder created with its default settings.
ordinalEncoder = OrdinalEncoder()

In [None]:
# Fit the ordinal encoder on ocean_proximity and encode it in one step,
# then show the first 10 encoded rows.
housingCatEncoded = ordinalEncoder.fit_transform(housingCat)
housingCatEncoded[:10]

In [None]:
# Categories found by the fitted ordinal encoder.
ordinalEncoder.categories_

In [None]:
# Display the ocean_proximity frame.
housingCat

In [None]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode ocean_proximity with a default OneHotEncoder.
catEncoder = OneHotEncoder()
housingCatOnehot = catEncoder.fit_transform(housingCat)
housingCatOnehot
# The output is a SciPy sparse matrix, not a NumPy array.

In [None]:
# Categories found by the fitted one-hot encoder.
catEncoder.categories_

### Custom Transformers

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# Column positions read by CombinedAttributesAdder.transform() in the array it receives.
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features computed from existing columns of a 2-D array.

    transform() always appends rooms-per-household and
    population-per-household; bedrooms-per-room is appended as well when
    add_bedrooms_per_room is true. Columns are read by position through the
    module-level *_ix constants.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing to learn from the data; return self."""
        return self

    def transform(self, X, y=None):
        """Return X with the ratio columns appended on the right."""
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]
        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        # column_stack turns each 1-D ratio into a column, like np.c_ does.
        return np.column_stack([X, *extra_columns])

# Try the transformer on the raw values of housing, without the bedrooms-per-room column.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

### Feature Scaling

Machine Learning algorithms don’t perform well when the input numerical attributes have very different scales.  
The total number of rooms ranges from about 6 to 39,320, while the median incomes only range from 0 to 15.
- min-max scaling (normalization): subtracting the min value and dividing by the max minus the min. It has a feature_range hyperparameter that lets you change the range if you don’t want 0–1 for some reason. _MinMaxScaler_   
- standardization: subtracts the mean value (so standardized values always have a zero mean), and then it divides by the standard deviation so that the resulting distribution has unit variance. _StandardScaler_  

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Numerical preprocessing chain: median imputation, then the extra ratio
# attributes, then standardisation.
numPipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])
# Fit the chain on the numerical columns and transform them in one pass.
housingNumTr = numPipeline.fit_transform(housingNum)

In [None]:
from sklearn.compose import ColumnTransformer 
# Column names routed to each branch of the preprocessing.
numAttribs = housingNum.columns.tolist()
catAttribs = ["ocean_proximity"]
# Numerical columns go through numPipeline; ocean_proximity is one-hot encoded.
fullPipeline = ColumnTransformer(transformers=[
    ("num", numPipeline, numAttribs),
    ("cat", OneHotEncoder(), catAttribs),
])
housingPrepared = fullPipeline.fit_transform(housing)

In [None]:
from sklearn.linear_model import LinearRegression 
# Train a linear regression on the prepared training data and its labels.
linReg = LinearRegression()
linReg.fit(housingPrepared, housingLabels)

In [None]:
# Run the first five training rows (and keep their labels) through the
# already-fitted pipeline and print the model's predictions for them.
someData = housing.iloc[:5]
someLabels = housingLabels.iloc[:5]
someDataPrepared = fullPipeline.transform(someData)
print("Predictions:", linReg.predict(someDataPrepared))

In [None]:
from sklearn.metrics import mean_squared_error
# RMSE of the linear model on the data it was trained on.
housingPreedictions = linReg.predict(housingPrepared)
linMse = mean_squared_error(housingLabels, housingPreedictions) 
linRmse = np.sqrt(linMse)
linRmse

In [None]:
from sklearn.ensemble import RandomForestRegressor


In [None]:
# xgboost exposes no XGBoostClassifier, so the original import raised
# ImportError. Its scikit-learn style estimators are XGBRegressor and
# XGBClassifier; the regressor is the one that fits this price-prediction task.
from xgboost import XGBRegressor

### Using Cross-Validation

### Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
# Two hyperparameter grids: 3 x 4 combinations with the default bootstrap
# setting, plus 2 x 3 combinations with bootstrap disabled (18 candidates,
# each evaluated with 5-fold cross-validation).
paramGrid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
# Fixed seed, matching the seeded splits above, so the search returns the
# same scores and best parameters on every Restart & Run All.
forestReg = RandomForestRegressor(random_state=42)
gridSearch = GridSearchCV(forestReg, paramGrid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
gridSearch.fit(housingPrepared, housingLabels)

In [None]:
# Hyperparameter combination selected by the grid search.
gridSearch.best_params_ 


In [None]:
# Estimator selected by the grid search.
gridSearch.best_estimator_

In [None]:
# Print the cross-validated RMSE of every candidate. The scores are negated
# MSEs (scoring='neg_mean_squared_error'), hence the sign flip before the root.
cvres = gridSearch.cv_results_
for params, meanScore in zip(cvres["params"], cvres["mean_test_score"]):
    print(np.sqrt(-meanScore), params)

In [None]:
# Feature importances of the best estimator found by the grid search.
# The search object is named gridSearch in this notebook; the original
# referred to an undefined grid_search and raised NameError.
feature_importances = gridSearch.best_estimator_.feature_importances_
feature_importances

### Ensemble Methods

### Evaluate Your System on the Test Set

In [None]:
# Evaluate the selected model on the held-out stratified test set.
finalModel = gridSearch.best_estimator_

# Split the test set into predictors and target.
yTest = stratTestSet["median_house_value"].copy()
XTest = stratTestSet.drop(columns="median_house_value")

# Only transform here: the pipeline was already fitted on the training data.
XTestPrepared = fullPipeline.transform(XTest)
finalPredictions = finalModel.predict(XTestPrepared)

finalMse = mean_squared_error(yTest, finalPredictions)
finalRmse = np.sqrt(finalMse)

In [None]:
from scipy import stats
# 95% confidence interval for the test RMSE: a t-interval around the mean
# squared error, with the square root taken of both bounds.
confidence = 0.95
# Use the names defined in the evaluation cell (finalPredictions, yTest); the
# original referred to undefined final_predictions / y_test and raised NameError.
squared_errors = (finalPredictions - yTest) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                         loc=squared_errors.mean(),
                         scale=stats.sem(squared_errors)))