# Week 2 - End-to-end ML project
1. Get the data
2. Data exploration
3. Prepare datasets
4. Choose the models
5. Analyze the model


This project requires Python 3.7 or above:

In [None]:
import sys

# Fail fast if the interpreter is older than Python 3.7.
assert sys.version_info >= (3, 7)

It also requires Scikit-Learn ≥ 1.0.1:

In [None]:
from packaging import version
import sklearn

# Compare parsed version objects rather than raw strings, so that the
# comparison is numeric per component and not lexicographic.
assert version.parse(sklearn.__version__) >= version.parse("1.0.1")

## 1. Get the Data

Our task is to predict median house values in Californian districts, given a number of features from these districts.

In [None]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request

def load_housing_data():
    """Return the California housing dataset as a pandas DataFrame.

    The CSV is read from ``datasets/housing/housing.csv``. If that file is
    missing, the tarball ``datasets/housing.tgz`` is downloaded first (only
    when it is not already on disk) and then unpacked into ``datasets``.

    Returns:
        pandas.DataFrame: contents of ``housing.csv``.
    """
    datasets_dir = Path("datasets")
    tarball_path = datasets_dir / "housing.tgz"
    csv_path = datasets_dir / "housing" / "housing.csv"

    # Key the work on the CSV, not on the tarball: a tarball that was
    # downloaded but never (or only partially) extracted must still be
    # unpacked, otherwise read_csv would fail on every subsequent run.
    if not csv_path.is_file():
        if not tarball_path.is_file():
            datasets_dir.mkdir(parents=True, exist_ok=True)
            url = "https://github.com/ageron/data/raw/main/housing.tgz"
            urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            # The archive comes from the network: use the safe "data"
            # extraction filter where the running Python supports it.
            if hasattr(tarfile, "data_filter"):
                housing_tarball.extractall(path=datasets_dir, filter="data")
            else:
                housing_tarball.extractall(path=datasets_dir)
    return pd.read_csv(csv_path)

# Load the housing CSV into a DataFrame (fetching it first if necessary).
housing = load_housing_data()

## 2. Data exploration

In [None]:
# Preview the first few rows of the dataset.
housing.head()

In [None]:
# Column names, non-null counts and dtypes.
housing.info()

In [None]:
# Summary statistics of the numeric columns.
housing.describe()

### Visualize the data

In [None]:
import matplotlib.pyplot as plt

# Global font sizes applied to every figure drawn from here on.
plt.rcParams.update({
    "font.size": 14,
    "axes.labelsize": 14,
    "axes.titlesize": 14,
    "legend.fontsize": 14,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
})

# Histogram (50 bins) of each numeric column, all in one 12x8 figure.
housing.hist(figsize=(12, 8), bins=50)
plt.show()

### Create a Test Set

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split
# reproducible from run to run.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
len(train_set)

### Discover and Visualize the Data to Gain Insights

In [None]:
# Explore a copy so that columns added during exploration do not end up in
# `train_set`.
housing = train_set.copy()

### Visualizing Geographical Data

In [None]:
# Plain longitude/latitude scatter plot of the training rows.
train_set.plot(kind="scatter", x="longitude", y="latitude", grid=True)
plt.show()

In [None]:
# Same plot with translucent markers (alpha=0.2) so that overlapping points
# show up as darker, denser regions.
housing.plot(kind="scatter", x="longitude", y="latitude", grid=True, alpha=0.2)
plt.show()

In [None]:
# Marker size is population / 100; marker color encodes median_house_value
# on the "jet" colormap, with a colorbar for reference.
housing.plot(kind="scatter", x="longitude", y="latitude", grid=True,
             s=housing["population"] / 100, label="population",
             c="median_house_value", cmap="jet", colorbar=True,
             legend=True, sharex=False, figsize=(10, 7))
plt.show()

### Looking for Correlations

In [None]:
# Correlate the numeric columns only: `ocean_proximity` holds text, and
# pandas >= 2.0 raises on non-numeric columns instead of silently skipping
# them. select_dtypes keeps this working on older pandas releases as well.
corr_matrix = housing.select_dtypes(include="number").corr()

In [None]:
# Display the full correlation matrix.
corr_matrix

In [None]:
# Correlation of each attribute with the target, largest value first.
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots for the target and three selected attributes.
attributes = ["median_house_value", "median_income", "total_rooms",
              "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))
plt.show()

### Experimenting with Attribute Combinations (a.k.a. creating new attributes)

In [None]:
# Derive ratio features from the raw totals: rooms and people per household,
# and the share of rooms that are bedrooms.
housing["rooms_per_house"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_ratio"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["people_per_house"] = housing["population"] / housing["households"]

In [None]:
# Recompute the correlations now that the derived ratio columns exist.
# Restrict to numeric columns: `ocean_proximity` holds text, and
# pandas >= 2.0 raises on non-numeric columns instead of skipping them.
corr_matrix = housing.select_dtypes(include="number").corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

## 3. Prepare datasets

In [None]:
# Start again from the untouched training set and separate the predictors
# from the target column. The columns added to the exploration copy are not
# carried over, because `housing` is rebuilt from `train_set`.
housing = train_set.drop("median_house_value", axis=1)
housing_labels = train_set["median_house_value"].copy()

### Data Cleaning

Missing values – three ways to address them:
- Drop the districts (rows) with NaN values
- Drop the columns with NaN values
- Replace the NaN values with a chosen value (zero, mean, median, etc.)

```python
housing.dropna(subset=["total_bedrooms"], inplace=True)    # option 1

housing.drop("total_bedrooms", axis=1)       # option 2

median = housing["total_bedrooms"].median()  # option 3
housing["total_bedrooms"].fillna(median, inplace=True)
```


In [None]:
# Boolean mask selecting the rows that contain at least one missing value.
null_rows_idx = housing.isnull().any(axis=1)
housing.loc[null_rows_idx].head()

In [None]:
# Per column: does it contain at least one missing value?
housing.isnull().any()

In [None]:
median = housing["total_bedrooms"].median()
# Assign the filled column back instead of calling fillna(inplace=True) on
# the column selection: the chained in-place form emits a FutureWarning in
# pandas 2.x and leaves `housing` unchanged under Copy-on-Write.
housing["total_bedrooms"] = housing["total_bedrooms"].fillna(median)  # option 3

# Re-display the rows that previously had missing values.
housing.loc[null_rows_idx].head()

### Handling Text and Categorical Attributes

Now let's preprocess the categorical input feature, `ocean_proximity`:
- Ordinal encoder
- One-hot encoder

In [None]:
# Double brackets keep the result a one-column DataFrame (2-D) rather than a
# Series.
housing_cat = housing[["ocean_proximity"]]
housing_cat.head(8)

In [None]:
# Distinct category values present in the column.
housing_cat['ocean_proximity'].unique()

In [None]:
from sklearn.preprocessing import OrdinalEncoder

def ordinal_encoder(catagorical_data):
    """Return the ordinal (integer-code) encoding of ``catagorical_data``.

    A new ``OrdinalEncoder`` is created and fitted on the given data on every
    call, so the category-to-code mapping is derived only from the values
    present in this particular input.
    """
    encoder = OrdinalEncoder()
    encoder.fit(catagorical_data)
    return encoder.transform(catagorical_data)

In [None]:
# Encode the training categories as ordinal codes and show the first 8.
housing_cat_encoded = ordinal_encoder(housing_cat)
housing_cat_encoded[:8]

In [None]:
from sklearn.preprocessing import OneHotEncoder

def onehot_encoder(categorical_data):
    """Return a dense one-hot encoding of ``categorical_data``.

    A new ``OneHotEncoder`` is fitted on the given data on every call. The
    encoder's default sparse output is converted with ``.toarray()`` instead
    of passing ``sparse=False``: that keyword was renamed to
    ``sparse_output`` in scikit-learn 1.2 and removed in 1.4, whereas this
    form works on every release from the notebook's 1.0.1 minimum onward.
    """
    cat_encoder = OneHotEncoder()
    return cat_encoder.fit_transform(categorical_data).toarray()
# One-hot encode the training categories and display the result.
housing_cat_1hot = onehot_encoder(housing_cat)
housing_cat_1hot

In [None]:
# Use the ordinal codes (not the one-hot matrix) as the categorical feature,
# then drop the original text column so every remaining column is numeric.
housing['ocean_proximity_encoded'] = housing_cat_encoded
housing_prep = housing.drop(columns=['ocean_proximity'])

## 4. Choose the models

### Training and Evaluating on the Training Set

In [None]:
# Display the prepared training features.
housing_prep

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the prepared training features and labels.
lin_reg = LinearRegression()
lin_reg.fit(housing_prep, housing_labels)

In [None]:
# Predict on the same rows the model was fitted on and show the first 5.
housing_predictions = lin_reg.predict(housing_prep)
housing_predictions[:5]

In [None]:
from sklearn.metrics import mean_squared_error

# Training-set RMSE. Take the square root explicitly instead of passing
# squared=False: that keyword is deprecated since scikit-learn 1.4 and
# removed in 1.6, while this form works on every supported release.
lin_rmse = mean_squared_error(housing_labels, housing_predictions) ** 0.5
lin_rmse

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Seed the tree so repeated runs build the same model and report the same
# scores, consistent with the seeded train/test split.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prep, housing_labels)

In [None]:
# RMSE on the very rows the tree was fitted on, so this measures fit to the
# training data, not generalization. The square root is taken explicitly
# because squared=False is deprecated since scikit-learn 1.4 and removed
# in 1.6.
housing_predictions = tree_reg.predict(housing_prep)
tree_rmse = mean_squared_error(housing_labels, housing_predictions) ** 0.5
tree_rmse

## 5. Analyze the model

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Evaluate both fitted models on the held-out test set.
# Every preprocessing statistic (imputation median, category-to-code mapping)
# is learned from the training set and only applied here. Re-deriving them
# from the test set would leak test information and could assign different
# integer codes to the same category whenever the test set does not contain
# exactly the same categories as the training set.
X_test = test_set.drop("median_house_value", axis=1)
y_test = test_set["median_house_value"].copy()

train_median = train_set["total_bedrooms"].median()  # option 3
# Assign back rather than using a chained fillna(inplace=True), which does
# not modify X_test under pandas Copy-on-Write.
X_test["total_bedrooms"] = X_test["total_bedrooms"].fillna(train_median)

# Fitting on the training categories reproduces the mapping used to build the
# training features; an unseen test category raises instead of being
# silently mis-encoded.
test_cat_encoder = OrdinalEncoder().fit(train_set[["ocean_proximity"]])
X_test['ocean_proximity_encoded'] = test_cat_encoder.transform(
    X_test[['ocean_proximity']]).ravel()
X_test = X_test.drop(columns=['ocean_proximity'])

final_predictions_lin = lin_reg.predict(X_test)
final_predictions_tree = tree_reg.predict(X_test)

# Explicit square root: squared=False was removed in scikit-learn 1.6.
final_rmse_lin = mean_squared_error(y_test, final_predictions_lin) ** 0.5
final_rmse_tree = mean_squared_error(y_test, final_predictions_tree) ** 0.5

print('Final rmse for linear regressor: ', final_rmse_lin)
print('Final rmse for tree regressor: ', final_rmse_tree)