In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import tarfile
import urllib.request

In [None]:
def load_housing_data():
  """Return the housing dataset as a DataFrame, fetching it if necessary.

  The CSV at ``datasets/housing/housing.csv`` is read directly when it is
  already on disk. Otherwise ``datasets/housing.tgz`` is downloaded (only if
  missing) and unpacked into ``datasets`` first.

  Returns:
    pandas.DataFrame with the contents of ``datasets/housing/housing.csv``.

  Raises:
    urllib.error.URLError: if a download is needed and fails.
    tarfile.TarError: if the archive cannot be read or is rejected by the
      extraction filter.
  """
  datasets_dir = Path('datasets')
  tarball_path = datasets_dir / 'housing.tgz'
  csv_path = datasets_dir / 'housing' / 'housing.csv'
  # Key on the CSV, not the tarball: a tarball left behind by an earlier run
  # whose extraction never happened must still be unpacked.
  if not csv_path.is_file():
    if not tarball_path.is_file():
      datasets_dir.mkdir(parents=True, exist_ok=True)
      url = "https://github.com/ageron/data/raw/main/housing.tgz"
      # Download to a temporary name and rename afterwards so an interrupted
      # download is never mistaken for a complete archive on the next run.
      partial_path = tarball_path.with_name(tarball_path.name + '.part')
      urllib.request.urlretrieve(url, partial_path)
      partial_path.replace(tarball_path)
    with tarfile.open(tarball_path) as tarball_file:
      # The 'data' filter blocks absolute paths, '..' traversal and special
      # files; it only exists on Python versions that ship extraction filters.
      if hasattr(tarfile, 'data_filter'):
        tarball_file.extractall(path=datasets_dir, filter='data')
      else:
        tarball_file.extractall(path=datasets_dir)
  return pd.read_csv(csv_path)

In [None]:
# Load the housing CSV into a DataFrame (downloads the archive on first run).
housing_data = load_housing_data()

In [None]:
# Preview the first five rows.
housing_data.head()

In [None]:
# Column dtypes and non-null counts; a quick way to spot missing values.
housing_data.info()

In [None]:
# Number of rows for each distinct `ocean_proximity` value.
housing_data["ocean_proximity"].value_counts()

In [None]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
housing_data.describe()

In [None]:
# One 50-bin histogram per numeric column; the trailing ';' hides the Axes repr.
housing_data.hist(bins=50, figsize=(12, 8));

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Purely random 80/20 split; the fixed random_state makes it repeatable.
train_set, test_set = train_test_split(housing_data, test_size=0.2, random_state=42)

In [None]:
# Row count of the random training split.
len(train_set)

In [None]:
# Row count of the random test split.
len(test_set)

In [None]:
# Bucket median_income into five ordered categories (labels 1-5) with edges
# at 1.5, 3.0, 4.5 and 6.0; the last bucket is open-ended. The new column is
# used further down to stratify the train/test split.
housing_data["income_cat"] = pd.cut(
    x=housing_data["median_income"],
    bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
    labels=list(range(1, 6)),
)

In [None]:
# Bar chart of rows per income category, ordered by category label.
housing_data["income_cat"].value_counts().sort_index().plot.bar(rot=0, grid=True);

In [None]:
# Stratified 80/20 split: each side keeps the income_cat proportions of the
# full dataset. The seed is fixed so the split is repeatable.
new_train_set, new_test_set = train_test_split(
    housing_data,
    stratify=housing_data["income_cat"],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Share of each income category in the stratified test set (count / total rows).
new_test_set["income_cat"].value_counts() / len(new_test_set)

# Let's Visualize Our Data

In [None]:
# Plain scatter of longitude against latitude.
housing_data.plot(kind="scatter", x="longitude", y="latitude", grid=True);

In [None]:
# Same scatter with alpha=0.2 so overlapping points show up as darker regions.
housing_data.plot(kind="scatter", x="longitude", y="latitude", grid=True, alpha=0.2);

In [None]:
# Marker size follows population (divided by 100 to keep markers small) and
# marker colour encodes median_house_value on the "jet" colormap.
housing_data.plot(kind="scatter", x="longitude", y="latitude", grid=True,
    s=housing_data["population"] / 100, label="population",
    c="median_house_value", cmap="jet", colorbar=True,
    legend=True, sharex=False, figsize=(10, 7));

# Correlation Coefficient

In [None]:
# Keep only numeric columns (non-numeric ones such as `ocean_proximity` are
# dropped) before computing correlations.
housing_num = housing_data.select_dtypes(include=[np.number])

In [None]:
# Pairwise correlation matrix of the numeric columns (pandas default: Pearson).
corr_matrix = housing_num.corr()

In [None]:
# Correlation of every numeric column with the target, strongest positive first.
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix

In [None]:
# Columns to compare pairwise in the scatter matrix.
attributes = [
    "median_house_value",
    "median_income",
    "total_rooms",
    "housing_median_age",
]

In [None]:
# Grid of pairwise scatter plots for the selected columns.
scatter_matrix(housing_data[attributes], figsize=(12, 8));

In [None]:
# Closer look at median_income against median_house_value; the low alpha
# keeps heavily overlapping points readable.
housing_data.plot(kind="scatter", x="median_income", y="median_house_value",
alpha=0.1, grid=True);

# Let's Experiment

In [None]:
# Derived ratio features, added in place to the full dataset for exploration.
# NOTE(review): the train/test frames were split off before this cell, so they
# do not carry these columns.
housing_data["rooms_per_house"] = housing_data["total_rooms"].div(housing_data["households"])
housing_data["bedrooms_ratio"] = housing_data["total_bedrooms"].div(housing_data["total_rooms"])
housing_data["people_per_house"] = housing_data["population"].div(housing_data["households"])

In [None]:
# Rebuild the numeric view so it includes the three ratio columns added above.
housing_num = housing_data.select_dtypes(include=[np.number])

In [None]:
# Recompute the correlation matrix, now covering the derived ratio columns.
corr_matrix = housing_num.corr()

In [None]:
# See how the new ratio columns rank against the original ones.
corr_matrix["median_house_value"].sort_values(ascending=False)

# Let's Prepare the Data for the Machine Learning Model

In [None]:
# Separate the target from the predictors of the stratified training set.
# The labels are copied so they are independent of `new_train_set`.
housing_labels = new_train_set["median_house_value"].copy()
housing = new_train_set.drop(columns=["median_house_value"])

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Imputer that fills missing values with the median of each column.
imputer = SimpleImputer(strategy="median")

In [None]:
# Numeric columns of the training predictors only. This rebinds `housing_num`,
# which earlier cells used for the numeric view of the full dataset.
housing_num = housing.select_dtypes(include=[np.number])

In [None]:
# Learn the per-column medians from the training predictors.
imputer.fit(housing_num)

In [None]:
# Values the imputer learned, one per column of `housing_num`.
imputer.statistics_

In [None]:
# Sanity check: should equal `imputer.statistics_`, since the strategy is "median".
housing_num.median().values

In [None]:
# Replace missing values with the learned medians.
X = imputer.transform(housing_num)

In [None]:
# Wrap the imputed values back into a DataFrame, restoring the original column
# names and row index so later joins line up.
imputed_df = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

In [None]:
# Confirm the non-null counts after imputation.
imputed_df.info()

# Handling Text & Categorical Attributes

In [None]:
# Double brackets keep this a one-column DataFrame (2-D) rather than a Series.
housing_cat = housing[["ocean_proximity"]]

In [None]:
# Preview the first ten category values.
housing_cat.head(10)

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encoder with default settings.
cat_encoder = OneHotEncoder()

In [None]:
# Learn the categories and encode the column in one step; the result is
# converted with `.toarray()` in the following cell.
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)

In [None]:
# Convert the encoder output to a dense NumPy array. The guard keeps this cell
# safe to re-run: once `housing_cat_1hot` is already an ndarray it no longer
# has a `.toarray()` method and the unguarded call would raise AttributeError.
if hasattr(housing_cat_1hot, "toarray"):
    housing_cat_1hot = housing_cat_1hot.toarray()

In [None]:
# Label the one-hot array with the encoder's feature names and keep the
# training index so it can be joined with the numeric features.
encoded_df = pd.DataFrame(
    data=housing_cat_1hot,
    index=housing_cat.index,
    columns=cat_encoder.get_feature_names_out(),
)

In [None]:
# Preview the one-hot columns.
encoded_df.head()

# Feature Scaling

In [None]:
# Histograms of the numeric training predictors, to judge which need rescaling.
housing.hist(bins=50, figsize=(12, 8));

In [None]:
# Distribution of the raw `population` values.
plt.hist(housing["population"], bins=50);

In [None]:
# Same column after a natural-log transform, for comparison with the raw histogram.
plt.hist(np.log(housing["population"]), bins=50);

In [None]:
# Columns that receive a log transform before standardization.
scale_attributes = [
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
]

In [None]:
# Remaining numeric columns, which are standardized without a log transform.
rem_attributes = [
    "latitude",
    "longitude",
    "housing_median_age",
]

In [None]:
# Full column order: untransformed columns first, then the log-transformed ones.
column_names = [*rem_attributes, *scale_attributes]

In [None]:
# Independent copy of the columns that are not log-transformed.
rem_df = imputed_df[rem_attributes].copy()

In [None]:
# Independent copy of the columns to be log-transformed. Despite the name,
# nothing has been scaled yet at this point.
scaled_df = imputed_df[scale_attributes].copy()

In [None]:
# Element-wise natural log of the selected columns.
# NOTE(review): assumes every value is strictly positive (a zero would become
# -inf and break the scaler) — confirm for this dataset.
df_log = np.log(scaled_df)

In [None]:
# Recombine on the shared index: untransformed columns first, then the logged ones.
nor_df = rem_df.join(df_log)

In [None]:
# Preview the combined frame before standardization.
nor_df.head()

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardizer with default settings (zero mean, unit variance per column).
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training features and standardize them.
# NOTE: this rebinds `scaled_df` from the DataFrame of raw columns created
# earlier to the scaler's output, so re-running the earlier log-transform cell
# after this one would operate on the wrong object.
scaled_df = scaler.fit_transform(nor_df)

In [None]:
# Put the standardized values back into a DataFrame. Labels are taken from
# `nor_df` itself (the frame that was scaled) rather than from a separately
# built name list, so they cannot drift out of step with the column order.
scaled_df_ = pd.DataFrame(scaled_df, columns=nor_df.columns, index=nor_df.index)

In [None]:
# Preview the standardized numeric features.
scaled_df_.head()

In [None]:
# Final training matrix: standardized numeric features plus the one-hot
# columns, joined on the shared row index.
df_to_train = scaled_df_.join(encoded_df)

# Training a Model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Linear regression model with default settings.
lr = LinearRegression()

In [None]:
# Preview the training matrix before fitting.
df_to_train.head()

In [None]:
# Fit the model on the prepared features against the median_house_value labels.
lr.fit(df_to_train, housing_labels)

In [None]:
# Re-display the training matrix after fitting (same call as the cell before `lr.fit`).
df_to_train.head()

In [None]:
# Predict on the same frame the model was fitted on, so these are in-sample predictions.
housing_predictions = lr.predict(df_to_train)

# Let's Evaluate the Model

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Root mean squared error of the in-sample predictions.
# The `squared=False` argument was deprecated in scikit-learn 1.4 and removed
# in 1.6; taking the square root of the MSE gives the same value on every
# scikit-learn version.
rmse = np.sqrt(mean_squared_error(housing_labels, housing_predictions))

In [None]:
# Display the RMSE. This is training error: the predictions were made on the
# same data the model was fitted on.
rmse