In [None]:
import os
import tarfile
import urllib.request
import pandas as pd

# Local directory (relative to the working directory) that holds the dataset.
HOUSING_PATH = os.path.join("datasets", "housing")

# Remote repository root and the full URL of the housing archive beneath it.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_URL = f"{DOWNLOAD_ROOT}datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL,housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the gzipped tar archive to download.
        housing_path: Directory that receives both the downloaded
            ``housing.tgz`` file and its extracted contents. It is created
            if it does not already exist.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        # The archive was just downloaded, so treat it as untrusted: where the
        # interpreter supports extraction filters, "data" refuses members that
        # would be written outside housing_path.
        if hasattr(tarfile, "data_filter"):
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            housing_tgz.extractall(path=housing_path)


def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` into a pandas DataFrame.

    Args:
        housing_path: Directory containing ``housing.csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file.
    """
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [None]:
# Download and unpack the archive, then read the CSV into a DataFrame.
fetch_housing_data()
housing = load_housing_data()

# Inspect the DataFrame: first rows, column summary, descriptive statistics,
# and the value counts of the ocean_proximity column.
print(housing.head())
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
housing.info()
print(housing.describe())
print(housing["ocean_proximity"].value_counts())

In [None]:
# Data visualization: histograms of the DataFrame's columns, 50 bins each.
import matplotlib.pyplot as plt

housing.hist(figsize=(20, 15), bins=50)
plt.show()

In [None]:
# Splitting the data into training and testing sets: a seeded random split
# that holds out 25% of the rows for testing.
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(
    housing,
    random_state=42,
    test_size=0.25,
)

In [None]:
# Defining the income categories: bucket median_income into five labelled
# bins, then plot how many rows fall into each one.
import numpy as np

housing["income_cat"] = pd.cut(
    housing["median_income"],
    labels=[1, 2, 3, 4, 5],
    bins=[0., 1.5, 3, 4.5, 6., np.inf],
)
housing["income_cat"].hist()
plt.show()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Stratified 80/20 split that preserves the income_cat proportions.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split.split(housing, housing["income_cat"]))

# split() yields positional row indices, so rows are selected with .iloc;
# .loc would treat them as index labels, which is only correct while the
# DataFrame still carries the default 0..n-1 index.
# Dropping the income_cat column from both sets: drop() returns new frames
# here, so no slice of `housing` is mutated in place.
strat_train_set = housing.iloc[train_index].drop("income_cat", axis=1)
strat_test_set = housing.iloc[test_index].drop("income_cat", axis=1)

In [None]:
#exploring the data through visualization:
# Work on a copy of the stratified training set.
housing = strat_train_set.copy()
# Longitude/latitude scatter; alpha=0.1 keeps the markers translucent.
housing.plot(
    x="longitude",
    y="latitude",
    kind="scatter",
    alpha=0.1,
)

In [None]:
#exploring the data through visualization:
# Work on a fresh copy of the stratified training set.
housing = strat_train_set.copy()
# Longitude/latitude scatter: marker size follows population / 100 and
# marker color follows median_house_value on the "jet" colormap.
housing.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    s=housing["population"]/100,
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    alpha=0.4,
    label="population",
    figsize=(10, 7),
)
plt.legend()
plt.show()

In [None]:
#exploring the data through a correlation matrix: