In [None]:
import os
import tarfile
from six.moves import urllib
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Where the housing archive lives remotely and where it is kept locally.
DOWNLOADROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSINGPATH = os.path.join("datasets", "housing")
HOUSINGURL = "{}datasets/housing/housing.tgz".format(DOWNLOADROOT)
# Function for downloading the data
def fetchHousingData(housingUrl = HOUSINGURL, housingPath = HOUSINGPATH):
    """Download the housing archive and unpack it into ``housingPath``.

    Parameters
    ----------
    housingUrl : str
        URL of the ``housing.tgz`` archive to download.
    housingPath : str
        Local directory that receives the downloaded archive and its
        extracted contents. It is created when it does not exist yet.
    """
    if not os.path.isdir(housingPath):
        os.makedirs(housingPath)
    tgzPath = os.path.join(housingPath, "housing.tgz")
    urllib.request.urlretrieve(housingUrl, tgzPath)
    # The context manager closes the archive even when extraction raises,
    # so the file handle is never leaked.
    # NOTE(review): extractall() writes whatever member paths the archive
    # contains; only point housingUrl at a source you trust.
    with tarfile.open(tgzPath) as housingTgz:
        housingTgz.extractall(path=housingPath)

In [None]:
# Function for loading the data
def loadHousingData(housingPath = HOUSINGPATH):
    """Download the housing dataset and return it as a DataFrame.

    Parameters
    ----------
    housingPath : str
        Directory the archive is downloaded and extracted into, and from
        which ``housing.csv`` is then read.

    Returns
    -------
    pandas.DataFrame
        The contents of ``housing.csv``.
    """
    # Forward housingPath so the download lands in the same directory the
    # CSV is read from; calling fetchHousingData() without it would always
    # download into the default directory.
    fetchHousingData(housingPath=housingPath)
    csvPath = os.path.join(housingPath, "housing.csv")
    return pd.read_csv(csvPath)

In [None]:
# Download the housing archive and load housing.csv into a DataFrame.
housingData = loadHousingData()
# Display the first rows of the data as the cell output.
housingData.head()

In [None]:
# Print a quick description of the data: in particular the total number
# of rows, and each attribute's type and number of non-null values.
housingData.info()

In [None]:
# For the categorical attribute, list each distinct value it takes
# together with the number of rows that fall into that category.
housingData["ocean_proximity"].value_counts()

In [None]:
# Show a summary of the numerical attributes.
housingData.describe()

In [None]:
# Plot a 50-bin histogram per attribute on a single 20x15 figure.
housingData.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
# Code for splitting the data into training and test sets
def split_training_set(data, test_ratio, random_state=None):
    """Randomly split ``data`` into a training set and a test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split; they are selected by position, so any index works.
    test_ratio : float
        Fraction of the rows placed in the test set. The test size is
        ``int(len(data) * test_ratio)``, i.e. rounded down.
    random_state : int or None, optional
        Seed for the shuffle. With the default ``None`` the global NumPy
        random state is used, so each call gives a different split; pass an
        integer to get the same split on every run.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train_set, test_set)``.
    """
    # A private RandomState keeps a seeded split from touching the global
    # NumPy random state used elsewhere in the notebook.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    shuffled_indices = rng.permutation(len(data))
    size_test_set = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:size_test_set]
    train_indices = shuffled_indices[size_test_set:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [None]:
# Hold out 20% of the rows as a test set using the hand-written splitter,
# then report the size of each part.
train_set, test_set = split_training_set(data=housingData, test_ratio=0.2)
n_train, n_test = len(train_set), len(test_set)
print(n_train, " train +", n_test, " test")

In [None]:
# scikit-learn's built-in counterpart of split_training_set; the fixed
# random_state is passed so the same split is produced on every run.
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housingData, test_size=0.2, random_state=42)

In [None]:
# Bucket median income into categories: ceil(income / 1.5), with every
# category that is not below 5 merged into category 5.0.
housingData["income_cat"] = np.ceil(housingData["median_income"] / 1.5)
# Assign the capped column back explicitly. Calling where(..., inplace=True)
# on housingData["income_cat"] modifies a column selection, which does not
# update housingData when pandas Copy-on-Write is active.
housingData["income_cat"] = housingData["income_cat"].where(housingData["income_cat"] < 5, 5.0)

In [None]:
# Histogram of the income categories, to see how the rows are spread
# across them.
housingData["income_cat"].hist(bins=50,figsize=(20,15))
plt.show()

In [None]:
# Stratified shuffle split: keep the income-category proportions the same
# in the training and test sets.
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so take the single (train, test) pair directly.
train_index, test_index = next(split.split(housingData, housingData["income_cat"]))
# split() yields positional row indices, so select with .iloc; .loc would
# only pick the right rows while the index happens to be 0..n-1.
# copy() makes both subsets independent frames, so later in-place edits act
# on them rather than on selections of housingData.
strat_train_set = housingData.iloc[train_index].copy()
strat_test_set = housingData.iloc[test_index].copy()

In [None]:
# Share of each income category within the stratified test set.
income_cat_counts = strat_test_set["income_cat"].value_counts()
income_cat_counts / len(strat_test_set)

In [None]:
# Remove the helper column now that the stratified split is done.
# Rebinding the names (instead of dropping in place) and errors="ignore"
# keep this cell safe to re-run: a second run finds the column already gone
# and leaves the frames unchanged instead of raising KeyError.
strat_train_set = strat_train_set.drop("income_cat", axis=1, errors="ignore")
strat_test_set = strat_test_set.drop("income_cat", axis=1, errors="ignore")

In [None]:
# Explore on a copy so that strat_train_set itself is left unmodified.
housing_data_copy = strat_train_set.copy()

In [None]:

# Scatter plot of the rows by longitude / latitude.
housing_data_copy.plot(x="longitude", y="latitude", kind="scatter")
plt.show()

In [None]:
# Same scatter plot with mostly transparent markers, so that areas with a
# high density of data points stand out.
housing_data_copy.plot(x="longitude", y="latitude", kind="scatter", alpha=0.1)
plt.show()

In [None]:
# Marker size follows population / 100 and marker colour follows
# median_house_value, drawn with the predefined "jet" colour map
# (which ranges from blue for low values to red for high prices).
marker_sizes = housing_data_copy["population"] / 100
jet_cmap = plt.get_cmap("jet")
housing_data_copy.plot(
    kind="scatter", x="longitude", y="latitude",
    alpha=0.4, s=marker_sizes, label="population", figsize=(10, 7),
    c="median_house_value", cmap=jet_cmap, colorbar=True,
)
plt.legend()
plt.show()

In [None]:
# Correlation between every pair of numerical attributes.
# Restrict to numeric columns first: the frame also holds the string-valued
# ocean_proximity column, and DataFrame.corr() raises on non-numeric data
# in pandas >= 2.0 instead of silently skipping it.
corr_matrix = housing_data_copy.select_dtypes(include=[np.number]).corr()