In [23]:
import os
import tarfile
import urllib
import urllib.request

import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from pandas.plotting import scatter_matrix

# Base URL under which the dataset archive is published.
dr = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
# Full URL of the compressed housing archive.
houseurl = f"{dr}datasets/housing/housing.tgz"
# Local directory (relative to the working directory) that holds the dataset.
housepath = os.path.join("datasets", "housing")

def fetch_housing_data(housing_url=houseurl, housing_path=housepath):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the gzipped tar archive to download.
    housing_path : str
        Directory that receives both the downloaded ``housing.tgz`` and its
        extracted contents. It is created if it does not exist.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        # The archive was just fetched over the network, so refuse members
        # that would escape ``housing_path`` when the running Python supports
        # extraction filters (signalled by ``tarfile.data_filter``).
        if hasattr(tarfile, "data_filter"):
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            housing_tgz.extractall(path=housing_path)

def load_house_data(hp=housepath):
    """Read ``housing.csv`` from directory ``hp`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(hp, "housing.csv"))

def split_train_test(data, ratio, random_state=None):
    """Randomly split ``data`` into a training part and a test part.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split; rows are selected by position via ``.iloc``.
    ratio : float
        Fraction of rows assigned to the test part, between 0 and 1
        inclusive. The test size is ``int(len(data) * ratio)``.
    random_state : int, optional
        Seed for a private random generator, making the split reproducible.
        When omitted, NumPy's global random state is used, so a prior
        ``np.random.seed(...)`` call still controls the shuffle.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``.

    Raises
    ------
    ValueError
        If ``ratio`` lies outside ``[0, 1]``.
    """
    if not 0 <= ratio <= 1:
        # A negative or >1 ratio would otherwise slice silently into a
        # nonsensical split instead of failing.
        raise ValueError(f"ratio must be between 0 and 1, got {ratio!r}")
    n_rows = len(data)
    if random_state is None:
        shuffled = np.random.permutation(n_rows)
    else:
        shuffled = np.random.default_rng(random_state).permutation(n_rows)
    testsize = int(n_rows * ratio)
    test_indices = shuffled[:testsize]
    train_indices = shuffled[testsize:]
    return data.iloc[train_indices], data.iloc[test_indices]

housing = load_house_data()

# Quick-look helpers for interactive exploration:
# housing.head()
# housing.info()
# housing['ocean_proximity'].value_counts()
# housing.describe()
# housing.hist(bins=50, figsize=(20,15))
# plt.show()

# Purely random 80/20 split with a fixed seed.
# Hand-rolled alternative: traindf, testdf = split_train_test(housing, .2)
traindf, testdf = train_test_split(housing, test_size=.2, random_state=42)

# Stratified sampling: bucket median_income into five categories and keep
# the category proportions equal in the train and test parts.
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])
split = StratifiedShuffleSplit(n_splits=1, test_size=.2, random_state=42)
# split() yields positional row numbers, so select with .iloc; label-based
# .loc only gives the same rows while the index is a default RangeIndex.
train_ind, test_ind = next(split.split(housing, housing["income_cat"]))
strat_train_set = housing.iloc[train_ind]
strat_test_set = housing.iloc[test_ind]

# Share of each income category in the stratified test set (inspect by name;
# a bare expression in the middle of a cell is not displayed).
income_cat_proportions = strat_test_set["income_cat"].value_counts() / len(strat_test_set)

# income_cat was only needed for stratification. Rebind instead of dropping
# in place so no frame derived from `housing` is mutated.
strat_train_set = strat_train_set.drop(columns="income_cat")
strat_test_set = strat_test_set.drop(columns="income_cat")
housing = strat_train_set.copy()
# Optional geographic scatter plots of the training set:
# housing.plot(kind="scatter", x = "longitude", y = "latitude")
# housing.plot(kind="scatter", x = "longitude", y = "latitude", alpha=.1)
# housing.plot(kind = "scatter", x = "longitude", y = "latitude", alpha = .4,
#              s = housing["population"]/100, label = "population", figsize = (10,7),
#             c = "median_house_value", cmap = plt.get_cmap("jet"), colorbar = True,)
# plt.legend()

# Correlate numeric columns only: the frame still holds the non-numeric
# ocean_proximity column, and DataFrame.corr() raises on such columns in
# pandas >= 2.0 instead of silently skipping them.
corr_matrix = housing.select_dtypes(include="number").corr()
# Bound to a name because a bare expression mid-cell is not displayed.
house_value_corr = corr_matrix['median_house_value'].sort_values(ascending = False)
attributes = ['median_house_value', 'median_income', 'total_rooms', 'housing_median_age']
# scatter_matrix(housing[attributes], figsize=(12,8))
# housing.plot(kind = 'scatter', x = 'median_income', y = 'median_house_value', alpha = .1)

# Derived ratio features, then recompute the correlations including them.
housing['rooms_per_household'] = housing['total_rooms'] / housing['households']
housing['bedrooms_per_room'] = housing['total_bedrooms'] / housing['total_rooms']
housing['population_per_household'] = housing['population'] / housing['households']
corr_matrix = housing.select_dtypes(include="number").corr()
house_value_corr = corr_matrix['median_house_value'].sort_values(ascending = False)

# Separate the target column from the predictors of the stratified training set.
housing_labels = strat_train_set['median_house_value'].copy()
housing = strat_train_set.drop(columns='median_house_value')

# Median imputation is fitted on the frame without the ocean_proximity column.
housing_num = housing.drop(columns='ocean_proximity')
imputer = SimpleImputer(strategy='median').fit(housing_num)
# To inspect the fitted values, compare imputer.statistics_ with
# housing_num.median().values

# transform() output is wrapped back into a DataFrame that reuses the
# original column names and row index.
x = imputer.transform(housing_num)
htr = pd.DataFrame(x, index=housing_num.index, columns=housing_num.columns)
htr.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
17606,-121.89,37.29,38.0,1568.0,351.0,710.0,339.0,2.7042
18632,-121.93,37.05,14.0,679.0,108.0,306.0,113.0,6.4214
14650,-117.2,32.77,31.0,1952.0,471.0,936.0,462.0,2.8621
3230,-119.61,36.31,25.0,1847.0,371.0,1460.0,353.0,1.8839
3555,-118.59,34.23,17.0,6592.0,1525.0,4459.0,1463.0,3.0347
