# 使用 housing.csv 数据集

## 下载数据集

## 由于使用 python 代码在 GitHub 上下载的压缩包是空的，故使用新的方法下载数据

In [None]:
# -*- coding: utf8 -*-
# Initial notebook setup
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
# (seeds NumPy's global random generator, which np.random.permutation uses)
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
# IMAGES_PATH resolves to <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>; save_fig()
# writes every figure into that folder.
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure into ``IMAGES_PATH``.

    Parameters
    ----------
    fig_id : str
        File name without extension; the file is written to
        ``IMAGES_PATH/<fig_id>.<fig_extension>``.
    tight_layout : bool
        When true, ``plt.tight_layout()`` is applied before saving.
    fig_extension : str
        File extension, also passed to ``plt.savefig`` as ``format``.
    resolution : int
        Passed to ``plt.savefig`` as ``dpi``.
    """
    # savefig does not create missing folders, so make sure the target
    # directory exists (os.makedirs without exist_ok keeps Python 2 support).
    if not os.path.isdir(IMAGES_PATH):
        os.makedirs(IMAGES_PATH)
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [None]:
import os
import requests

HOUSING_URL = 'https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv' # the url of the housing data 
HOUSING_PATH = os.path.join("datasets", "housing") # save path

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download ``housing.csv`` from ``housing_url`` into ``housing_path``.

    The folder is created when missing. If ``housing.csv`` is already present
    nothing is downloaded.

    Parameters
    ----------
    housing_url : str
        URL of the CSV file.
    housing_path : str
        Folder in which ``housing.csv`` is stored.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (4xx/5xx).
    requests.RequestException
        On connection problems or when the request times out.
    """
    # check if folder already exists
    if not os.path.exists(housing_path):
        os.makedirs(housing_path)
    else:
        print("Folder exists")

    # check if file already exists
    housing_file = os.path.join(housing_path, "housing.csv")
    if os.path.isfile(housing_file):
        print('File exists')
        return

    print('Downloading File')
    # stream=True avoids loading the whole body into memory; the timeout keeps
    # a dead connection from hanging the notebook forever.
    response = requests.get(housing_url, stream=True, timeout=30)
    # Download to a side file first so that an interrupted transfer never
    # leaves a truncated housing.csv that later runs would treat as complete.
    partial_file = housing_file + ".part"
    try:
        # Fail loudly on HTTP errors instead of silently skipping the download.
        response.raise_for_status()
        with open(partial_file, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        os.rename(partial_file, housing_file)
    except Exception:
        if os.path.exists(partial_file):
            os.remove(partial_file)
        raise
    finally:
        response.close()
    print('Download finish...')

fetch_housing_data()

# 加载 并 可视化 查看数据集的内容

In [None]:
# load the housing data
import pandas as pd
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in ``housing_path`` and return the DataFrame."""
    housing_csv = os.path.join(housing_path, "housing.csv")
    frame = pd.read_csv(housing_csv)
    return frame

housing = load_housing_data()
housing.head()

In [None]:
housing.info()

# 由于 ocean_proximity 这个特征是 object

## 查看一下里面包含了多少种值

In [None]:
housing["ocean_proximity"].value_counts()

In [None]:
housing.describe()

In [None]:
% matplotlib inline  
import matplotlib.pyplot as plt 
housing.hist(bins=50, figsize=(20,15)) 
plt.show()

# 分割 训练集（80%） 和 测试集（20%）

In [None]:
# create test set
import numpy as np

def split_train_test(data, test_ratio):
    """Randomly split ``data`` into a train part and a test part.

    The rows are shuffled with ``np.random.permutation``; the first
    ``int(len(data) * test_ratio)`` shuffled rows form the test set and the
    remaining rows form the train set.

    Returns
    -------
    tuple
        ``(train_rows, test_rows)``, both selected from ``data`` with ``iloc``.
    """
    order = np.random.permutation(len(data))
    n_test = int(len(data) * test_ratio)
    # np.split at a single cut point yields [order[:n_test], order[n_test:]]
    test_rows, train_rows = np.split(order, [n_test])
    return data.iloc[train_rows], data.iloc[test_rows]

train_set, test_set = split_train_test(housing, 0.2)
print(len(train_set), 'train +', len(test_set), "test")

# 上面采用的是随机选取测试集

## 为了避免多次运行后，模型逐渐看到整个数据集（包括测试集中的数据），给每条样本设置唯一标识，并根据标识决定它是否进入测试集

### 不过我很好奇，为什么它不直接切割数据集的最后一部分作为 测试集，这样就没有那么多问题了。不过还是照着做先吧

In [None]:
import hashlib

def test_set_check(identifier, test_ratio, hash):
    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio

def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
    """Split ``data`` into train/test parts based on a hash of ``id_column``.

    Every value of ``id_column`` is passed to ``test_set_check``; rows for
    which it returns True form the test part, all other rows the train part.

    Returns
    -------
    tuple
        ``(train_rows, test_rows)``.
    """
    def _belongs_to_test(row_id):
        return test_set_check(row_id, test_ratio, hash)

    test_mask = data[id_column].apply(_belongs_to_test)
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part

# 增加一列索引

## 下面演示四种划分方式：1. 用行索引作为唯一标识；2. 用 经纬度 组合成唯一标识；3. 使用 sklearn 的 train_test_split 随机划分；4. 使用 StratifiedShuffleSplit 分层抽样

### example 1：

In [None]:
# Use the row index as the identifier for the hash-based split
housing_with_id = housing.reset_index() # adds an `index` column
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")
test_set.head() # the selected index values look random (membership is decided by the hash)

### example 2：

In [None]:
# Fill a new `id` column with an identifier built from longitude and latitude
housing_with_id["id"] = housing["longitude"] * 1000 + housing["latitude"]
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")
test_set.head()

# 查看添加索引后的数据集，多了 index 和 id 两个特征

In [None]:
housing_with_id.head()

### example 3：直接使用 sklearn 里面的 train_test_split 方法

In [None]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
test_set.head()

### example 4：由于上面的是随机抽样，会导致抽样偏差：即收入的偏差，如果随机抽取抽中的 90% 都是有钱人，那就会出现很严重的采样偏差，so，我们挑选 median_income 这个特征，确保抽出来的人群不集中在某个区域。

In [None]:
# Inspect median_income: it takes many distinct values, so it has to be
# grouped into a few categories. Higher incomes are rarer, so everything
# above category 5.0 can be merged into a single group.
print(len(housing["median_income"].value_counts()))
housing["median_income"].hist()

In [None]:
'''
ceil 的用法
>>> a = np.array([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0])
>>> np.ceil(a)
array([-1., -1., -0.,  1.,  2.,  2.,  2.])
'''
# (The string above shows how np.ceil rounds each value up to the next integer.)
# Build the income category: divide median_income by 1.5 and round up.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
housing["income_cat"].value_counts()

In [None]:
# Cap income_cat at 5.0: values below 5 are kept, everything else becomes 5.0
# eg: 1.0 < 5.0  -> stays 1.0
#     6.0 >= 5.0 -> becomes 5.0
# Assign the result back instead of calling where(..., inplace=True) on the
# column selection: that is chained assignment on an intermediate object and
# is not guaranteed to update `housing` in newer pandas versions.
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)
housing["income_cat"].value_counts()

In [None]:
housing["income_cat"].hist()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
# One stratified 80/20 split, stratified on the income category
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    # split() yields positional row numbers, so select with iloc; loc would
    # only give the same rows while the index is a default 0..n-1 range.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

In [None]:
housing["income_cat"].value_counts() / len(housing)

# 对比 完整数据集，随机抽样数据集，分层抽样数据集 中的收入比例

In [None]:
def income_cat_proportions(data):
    """Return, per ``income_cat`` value, its row count divided by ``len(data)``."""
    per_category = data["income_cat"].value_counts()
    total_rows = len(data)
    return per_category / total_rows

# Plain random split of the same data, used as the baseline in the comparison
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

# One column per sampling strategy, one row per income category
compare_props = pd.DataFrame({
    "Overall": income_cat_proportions(housing),
    "Stratified": income_cat_proportions(strat_test_set),
    "Random": income_cat_proportions(test_set),
}).sort_index()
# Deviation in percent:
#   100 - 100 * (proportion in the test set / proportion in the full dataset)
compare_props["Rand. %error"] = 100 - 100 * compare_props["Random"] / compare_props["Overall"]
compare_props["Strat. %error"] = 100 - 100 * compare_props["Stratified"] / compare_props["Overall"]

In [None]:
compare_props

In [None]:
# Remove the helper column income_cat from both stratified sets.
# Rebinding the names avoids an in-place drop on slices of `housing`, avoids
# shadowing the builtin `set` with a loop variable, and errors="ignore" keeps
# the cell re-runnable once the column is already gone.
strat_train_set = strat_train_set.drop("income_cat", axis=1, errors="ignore")
strat_test_set = strat_test_set.drop("income_cat", axis=1, errors="ignore")
strat_test_set.head()

# 地理数据可视化

## 这时候我们要把 测试集 丢在一旁，只能使用 训练集

In [None]:
housing = strat_train_set.copy()

In [None]:
housing.plot(kind="scatter", x="longitude", y="latitude")
save_fig("bad_visualization_plot")

In [None]:
# alpha is the marker opacity (0~1); a low value makes the high-density
# areas easier to see
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)
save_fig("better_visualization_plot")

In [None]:
'''
每个圆的半径大小代表了每
个地区的人口数量（选项s），颜色代表价格（选项c）。我们使用一
个名叫jet的预定义颜色表（选项cmap）来进行可视化，颜色范围从
蓝（低）到红（高）
'''
# Marker size (s) is population / 100 and marker color (c) is
# median_house_value, mapped through the predefined "jet" colormap.
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
    s=housing["population"]/100, label="population", figsize=(10,7),
    c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
    sharex=False)
plt.legend()
save_fig("housing_prices_scatterplot")

In [None]:
import matplotlib.image as mpimg
california_img=mpimg.imread(PROJECT_ROOT_DIR + '/images/end_to_end_project/california.png')
ax = housing.plot(kind="scatter", x="longitude", y="latitude", figsize=(10,7),
                       s=housing['population']/100, label="Population",
                       c="median_house_value", cmap=plt.get_cmap("jet"),
                       colorbar=False, alpha=0.4,
                      )
plt.imshow(california_img, extent=[-124.55, -113.80, 32.45, 42.05], alpha=0.5,
           cmap=plt.get_cmap("jet"))
plt.ylabel("Latitude", fontsize=14)
plt.xlabel("Longitude", fontsize=14)

prices = housing["median_house_value"]
tick_values = np.linspace(prices.min(), prices.max(), 11)
cbar = plt.colorbar()
cbar.ax.set_yticklabels(["$%dk"%(round(v/1000)) for v in tick_values], fontsize=14)
cbar.set_label('Median House Value', fontsize=16)

plt.legend(fontsize=16)
save_fig("california_housing_prices_plot")
plt.show()

In [None]:
# corr() computes the Pearson correlation coefficient between columns.
# Keep only numeric columns first: `ocean_proximity` holds strings, and
# recent pandas versions raise on such columns instead of skipping them.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
corr_matrix

In [None]:
'''
相关系数的范围从-1变化到1。越接近1，表示有越强的正相关；
比如，当收入中位数上升时，房价中位数也趋于上升。当系数接近
于-1，则表示有强烈的负相关；注意看纬度和房价中位数之间呈现出
轻微的负相关（也就是说，越往北走，房价倾向于下降）。最后，系
数靠近0则说明二者之间没有线性相关性

说人话！！！
正相关： 我变大 你变大，我变小 你变小
负相关： 我变大 你变小，我变小 你变大
'''
# Correlation of every column with median_house_value, largest value first.
# Coefficients range from -1 (strong negative) to 1 (strong positive);
# values near 0 mean no linear relationship.
corr_matrix["median_house_value"].sort_values(ascending=False)

# 这个东西有点好用呀！

## scatter_matrix 将 attributes 的值 做成 x， y 形式

In [None]:
# from pandas.tools.plotting import scatter_matrix # For older versions of Pandas
from pandas.plotting import scatter_matrix

attributes = ["median_house_value", "median_income", "total_rooms",
              "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))
save_fig("scatter_matrix_plot")

In [None]:
housing.plot(kind="scatter", x="median_income", y="median_house_value",
             alpha=0.1)
plt.axis([0, 16, 0, 550000])
save_fig("income_vs_house_value_scatterplot")

In [None]:
# Add a few derived ratio features
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]

In [None]:
# Recompute the correlations now that the derived features exist.
# Keep only numeric columns: `ocean_proximity` holds strings, and recent
# pandas versions raise on such columns instead of skipping them.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
# Author's observation: the total_bedrooms correlation is close to 0,
# i.e. there is hardly any linear relationship with median_house_value
housing.plot(kind="scatter", x="total_bedrooms", y="median_house_value",
             alpha=0.2)
# Limit the view to x in [0, 100] and y in [0, 520000]
plt.axis([0, 100, 0, 520000])
plt.show()

In [None]:
housing.describe()