# 02链家二手房分析：特征工程

In [1]:
import numpy as np
import pandas as pd
# import math
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# 加载各城市房屋数据
km_df = pd.read_csv(r"./data/clean/km_house_clean.csv")
cd_df = pd.read_csv(r"./data/clean/cd_house_clean.csv")
hz_df = pd.read_csv(r"./data/clean/hz_house_clean.csv")
sh_df = pd.read_csv(r"./data/clean/sh_house_clean.csv")

In [3]:
# 数据备份
km = km_df.copy()
cd = cd_df.copy()
hz = hz_df.copy()
sh = sh_df.copy()

In [4]:
# 列表 city_list 存放各个城市房产数据信息
city_list = [km, cd, hz, sh]
# 存放城市名称的元祖
city_names = ("KunMing", "ChengDu", "HangZhou", "ShangHai")

In [5]:
hz.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2993 entries, 0 to 2992
Data columns (total 12 columns):
region           2993 non-null object
rooms            2993 non-null int64
halls            2993 non-null int64
towards          2993 non-null object
decoration       2993 non-null object
have_elevator    2993 non-null int64
visited          2993 non-null int64
attention        2993 non-null int64
publishday       2993 non-null float64
unit_price       2993 non-null float64
area             2993 non-null float64
total_price      2974 non-null float64
dtypes: float64(4), int64(5), object(3)
memory usage: 280.7+ KB


In [6]:
hz.head()

Unnamed: 0,region,rooms,halls,towards,decoration,have_elevator,visited,attention,publishday,unit_price,area,total_price
0,十亩田家园,2,1,南,精装,0,15,107,20.0,4.3952,45.05,198.0
1,太阳国际公寓,4,2,南北,简装,1,8,117,20.0,4.3549,167.63,730.0
2,十亩田家园,2,1,南北,精装,0,12,71,20.0,4.2337,57.87,245.0
3,盛世钱塘,4,2,南北,精装,1,4,57,20.0,5.8071,185.98,1080.0
4,六和源,4,2,南北,毛坯,0,5,37,20.0,3.8409,231.72,890.0


### (1) decoration 列的数值化

In [7]:
# 查看各个城市 decoration 列的分类汇总
for city_name,c_house in zip(city_names, city_list):
    print(city_name+"'s Decoration: \n", c_house.decoration.value_counts())

KunMing's Decoration: 
 其他    382
精装    100
毛坯     50
简装     36
Name: decoration, dtype: int64
ChengDu's Decoration: 
 其他    2026
精装     371
简装     361
毛坯     237
Name: decoration, dtype: int64
HangZhou's Decoration: 
 简装    1148
精装    1109
毛坯     485
其他     251
Name: decoration, dtype: int64
ShangHai's Decoration: 
 精装    1550
简装     769
其他     340
毛坯     324
Name: decoration, dtype: int64


In [8]:
def decorate_valued(city):
    # decoration 的数值化操作，具体规则如下： 精装: 3  简装: 2  毛坯: 1  其他: 0 
    city['decoration'] = city.decoration.map({"精装":3, "简装":2,"毛坯":1, "其他":0,})
    return city

In [9]:
# 数值化操作
km = decorate_valued(km)
cd = decorate_valued(cd)
hz = decorate_valued(hz)
sh = decorate_valued(sh)

In [11]:
sh.tail()

Unnamed: 0,region,rooms,halls,towards,decoration,have_elevator,visited,attention,publishday,unit_price,area,total_price
2978,前哨路198弄,2,1,南,1,0,4,43,19.2,2.681,71.99,193.0
2979,保利西子湾,2,2,南,3,1,1,18,19.2,3.9273,90.14,354.0
2980,金沙鼎苑,2,2,南北,1,0,21,24,19.2,3.8193,77.24,295.0
2981,誉品原墅,4,2,南,2,0,0,20,19.2,4.5806,181.2,830.0
2982,东方丽景,2,2,南,3,1,1,66,19.2,6.1422,98.5,605.0


### (2) towards 字段的字母化
#### 先进行字母化，具体转化方式如下：
 - 北：  'N '
 - 东北：'NE '
 - 东：  'E '
 - 东南：'SE '
 - 南：  'S '
 - 西南：'SW '
 - 西：  'W '
 - 西北：'NW '
 
 - 中英文朝向对照:
 - towards_origin = ["东北", "东南", "西南", "西北", "北", "东", "南", "西"]
 - towards_letter = ["NE", "SE", "SW", "NW", "N", "E", "S", "W"]
 

In [12]:
def toward_to_letters(city):
    # 该函数用于将 towards 的中文朝向转化为 字母朝向
    city.towards = city.towards.str.replace("东北","NE ").str.replace("东南","SE ").str.replace("西南","SW ").str.replace("西北","NW ").str.replace("北","N ").str.replace("东","E ").str.replace("南","S ").str.replace("西","W ")
    # 除去towards尾部的空格符
    city.towards = city.towards.str[:-1]
    return city


In [13]:
# 查看各个城市 towards 列的分类汇总
for city_name,c_house in zip(city_names, city_list):
    print(city_name+"'s Towards: \n", c_house.towards.value_counts())

KunMing's Towards: 
 东南      139
南       124
南北      110
西南       62
东        44
西        23
东西       23
东北       12
西北        8
北         4
南西        3
南西南       3
东南西南      2
东南南北      2
东南西北      1
东西北       1
西南南       1
东南南       1
西南西       1
西西北       1
东南北       1
南西北       1
西南东北      1
Name: towards, dtype: int64
ChengDu's Towards: 
 东南      708
南       691
西南      327
东       301
北       191
西北      185
西       176
东北      148
南北      145
东西       25
东南西北     22
东南南      14
南西南       8
西南东北      8
南西        7
东东南       6
西北北       4
南西北       4
西西北       4
西南西北      3
东南西南      3
西南西       2
东西南       2
东南西       2
东南北       1
东西北       1
南东北       1
东北东北      1
东东北       1
东南东北      1
南西南北      1
西南南       1
北东北       1
Name: towards, dtype: int64
HangZhou's Towards: 
 南       1742
南北       961
东南        61
南西北       33
东南北       32
西南        25
北         24
东         23
西         19
东西        18
南西        14
东南南        7
东北         7
西北         5
南西南        4
南东北        3


In [14]:
km = toward_to_letters(km)
cd = toward_to_letters(cd)
hz = toward_to_letters(hz)
sh = toward_to_letters(sh)

In [15]:
# 查询昆明 towards 列含有 东南（SE）朝向的记录
km[km.towards.str.contains(pat='SE', regex=True)]

Unnamed: 0,region,rooms,halls,towards,decoration,have_elevator,visited,attention,publishday,unit_price,area,total_price
0,昆明市教工二幼儿园宿舍,2,2,SE NW,0,0,6,9,19.0,1.3000,50.00,
1,新亚洲体育城星宇园,3,2,SE,3,0,0,0,19.0,1.3446,119.00,160.0
2,禧瑞都,3,2,SE,2,0,0,0,19.0,1.1021,147.00,162.0
3,金色交响家园,4,2,SE,2,0,1,1,15.0,1.4477,105.00,152.0
12,南方公园,5,3,SE,0,0,0,0,14.0,1.6020,103.00,165.0
14,圣世一品,3,2,SE,0,0,0,0,11.0,1.1250,120.00,135.0
20,新亚洲体育城星泽园,5,2,SE,0,0,14,4,19.0,1.0986,142.00,156.0
22,中豪逸境花园,2,2,SE,1,0,1,1,19.0,1.0094,107.00,108.0
23,万科白沙润园,4,2,SE,2,0,0,1,8.0,1.4468,165.89,240.0
41,彼岸二期,4,2,SE,2,0,2,13,19.0,1.0000,126.00,126.0


## 特征工程

 - 特征 city: 该特征表示房产所在城市
 #### 四个城市的标识对应如下：  昆明：K  成都：C  杭州：H  上海：S

In [16]:
km['city'] = "K"
cd['city'] = "C"
hz['city'] = "H"
sh['city'] = "S"

In [17]:
hz.sample(5)

Unnamed: 0,region,rooms,halls,towards,decoration,have_elevator,visited,attention,publishday,unit_price,area,total_price,city
2842,圆梦园,3,2,S N,2,1,1,12,20.0,2.9582,134.88,399.0,H
2098,大关西三苑,2,1,S,2,0,1,1,25.0,3.9684,54.43,216.0,H
1585,湘湖人家,3,2,S,2,1,7,11,20.0,2.4234,121.73,295.0,H
2506,北景园紫荆苑,5,2,S,1,1,0,20,20.0,2.6112,180.0,470.0,H
1286,钱江西溪和景,3,2,S,3,0,3,17,20.0,4.1012,178.0,730.0,H


 - 新特征 popular: 该特征表示房产的受欢迎程度。计算方式如下：
#### popular = (0.3*attention + 0.7*visited) / publishday
 - 即attention的权重是0.3, visited的权重是0.7 

In [18]:
# 计算 popular的数值
def house_popular(visited, attention, publish):
    popular = (0.7*visited + 0.3*attention) / publish
    return round(popular, 4)

In [19]:
km['popular'] = km.apply(lambda x: house_popular(x.visited, x.attention, x.publishday), axis=1)
cd['popular'] = cd.apply(lambda x: house_popular(x.visited, x.attention, x.publishday), axis=1)
hz['popular'] = hz.apply(lambda x: house_popular(x.visited, x.attention, x.publishday), axis=1)
sh['popular'] = sh.apply(lambda x: house_popular(x.visited, x.attention, x.publishday), axis=1)

In [20]:
km.head()

Unnamed: 0,region,rooms,halls,towards,decoration,have_elevator,visited,attention,publishday,unit_price,area,total_price,city,popular
0,昆明市教工二幼儿园宿舍,2,2,SE NW,0,0,6,9,19.0,1.3,50.0,,K,0.3632
1,新亚洲体育城星宇园,3,2,SE,3,0,0,0,19.0,1.3446,119.0,160.0,K,0.0
2,禧瑞都,3,2,SE,2,0,0,0,19.0,1.1021,147.0,162.0,K,0.0
3,金色交响家园,4,2,SE,2,0,1,1,15.0,1.4477,105.0,152.0,K,0.0667
4,金碧阳光商住楼,3,2,S N,0,0,1,3,19.0,1.8196,129.15,235.0,K,0.0842


In [None]:
# 添加新特征 id
# def create_id(df):
#     id = pd.Series(np.array(np.arange(1, df.shape[0]+1)),dtype='int64')
#     df['id'] = id
#     return df

In [None]:
# 给城市房产数据添加ID标识
# km = create_id(km)
# cd = create_id(cd)
# hz = create_id(hz)
# sh = create_id(sh)

In [21]:
# 新的列索引
new_cols=['region','city','rooms','halls','towards','decoration','have_elevator','visited','attention','publishday','popular','unit_price','area','total_price']
# 重新摆放列的位置
km = pd.DataFrame(data=km, columns=new_cols)
cd = pd.DataFrame(data=cd, columns=new_cols)
hz = pd.DataFrame(data=hz, columns=new_cols)
sh = pd.DataFrame(data=sh, columns=new_cols)


In [22]:
sh.head()

Unnamed: 0,region,city,rooms,halls,towards,decoration,have_elevator,visited,attention,publishday,popular,unit_price,area,total_price
0,百汇园,S,3,2,S,3,1,31,315,19.2,6.0521,9.9794,145.2,1449.0
1,汇龙新城,S,3,2,S,3,1,14,230,19.2,4.1042,9.3814,143.69,1348.0
2,上海绿城,S,3,2,S,3,1,26,1045,19.2,17.276,8.5471,134.55,1150.0
3,华宝花园,S,3,1,S,2,0,38,151,19.2,3.7448,4.9221,97.52,480.0
4,鸿凯湾绿苑,S,3,2,S,2,1,22,93,19.2,2.2552,9.0262,120.76,1090.0


In [23]:
# 查看 4个城市房产的基本信息
for city_name,c_house in zip(city_names, city_list):
    print(city_name+" infos: \n", c_house.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568 entries, 0 to 567
Data columns (total 14 columns):
region           568 non-null object
rooms            568 non-null int64
halls            568 non-null int64
towards          568 non-null object
decoration       568 non-null int64
have_elevator    568 non-null int64
visited          568 non-null int64
attention        568 non-null int64
publishday       568 non-null float64
unit_price       568 non-null float64
area             568 non-null float64
total_price      450 non-null float64
city             568 non-null object
popular          568 non-null float64
dtypes: float64(5), int64(6), object(3)
memory usage: 62.2+ KB
KunMing infos: 
 None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2995 entries, 0 to 2994
Data columns (total 14 columns):
region           2995 non-null object
rooms            2995 non-null int64
halls            2995 non-null int64
towards          2995 non-null object
decoration       2995 non-null int64

 - 拼接4个城市的数据集为一个总集

In [24]:
house_all = pd.concat([km,cd,hz,sh], axis=0)

In [25]:
house_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9539 entries, 0 to 2982
Data columns (total 14 columns):
region           9539 non-null object
city             9539 non-null object
rooms            9539 non-null int64
halls            9539 non-null int64
towards          9539 non-null object
decoration       9539 non-null int64
have_elevator    9539 non-null int64
visited          9539 non-null int64
attention        9539 non-null int64
publishday       9539 non-null float64
popular          9539 non-null float64
unit_price       9539 non-null float64
area             9539 non-null float64
total_price      8834 non-null float64
dtypes: float64(5), int64(6), object(3)
memory usage: 1.1+ MB


In [26]:
house_all[565:570]

Unnamed: 0,region,city,rooms,halls,towards,decoration,have_elevator,visited,attention,publishday,popular,unit_price,area,total_price
565,广福城怡福园,K,2,2,W,0,0,1,1,19.0,0.0526,1.4957,77.56,116.0
566,怡康温泉新村B区,K,4,2,E W,0,0,0,4,19.0,0.0632,0.9758,132.2,129.0
567,鑫都公寓,K,2,1,SE,0,0,0,0,19.0,0.0,2.1053,76.0,160.0
0,东苑A区,C,3,2,S N,2,1,32,22,26.0,1.1154,2.1368,105.3,225.0
1,蓝光诺丁山,C,3,1,W,2,1,24,300,18.5,5.773,1.3463,113.65,153.0


In [27]:
# write into file
# house_all.to_csv(r"./data/houses_all.csv", index=False)

In [28]:
# write into csv files
# km.to_csv(r"./data/clean/km_house_FE.csv", index=False)
# cd.to_csv(r"./data/clean/cd_house_FE.csv", index=False)
# hz.to_csv(r"./data/clean/hz_house_FE.csv", index=False)
# sh.to_csv(r"./data/clean/sh_house_FE.csv", index=False)