In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn.preprocessing import Binarizer, LabelEncoder
import matplotlib.pyplot as plt
from pyecharts import Bar, Geo, Radar, Scatter
%matplotlib inline

In [2]:
name=['City', 'Cuisine', 'Name', 'Star', 'Comments', 'PCC', 'Taste', 'Environment', 'Service', 'Addr']
data = pd.read_excel('/Users/binja/Data/dazongdianping/dzdp.xlsx', names=name)

# 数据处理

## 1.观察数据

In [3]:
data.head()

Unnamed: 0,City,Cuisine,Name,Star,Comments,PCC,Taste,Environment,Service,Addr
0,北京,私房菜,如壹私厨,45,152.0,68.0,8.4,9.0,8.7,798艺术区陶瓷二街
1,北京,私房菜,宴桃园餐厅(门头沟店),45,2391.0,70.0,8.5,9.1,8.8,冯村华润置地puls365二楼207室
2,北京,私房菜,海盗虾饭(慈云寺店)分店,45,376.0,45.0,8.4,8.7,8.6,朝阳路八里庄东里1号A区112A
3,北京,私房菜,三合楼(三源里街店),45,1508.0,62.0,8.1,8.2,7.7,东三环三元桥三源里街23号
4,北京,私房菜,来风鱼,45,58.0,37.0,8.8,8.7,8.6,北苑南路京通罗斯福广场地下一层


In [4]:
print(data.shape)

(585914, 10)


查看是否有重复地址的店

In [5]:
print('未去重地址数', len(data.Addr))
print('去重地址数', len(data.Addr.unique()))

未去重地址数 585914
去重地址数 516674


## 2.数据去重

In [6]:
data_new = data.drop_duplicates('Addr')
print(data_new.shape)

(516674, 10)


## 3.检查缺失值情况

In [7]:
data_new.apply(lambda x: x.isnull().sum())

City                0
Cuisine             0
Name               11
Star                0
Comments       123723
PCC            255451
Taste          130443
Environment    130437
Service        130438
Addr                1
dtype: int64

这里连`name`和`Addr`都懒得填的饭店是有多懒，直接删除。其他`Comments`和`PCC`等等没有分的我本来打算填0，还是全部删除吧，主要是考虑到他们还不够优秀！

In [8]:
data_clean = data_new.dropna()
print('去除所有缺失值后的饭店有：{} 间'.format(len(data_clean)))

去除所有缺失值后的饭店有：247430 间


In [9]:
data_clean.apply(lambda x: x.isnull().sum())

City           0
Cuisine        0
Name           0
Star           0
Comments       0
PCC            0
Taste          0
Environment    0
Service        0
Addr           0
dtype: int64

只去除`name`和`Addr`的，其他都填零。

In [10]:
data_new1 = data_new.fillna({'Comments':0, 'PCC': 0, 'Taste': 0, 'Environment': 0, 'Service': 0})
data_new1.apply(lambda x: x.isnull()).sum()

City            0
Cuisine         0
Name           11
Star            0
Comments        0
PCC             0
Taste           0
Environment     0
Service         0
Addr            1
dtype: int64

In [11]:
data_clean_sec = data_new1.dropna()
print('去除 name 和 Addr缺失值后的饭店有：{}'.format(len(data_clean_sec)))

去除 name 和 Addr缺失值后的饭店有：516662


## 4.构建特征

In [12]:
data_cleaned = data_clean.copy()

`Overall`表示综合特征

In [13]:
data_cleaned['Overall'] = round((data_cleaned['Taste'] + data_cleaned['Service'] + data_cleaned['Environment']) / 3, 1)

In [14]:
for i in data_cleaned:
    print("{} 类别包含以下信息：".format(i))
    print(data_cleaned[i].value_counts())

City 类别包含以下信息：
北京      12521
上海      10892
广州       9454
深圳       9425
天津       7944
苏州       7587
成都       7388
南京       7319
杭州       7010
武汉       6668
西安       6450
郑州       6174
青岛       6050
重庆       6046
沈阳       5866
大连       5337
无锡       5196
厦门       5164
济南       5120
宁波       5100
佛山       5061
哈尔滨      4793
东莞       4755
长沙       4644
合肥       4638
福州       4430
长春       4427
常州       4114
石家庄      4054
温州       3918
烟台       3869
泉州       3797
南通       3606
昆明       3590
金华       3300
乌鲁木齐     3224
惠州       3219
中山       3196
嘉兴       3184
南昌       3152
太原       3148
南宁       3013
徐州       2987
珠海       2942
绍兴       2927
贵阳       2904
台州       2881
海口       2526
兰州       2420
Name: City, dtype: int64
Cuisine 类别包含以下信息：
面包甜点      22684
火锅        22030
西餐        18110
川菜        17591
其他        14618
小吃快餐      14435
烧烤        12832
咖啡厅       12507
海鲜         9278
粤菜         8230
日本料理       8188
韩国料理       7014
湘菜         6619
面馆         6601
快餐简餐       6377
东北菜        6250


# 数据分析

## 1.整体分析

### 餐厅数量

In [15]:
dvalue = list(data_cleaned['City'].value_counts().values)
_index = list(data_cleaned['City'].value_counts().index)
bar_city_ct = Bar("城市比较", "餐厅数量", title_pos="center")
bar_city_ct.add("", _index, dvalue, mark_line=['average'], mark_point=['max', 'min'], xaxis_rotate=90, xaxis_interval=0)
bar_city_ct

### 菜系种类数量 

In [16]:
df2 = data_cleaned.set_index(['City'])['Cuisine']
dvalue = []

for i in df2.index.unique():
    x = len(df2[i].unique())
    dvalue.append(x)
X = list(df2.index.unique())
bar_city_cx = Bar("城市比较", "菜系种类数量", title_pos="center")
bar_city_cx.add("", X, dvalue, mark_line=['average'], mark_point=['max', 'min'], xaxis_rotate=90, xaxis_interval=0)
bar_city_cx

### 整体角度

In [17]:
count = list(data_cleaned['City'].value_counts().values)
city = list(data_cleaned['City'].value_counts().index)
data = list(zip(city, count))
geo = Geo("全国主要城市餐厅数量", "", title_color="#fff", title_pos="center",
width=1000, height=600, background_color='#404a59')
attr, value = geo.cast(data)
geo.add("", attr, value, visual_range=[0, 12000], visual_text_color="#fff", symbol_size=15, is_visualmap=True)
geo.add("", attr, value, type="heatmap", visual_range=[0, 12000], visual_text_color="#fff", symbol_size=15, is_visualmap=True)
geo

## 2.成都分析

In [18]:
data_cd = data_cleaned[data_cleaned['City']=='成都']
data_cd.head()

Unnamed: 0,City,Cuisine,Name,Star,Comments,PCC,Taste,Environment,Service,Addr,Overall
51294,成都,川菜,银杏·城南神仙树菜馆广告,40,579.0,365.0,8.4,8.6,8.2,桐梓林紫瑞北街102号,8.4
51295,成都,川菜,老妈蹄花总店(东城根街13号店)广告,40,3064.0,54.0,7.9,7.3,7.6,东城根南街13号,7.6
51296,成都,川菜,大小喜吃匠心集(in99店)广告,40,519.0,72.0,7.8,9.0,8.4,天府大道北段1199号银泰中心in99L7层717号,8.4
51297,成都,川菜,丁太婆老妈蹄花(总店)广告,45,377.0,46.0,8.5,8.1,8.3,东城根南街17号,8.3
51298,成都,川菜,许家菜(八宝街店)分店广告,50,706.0,147.0,8.9,9.1,9.0,八宝街111号,9.0


In [19]:
data_cd.shape

(7388, 11)

### 餐厅星级

In [20]:
bar_cd_star = Bar("成都", "星级分布", title_pos="center")
dvalue = list(data_cd['Star'].value_counts().sort_index().values)
_index = list(data_cd['Star'].value_counts().sort_index().index)
bar_cd_star.add("", _index, dvalue)
bar_cd_star

### 人均消费

In [21]:
print("最贵的：", data_cd['PCC'].max())
print("最便宜的：", data_cd['PCC'].min())
print("平均：", data_cd['PCC'].mean())

最贵的： 3460.0
最便宜的： 3.0
平均： 60.48497563616676


### 人均消费分布

In [22]:
cd_pcc = data_cd['PCC'].value_counts().sort_index()
_index = ["小于50", "50-100", "101-200", "201-300", "301-400", "401-500", "501-600", "601-700", "701-800", "801-900", "901-1000", "1001-2000", "2000以上"]

dvalue = []

s = 0
for i in [50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000]:
    temp = cd_pcc[cd_pcc.index <= i].values.sum()
    data = temp - s
    s = temp
    dvalue.append(data)
dvalue.append(cd_pcc[cd_pcc.index > 2001].values.sum())
dvalue

[3644, 2962, 626, 108, 28, 8, 3, 2, 2, 1, 1, 2, 1]

In [23]:
bar_pcc = Bar("成都", "人均消费分布", title_pos="center")
bar_pcc.add("", _index, dvalue, xaxis_rotate=40, xaxis_interval=0)
bar_pcc

### 最贵的餐厅

In [24]:
print("成都最贵的餐厅：")
print(data_cd.loc[data_cd['PCC'].idxmax()])

成都最贵的餐厅：
City                          成都
Cuisine                       西餐
Name                        格阁花园
Star                          30
Comments                      32
PCC                         3460
Taste                        6.3
Environment                  6.3
Service                      6.3
Addr           蜀汉路一品天下大街同和路9号附1号
Overall                      6.3
Name: 65856, dtype: object


### 人气情况（按照评论数来判断）

In [25]:
print("人气最高：", data_cd['Comments'].max())
print("人气最低：", data_cd['Comments'].min())
print("平均：", data_cd['Comments'].mean())

人气最高： 8533.0
人气最低： 1.0
平均： 268.5932593394694


In [26]:
_index = data_cd['Name'].values
dvalue = data_cd['Comments'].values
bar_rq = Bar("成都", "人气情况（评论数）", title_pos="center")
bar_rq.add("", _index, dvalue, mark_line=['max', 'min', 'average'], xaxis_rotate=40, xaxis_interval=0)
bar_rq

In [27]:
_index = ["小于100", "101-200", "201-300", "301-500", "501-1000", "1001-2000", "2001-4000", "4001-8000", "8000以上"]
dvalue = []
cd_cm = data_cd['Comments'].value_counts().sort_index()
s = 0
for i in [100, 200, 300, 500, 1000, 2000, 4000, 8000]:
    temp = cd_cm[cd_cm.index <= i].values.sum()
    data = temp - s
    s = temp
    dvalue.append(data)

dvalue.append(cd_cm[cd_cm.index > 8001].values.sum())
bar_rq_dis = Bar("成都", "人气分布（评论分布）", title_pos="center")
bar_rq_dis.add("", _index, dvalue, mark_line=['max', 'average'], xaxis_rotate=40, xaxis_interval=0)
bar_rq_dis

In [28]:
print("人气最高的是：")
print(data_cd.loc[data_cd['Comments'].idxmax()])

人气最高的是：
City                       成都
Cuisine                    粤菜
Name           香港新记茶餐厅(香槟广场店)
Star                       35
Comments                 8533
PCC                        62
Taste                     7.6
Environment               7.7
Service                   7.7
Addr             东大街97号香槟广场2层
Overall                   7.7
Name: 54327, dtype: object


### 各项得分（口味，环境，服务，综合）

In [29]:
data_cd['Taste'].groupby(data_cd['Star']).mean().values

array([7.58333333, 5.82857143, 6.74264706, 7.31669295, 7.94028557,
       8.55261438, 8.96064815])

In [30]:
taste_value = data_cd['Taste'].groupby(data_cd['Star']).mean().values
env_value = data_cd['Environment'].groupby(data_cd['Star']).mean().values
service_value = data_cd['Service'].groupby(data_cd['Star']).mean().values
overall_value = data_cd['Overall'].groupby(data_cd['Star']).mean().values
_index = list(data_cd['Star'].value_counts().sort_index().index)

bar_all = Bar("成都", "各项得分")
bar_all.add("口味", _index, taste_value, mark_line=['average'])
bar_all.add("环境", _index, env_value, mark_line=['average'])
bar_all.add("服务", _index, service_value, mark_line=['average'])
bar_all.add("综合", _index, overall_value, mark_line=['average'])

bar_all

In [31]:
taste = data_cd['Taste'].value_counts().sort_index()
environment = data_cd['Environment'].value_counts().sort_index()
service = data_cd['Service'].value_counts().sort_index()
overall = data_cd['Service'].value_counts().sort_index()

_index1 = taste.index
_index2 = environment.index
_index3 = service.index
_index4 = overall.index
t_v = taste.values
e_v = environment.values
s_v = service.values
o_v = service.values

bar_all = Bar("成都", "各项得分")
bar_all.add("口味", _index1, t_v, mark_line=['average'])
bar_all.add("环境", _index2, e_v, mark_line=['average'])
bar_all.add("服务", _index3, s_v, mark_line=['average'])
bar_all.add("综合", _index4, o_v, mark_line=['average'])

bar_all

### 综合水准

In [32]:
overall_max = data_cd['Overall'].values.max()
star_max = data_cd['Star'].values.max()
taste_max = data_cd['Taste'].values.max()
environment_max = data_cd['Environment'].values.max()
service_max = data_cd['Service'].values.max()

overall_mean = data_cd['Overall'].values.mean()
star_mean = data_cd['Star'].values.mean()
taste_mean = data_cd['Taste'].values.mean()
environment_mean = data_cd['Environment'].values.mean()
service_mean = data_cd['Service'].values.mean()

In [33]:
schema = [ 
    ("星级", star_max), ("综合", overall_max), ("服务", service_max),
    ("口味", taste_max), ("环境", environment_max)
]
v = [[star_mean, overall_mean, service_mean, taste_mean, environment_mean]]
radar = Radar("综合水准")
radar.config(schema)
radar.add("", v, is_splitline=True, is_axisline_show=True)
radar

### 综合得分最高的

In [93]:
print("综合得分最高的家店：")
print(data_cd.loc[data_cd['Overall'].idxmax()])
print("\n")
print("口味得分最高的家店：")
print(data_cd.loc[data_cd['Taste'].idxmax()])
print("\n")
print("环境得分最高的家店：")
print(data_cd.loc[data_cd['Environment'].idxmax()])
print("\n")
print("服务得分最高的家店：")
print(data_cd.loc[data_cd['Service'].idxmax()])

综合得分最高的家店：
City                         成都
Cuisine                      12
Name                  馨·享270°广告
Star                          0
Comments                    291
PCC                         261
Taste                       9.2
Environment                 9.3
Service                     9.4
Addr           大慈寺路朗御大厦2单元4004号
Overall                     9.3
Name: 51300, dtype: object


口味得分最高的家店：
City                        成都
Cuisine                     18
Name                食引力7071私房菜
Star                         0
Comments                    97
PCC                        196
Taste                      9.4
Environment                9.2
Service                    9.4
Addr           天仙桥北路2号金海岸公寓806
Overall                    9.3
Name: 55521, dtype: object


环境得分最高的家店：
City                              成都
Cuisine                           18
Name                          丑二哥-私厨
Star                               0
Comments                         286
PCC                            

### 从菜的角度来看

In [34]:
print("成都平均最贵的菜系：{}，平均人均要：{}元".format(data_cd.groupby('Cuisine').mean()['PCC'].idxmax(),
                                   round(data_cd.groupby('Cuisine').mean()['PCC'][data_cd.groupby('Cuisine').mean()['PCC'].idxmax()], 2)))

成都平均最贵的菜系：私房菜，平均人均要：118.55元


In [35]:
print("成都平均最受欢迎的菜系：{}，平均评论数：{}".format(data_cd.groupby('Cuisine').mean()['Comments'].idxmax(), 
                                      round(data_cd.groupby('Cuisine').mean()['Comments'][data_cd.groupby('Cuisine').mean()['Comments'].idxmax()], 0)))

成都平均最受欢迎的菜系：火锅，平均评论数：817.0


### 星级与评分之间的关系

In [36]:
v1 = data_cd['Star'].values
v2 = data_cd['Overall'].values

In [38]:
scatter = Scatter("星级 - 评分")
scatter.add("评分", v1, v2)
scatter

## 成都火锅和重庆火锅的比较

### 平均价格

In [39]:
data_cq = data_cleaned[data_cleaned['City']=='重庆']
print("成都火锅平均价格（人均）", round(data_cd.groupby('Cuisine').mean()['PCC']['火锅'], 2))
print("重庆火锅平均价格（人均）", round(data_cq.groupby('Cuisine').mean()['PCC']['火锅'], 2))

成都火锅平均价格（人均） 81.8
重庆火锅平均价格（人均） 71.07


In [40]:
data_sh = data_cleaned[data_cleaned['City']=='上海']
print("上海火锅平均价格（人均）", round(data_sh.groupby('Cuisine').mean()['PCC']['火锅'], 2))

上海火锅平均价格（人均） 128.43


### 综合评分

In [41]:
cq_taste = round(data_cq.groupby('Cuisine').mean()['Taste']['火锅'], 2)
cq_env = round(data_cq.groupby('Cuisine').mean()['Environment']['火锅'], 2)
cq_service = round(data_cq.groupby('Cuisine').mean()['Service']['火锅'], 2)
cq_overall = round(data_cq.groupby('Cuisine').mean()['Overall']['火锅'], 2)
cq_values = [cq_taste, cq_env, cq_service, cq_overall]
cd_taste = round(data_cd.groupby('Cuisine').mean()['Taste']['火锅'], 2)
cd_env = round(data_cd.groupby('Cuisine').mean()['Environment']['火锅'], 2)
cd_service = round(data_cd.groupby('Cuisine').mean()['Service']['火锅'], 2)
cd_overall = round(data_cd.groupby('Cuisine').mean()['Overall']['火锅'], 2)
cd_values = [cd_taste, cd_env, cd_service, cd_overall]

In [42]:
attr = ['口味', '环境', '服务', ' 综合']
barCQCD = Bar('重庆和成都火锅综合比较')
barCQCD.add("重庆", attr, cq_values)
barCQCD.add("成都", attr, cd_values, is_convert=True)
barCQCD

# 机器学习

### 将类别数值化

In [73]:
%%time
binarizer = Binarizer(threshold=39)
data_cd[['Star']] = binarizer.transform(data_cd[['Star']])
le = LabelEncoder()
data_cd['Cuisine'] = le.fit_transform(data_cd[['Cuisine']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
  y = column_or_1d(y, warn=True)


CPU times: user 1.22 s, sys: 26.5 ms, total: 1.25 s
Wall time: 1.43 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [74]:
data_cd.head()

Unnamed: 0,City,Cuisine,Name,Star,Comments,PCC,Taste,Environment,Service,Addr,Overall
51294,成都,12,银杏·城南神仙树菜馆广告,0,579.0,365.0,8.4,8.6,8.2,桐梓林紫瑞北街102号,8.4
51295,成都,12,老妈蹄花总店(东城根街13号店)广告,0,3064.0,54.0,7.9,7.3,7.6,东城根南街13号,7.6
51296,成都,12,大小喜吃匠心集(in99店)广告,0,519.0,72.0,7.8,9.0,8.4,天府大道北段1199号银泰中心in99L7层717号,8.4
51297,成都,12,丁太婆老妈蹄花(总店)广告,0,377.0,46.0,8.5,8.1,8.3,东城根南街17号,8.3
51298,成都,12,许家菜(八宝街店)分店广告,0,706.0,147.0,8.9,9.1,9.0,八宝街111号,9.0


### 划分 feature 和 label

In [75]:
data_features = data_cd[['Cuisine', 'Comments', 'PCC', 'Taste', 'Environment', 'Service']].values
data_labels = data_cd['Star'].values

In [76]:
from sklearn.feature_selection import SelectKBest, SelectPercentile

selection = SelectPercentile(percentile=80)
selection.fit(data_features, data_labels)
selection.transform(data_features)
print(selection.get_support())
print(selection.scores_)
features_new = data_features[:, selection.get_support()]
features_new

[ True  True  True  True False False]
[nan nan nan nan nan nan]


  msb = ssbn / float(dfbn)


array([[  12. ,  579. ,  365. ,    8.4],
       [  12. , 3064. ,   54. ,    7.9],
       [  12. ,  519. ,   72. ,    7.8],
       ...,
       [   4. ,   33. ,   65. ,    6.8],
       [   4. ,    7. ,   44. ,    7.5],
       [   4. ,  223. ,   18. ,    7.4]])

In [77]:
from sklearn.preprocessing import MinMaxScaler

Scaler = MinMaxScaler()
data_features = Scaler.fit_transform(features_new)

### XGBoost

In [78]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

xgb = XGBClassifier()
print(cross_val_score(xgb, data_features, data_labels, cv=10).mean())

1.0


### ???

In [80]:
xgb.fit(data_features, data_labels)
xgb.predict([100, 10, 10, ])

array([0])