In [1]:
import os
import datetime
import json
import numpy as np
import pandas as pd
import networkx as nx
from collections import defaultdict
from sklearn.preprocessing import minmax_scale

import _locale
_locale._getdefaultlocale = (lambda *args: ['zh_CN', 'utf8'])
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus']=False #用来正常显示负号

# 各菜系口味偏好的可视化

7种基本口味占各菜系所有菜的比例

In [2]:
caixi_flavor = pd.read_excel("caixi_flavor.xlsx",index_col=0)
caixi_flavor

Unnamed: 0,香,鲜,辣,甜,酸,咸,清淡
川菜,0.92322,0.39522,0.865202,0.118111,0.221105,0.161092,0.007483
粤菜,0.748955,0.425606,0.229969,0.235694,0.141541,0.159723,0.068423
东北菜,0.806845,0.474006,0.367444,0.11743,0.210523,0.047252,0.001661
湘菜,0.849831,0.423547,0.715348,0.080458,0.139161,0.139919,0.010256
浙菜,0.691796,0.296958,0.30471,0.123888,0.064388,0.036831,0.062846
鲁菜,0.708045,0.299014,0.253808,0.189782,0.145733,0.043974,
淮扬菜,0.80741,0.525787,0.136032,0.152402,0.043909,0.146651,0.022725
苏菜,0.6944,0.415557,0.118155,0.276602,0.17697,0.10841,0.012883
闽菜,0.646375,0.415261,0.195538,0.114542,0.103757,0.063783,0.000845
潮州菜,0.753859,0.123395,0.320721,0.037618,0.004755,0.04921,0.031041


- 为了可视化中便于比较，先对数据进行最大最小值归一化，将数量级统一到[0,1]
- 为了后续雷达图显示中，避免因为一根轴上数值为0时，图形失去面积的问题，统一+0.1保证最小面积

In [3]:
caixi_flavor = caixi_flavor.fillna(0.0)
for col in caixi_flavor.columns:
    caixi_flavor[col] = minmax_scale(caixi_flavor[col])+0.1
caixi_flavor

Unnamed: 0,香,鲜,辣,甜,酸,咸,清淡
川菜,1.1,0.775523,1.1,0.436811,1.1,1.1,0.209364
粤菜,0.534267,0.851037,0.249675,0.928825,0.732246,1.090285,1.1
东北菜,0.722199,0.971317,0.4337,0.433965,1.051089,0.292227,0.124281
湘菜,0.861749,0.845918,0.899405,0.279258,0.721243,0.949766,0.249887
浙菜,0.348706,0.531328,0.349723,0.460987,0.375633,0.21828,1.018488
鲁菜,0.401458,0.536437,0.281586,0.736711,0.751622,0.268965,0.1
淮扬菜,0.724036,1.1,0.12393,0.580301,0.280974,0.997534,0.43212
苏菜,0.357159,0.826063,0.1,1.1,0.896004,0.726182,0.288283
闽菜,0.20125,0.825326,0.203585,0.421881,0.557604,0.409523,0.112349
潮州菜,0.550187,0.1,0.371156,0.1,0.1,0.306114,0.553661


使用pyecharts进行动态可视化

In [4]:
import random
from pyecharts import Timeline, Radar
timeline = Timeline(is_auto_play=True, timeline_bottom=0,timeline_symbol_size=6)
schema = [ (k0, 1.2) for k0 in caixi_flavor.columns]
for caixi0, row in caixi_flavor.iterrows():
    values0 = [row.values.tolist()]
    radar0 = Radar()
    radar0.config(schema, radar_text_size=16)
    color = "RGB(%d,%d,%d)" % (random.randint(0,255),random.randint(0,255),random.randint(0,255))
    radar0.add("", values0,legend_selectedmode='single',is_legend_show=False, is_area_show=True, area_color=color, area_opacity=0.5)
    timeline.add(radar0, caixi0)
timeline

ERROR:lml.utils:failed to import pyecharts_snapshot
Traceback (most recent call last):
  File "D:\Program Files\Anaconda2\envs\py36\lib\site-packages\lml\utils.py", line 43, in do_import
    plugin_module = __import__(plugin_module_name)
ModuleNotFoundError: No module named 'pyecharts_snapshot'


# 不规范原料名的统一

从网络数据中整理出了标准食材名和它们的别称，利用它们统一不规范的原料名

In [5]:
with open("entity2alias.json","r") as fp:
    entity2alias = json.load(fp)
with open("entity2type.json","r") as fp:
    entity2type = json.load(fp)

In [6]:
print(entity2alias["小麦面粉"])

['小麦面粉', '小麦粉', '面粉', '淀粉', '水淀粉']


食材出现频率，其index为所有的食材

In [7]:
ingre_count = pd.read_excel("ingredient_count.xlsx",index_col=0)[0]
ingre_count.head()

盐     2400
鸡蛋     879
料酒     795
生抽     793
糖      664
Name: 0, dtype: int64

In [8]:
print("原始共有%d种食材" % len(ingre_count))

原始共有6994种食材


In [9]:
from harvesttext import HarvestText
ht = HarvestText()
ht.add_entities(entity2alias, entity2type) # 将标准名和别名提供给harvesttext，后续可以用于匹配

ingre2entity = {}
entity2ingres = defaultdict(list)
for ingre in ingre_count.index:
    entity_linked, type0 = ht.mention2entity(ingre)      # 使用harvesttext链接到实体，其利用先前提供的标准名和别名进行搜索
    if entity_linked is None:                            # 如果找不到对应实体，就让它单独成为实体
        ingre2entity[ingre] = ingre
        entity2ingres[ingre].append(ingre)
    else:
        ingre2entity[ingre] = entity_linked
        entity2ingres[entity_linked].append(ingre)

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\KELEN\AppData\Local\Temp\jieba.cache
DEBUG:jieba:Loading model from cache C:\Users\KELEN\AppData\Local\Temp\jieba.cache
Loading model cost 1.892 seconds.
DEBUG:jieba:Loading model cost 1.892 seconds.
Prefix dict has been built succesfully.
DEBUG:jieba:Prefix dict has been built succesfully.


In [10]:
print("链接后共有%d种食材" % len(entity2ingres))

链接后共有2784种食材


In [11]:
print(entity2ingres["豆瓣酱"])

['豆瓣酱', '郫县豆瓣酱', '正宗郫县豆瓣酱', '六月香豆瓣酱', '豆瓣酱盐糖鸡精香菜', '辣豆瓣酱', '老干妈和豆瓣酱可以二选一']


In [12]:
print(entity2ingres["小麦面粉"])

['面粉', '淀粉', '低筋面粉', '水淀粉', '高筋面粉', '中筋面粉', '普通面粉', '干淀粉', '小麦面粉', '淀粉水', '普通面粉（中筋面粉）', '中筋面粉（水油皮）', '低筋面粉（油酥）', '1淀粉', '湿淀粉', '干面粉', '小麦粉', '莜面粉', '普通面粉（油酥皮）', '普通面粉（水油皮）', '淀粉（腌肉用）', '面粉用清水', '中筋面粉 Unbleached All-Purpose Flour', '墨西哥酱-低筋面粉', '中筋面粉（面团）', '法国面粉T65', '新良芯面道面粉', '材料2:低筋面粉', '材料1:低筋面粉', '普通面粉（油皮）', '浓淀粉水', '水（根据实际情况，可将面粉成团即可）', '普通中筋面粉', '淀粉（玉米或红薯淀粉）', '淀粉（煎蛋用）', '木薯淀粉', '普通面粉（油酥）', '水淀粉(鱼香汁)', '水淀粉：冷水', '水淀粉：玉米淀粉', '淀粉(腌肉用)', '淀粉10克', '月饼皮：中筋面粉（普通面粉）', '淀粉（一斤肉以此类推）', '普通面粉（饼皮）', '淀粉（依个人口味）', '淀粉（浇汁用）', '普通面粉（葱花馅）', '月饼粉或低筋面粉(肉馅中)', '淀粉水(淀粉+水调开)', '低筋面粉  （泡芙）', '低筋面粉(馅)', '低筋面粉15克', '低筋面粉45克', '低筋面粉60克', '低筋面粉【饼底】', '低筋面粉（塔皮）', '低筋面粉（酥皮）', '低筋面粉（面包）', '勾芡淀粉水', 'A高筋面粉', '[醃鱼]太白粉/面粉', 'A普通面粉', '20ml淀粉糊（水+淀粉）', "'淀粉", '(酥皮)低筋面粉', '3淀粉水', '中筋粉（家用普通面粉）', '中筋面粉A', '中筋面粉（Maida）', '中筋面粉（墨西哥饼）', '中筋面粉（家常面粉）', '中筋面粉（普通面粉）200克', '中筋面粉（普通）', '中粉（普通面粉）', '五仁馅：熟面粉（糯米粉）', '中/高筋面粉', '中劲面粉', '中或低筋面粉', '披萨底：高筋面粉', '干淀粉（滚鸡蛋用）', '天然酵母（没养天然酵母折算成100克面粉、100克水，2克酵母）', '裹肉用淀粉', '表面装饰： 低筋面粉', '酥皮：

可以看到，经过这一步骤，我们合并了大量不规范的名称，有助于提高数据质量。

然后，就可以用链接的结果把文本规范化

In [13]:
ht.clear()          # 重置模型
ht.add_entities(entity2ingres)
ht.add_entities(entity2alias, entity2type)
ht.seg("牛腩 胡萝卜 辣豆瓣酱 料酒 八角 生姜片 香菜 洋葱 白糖",return_sent=True,standard_name=True)



'牛腩   胡萝卜   豆瓣酱   料酒   八角   姜   香菜   洋葱   白糖'