In [0]:
import requests
import csv
import xlrd
import re
import os
import time
from datetime import datetime
from retrying import retry



class CrawlerTool:
    """Shared helpers used by every crawler: HTTP, CSV/Excel I/O and output paths."""

    # Folder that holds the input files (coordinate sheet, id lists).
    resource_context = "/content/drive/My Drive/resource/"
    # Folder under which the dated output folders are created.
    data_context = "/content/drive/My Drive/data/"

    # Logical key -> file name (or sheet name) used by the crawlers.
    filename_dict = {"lon_lat": "beijing_lon_lat.xls", "sheet_name": "beijing", "kid": "bjkid.csv", "uid": "userid.csv",

                     "bj_kitchen": "beijing_kitchenlist.csv", "bj_dish": "bj_dish.csv",

                     "kcomment": "kitchencomment.csv", "ktag": "kitchentag.csv", "ucomment": "usercomment.csv"
                     }

    @staticmethod
    @retry(
        # Only network-level failures are worth retrying; programming errors
        # (e.g. an unsupported request type) must surface immediately.
        retry_on_exception=lambda exc: isinstance(exc, requests.exceptions.RequestException),
        stop_max_attempt_number=8,
        # Exponential back-off (capped at 30 s) instead of hammering the server.
        wait_exponential_multiplier=1000,
        wait_exponential_max=30000,
    )
    def get_response(request_type, url, data=None, timeout=5):
        """Send an HTTP request and return the ``requests`` response.

        Args:
            request_type: HTTP verb; only ``"POST"`` is supported.
            url: Target URL.
            data: Form payload passed to ``requests.post``.
            timeout: Timeout in seconds for the request.

        Returns:
            The ``requests.Response`` object.

        Raises:
            ValueError: If ``request_type`` is not ``"POST"``.
            requests.exceptions.RequestException: If every retry attempt fails.
        """
        if request_type != "POST":
            raise ValueError("unsupported request type: {!r}".format(request_type))
        try:
            # ``timeout`` must be passed by keyword: the third positional
            # parameter of ``requests.post`` is ``json``, not ``timeout``.
            response = requests.post(url, data=data, timeout=timeout)
        except requests.exceptions.RequestException as e:
            print(e)
            raise
        print("请求成功")
        return response

    @staticmethod
    def response_convert(convert_type, response):
        """Decode a response body; only ``'JSON'`` is supported.

        Raises:
            ValueError: If ``convert_type`` is not ``'JSON'``.
        """
        if convert_type != 'JSON':
            raise ValueError("unsupported convert type: {!r}".format(convert_type))
        return response.json()

    @staticmethod
    def add_liststr_in_otherlist(data, listname, itemname):
        """Collect ``itemname`` from every element of ``data[listname]`` as one string.

        Each value is converted to ``str`` and its newlines, carriage returns
        and ASCII commas are replaced by a full-width comma.  The resulting
        list is then rendered with ``str()`` and its remaining ASCII commas
        (the list separators) are replaced by full-width commas as well.

        Elements that do not contain ``itemname`` are skipped.
        """
        cleaned = []
        for element in data[listname]:
            try:
                value = element[itemname]
            except KeyError:
                # Best-effort: not every element carries every key.
                continue
            cleaned.append(re.sub(r'\n|,|\r', '，', str(value)))
        return str(cleaned).replace(',', '，')

    @staticmethod
    def save_csv(data_list, address, encoding='utf-8'):
        """Append ``data_list`` as one row to the CSV file at ``address``.

        Non-string elements of ``data_list`` are converted to ``str`` in place
        before writing.  The row is written through ``csv.writer`` so that a
        field containing a comma, quote or newline is quoted instead of
        corrupting the row; plain fields are written exactly as before.
        """
        CrawlerTool.list_element_convert_str(data_list)
        # newline="" lets the csv module control line endings itself.
        with open(address, "a", encoding=encoding, newline="") as fout:
            csv.writer(fout, lineterminator="\n").writerow(data_list)
        print(data_list)

    @staticmethod
    def read_csv(address):
        """Read the UTF-8 CSV file at ``address`` and return its rows as lists."""
        with open(address, encoding='utf-8', newline='') as fin:
            return list(csv.reader(fin))

    @staticmethod
    def read_excel(address, sheetname, time, row_index, *args):
        """Read two numeric columns from an Excel sheet as ``"a,b"`` strings.

        Args:
            address: Path of the workbook.
            sheetname: Name of the sheet to read.
            time: Number of rows to read.
            row_index: Index of the first row to read.
            *args: Column indexes; ``args[0]`` is formatted with 4 decimals
                and ``args[1]`` with 5 decimals.

        Returns:
            A list with one ``"<col0>,<col1>"`` string per row.
        """
        workbook = xlrd.open_workbook(address)
        sheet = workbook.sheet_by_name(sheetname)
        data_list = []
        for offset in range(time):
            row = offset + row_index
            first = '{0:.4f}'.format(sheet.cell(row, args[0]).value)
            second = '{0:.5f}'.format(sheet.cell(row, args[1]).value)
            data_list.append(first + ',' + second)
        return data_list

    @staticmethod
    def list_element_convert_str(rawlist):
        """Convert every non-string element of ``rawlist`` to ``str`` in place."""
        for index, element in enumerate(rawlist):
            if not isinstance(element, str):
                rawlist[index] = str(element)

    @staticmethod
    def get_context(context):
        """Return ``context`` followed by today's date as ``YYYY/MM/DD/``."""
        return context + datetime.today().strftime("%Y/%m/%d/")

    @staticmethod
    def create_dirs(path):
        """Create the directory ``path`` (and parents) if it does not exist."""
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(path, exist_ok=True)


class GetKitchen:
    """Crawl the kitchen list around a set of coordinates and save it as CSV."""

    def __init__(self):
        self.url = "https://user.mapi.jiashuangkuaizi.com/Kitchen/kitchenList"
        # CSV header; the order mirrors the row built by read_response():
        # taglist fields, then activity_list items, recommend_dishes items
        # and finally express_tag items.
        self.tag_name = ["平均价格", "开始营业时间", "停止营业时间", "收藏人数", "家厨图片链接", "家厨姓名", "封面图片链接", "起送价格", "距离", "has_more_dish",
                         "is_check", "is_collect", "is_distr", "is_new新店家？", "is_open", "kitchen_flag",
                         "地址", "厨房id", "厨房名称", "告示", "告示图标",
                         "小贴士tip", "月销量", "家厨籍贯", "over_business_time", "over_distr_radius", "厨房星数", "推荐消息", "tmr_in_business",
                         "tod_in_business", "活动图标", "活动名称", "推荐菜品标签", "推荐菜品图片链接", "send_level", "send_time"]
        # Keys read directly from each kitchen dict.  "kitchen_flag" and
        # "kitchen_address" are separate keys (a missing comma used to fuse
        # them into one non-existent key).
        self.taglist = ["avg_price", "business_start", "business_end", "collect_cnt", "cook_image_url", "cook_name",
                        "cover_image_url", "dispatch_threshold", "distance", "has_more_dish",
                        "is_check", "is_collect", "is_distr", "is_new", "is_open", "kitchen_flag",
                        "kitchen_address", "kitchen_id",
                        "kitchen_name", "kitchen_notice", "kitchen_notice_icon", "kitchen_tip", "month_sale", "native_place",
                        "over_business_time", "over_distr_radius", "star", "recommend_msg", "tmr_in_business", "tod_in_business"]
        # Form payload of the request; 'coordinate' and 'size' are overwritten in main().
        self.body = {'_cityid': '', 'user_coordinate': '', '_platform': 'Android', '_device': 'awift18:00:43:d7:07:6a',
                     '_osversion': '5.1.1', '_screen': '720x1280', '_version': '3.9.7', 'channel': 'huijiachifan',
                     '_time': '2019-05-27 23:27:52', '_build': '58', 'coordinate': '116.1459,39.68186', 'size': '1',
                     'radius': "", 'page': '1'}
        # Kitchen fields whose value is a list of dicts / a single dict.
        self.item_type_is_list = ['activity_list', 'recommend_dishes']
        self.item_type_is_dict = ['express_tag']
        self.activity_item = ['icon', 'name']
        self.express_item = ['send_level', 'send_time']
        self.recommend_item = ['recommend_tag', 'url']
        # Nested field name -> keys extracted from it.
        self.listordict_name_keyvalue = {'activity_list': self.activity_item, 'express_tag': self.express_item,
                                         'recommend_dishes': self.recommend_item}

    @staticmethod
    def _clean(value):
        """Return ``str(value)`` with newlines, CRs and ASCII commas made CSV-safe."""
        return re.sub(r'\n|,|\r', '，', str(value))

    def read_response(self, kitchen_list):
        """Flatten every kitchen dict of ``kitchen_list`` into one row of strings.

        Row layout: one value per ``taglist`` key (``None`` becomes ``'None'``),
        then one stringified list per key of each list-typed field, then the
        values of each dict-typed field.  Plain values are sanitised so that
        free-text fields cannot break the comma-separated output.
        """
        result_list = []
        for kitchen in kitchen_list:  # one dict per kitchen
            row = [self._clean(kitchen.get(tag)) for tag in self.taglist]
            for listname in self.item_type_is_list:
                for itemname in self.listordict_name_keyvalue.get(listname):
                    row.append(CrawlerTool.add_liststr_in_otherlist(kitchen, listname, itemname))
            for dictname in self.item_type_is_dict:
                for dictitem in self.listordict_name_keyvalue[dictname]:
                    row.append(self._clean(kitchen[dictname][dictitem]))
            print(row)
            result_list.append(row)
        return result_list

    def main(self):
        """Query every coordinate, de-duplicate kitchens by id and write the CSV."""
        out_addr = CrawlerTool.get_context(CrawlerTool.data_context)

        CrawlerTool.create_dirs(out_addr)
        out_addr += CrawlerTool.filename_dict.get("bj_kitchen")

        # Read the coordinate pairs from the xls resource file.
        resource = CrawlerTool.resource_context + CrawlerTool.filename_dict.get("lon_lat")
        lon_latlist = CrawlerTool.read_excel(resource, CrawlerTool.filename_dict.get("sheet_name"), 1255, 1, 1, 2)

        # Column holding the kitchen id, derived from taglist so it cannot
        # drift when the field list changes.
        id_column = self.taglist.index("kitchen_id")
        seen_ids = set()
        # The header is the first row of the output.
        unique_rows = [self.tag_name]

        # Nearby coordinates return overlapping kitchens, so each kitchen is
        # kept only the first time its id is seen.
        for t, location in enumerate(lon_latlist):
            print(t, "、", location)
            self.body['coordinate'] = location
            response = CrawlerTool.get_response('POST', self.url, data=self.body)
            count = CrawlerTool.response_convert('JSON', response)['data']['count']
            if count == 0:
                continue
            # Second request with 'size' set to the reported count.
            self.body['size'] = count
            response = CrawlerTool.get_response('POST', self.url, data=self.body)
            kitchen_list = CrawlerTool.response_convert('JSON', response)['data']['list']
            for row in self.read_response(kitchen_list):
                if row[id_column] not in seen_ids:
                    seen_ids.add(row[id_column])
                    unique_rows.append(row)

        for row in unique_rows:
            CrawlerTool.save_csv(row, out_addr)


class DishList:
    """Crawl the dish list of every kitchen id and save it as CSV."""

    def __init__(self):
        # CSV header: category title, one column per taglist key, activity_tags.
        self.tag_name = ["菜分类title", "box_fee餐盒费用", "collect_cnt收藏人数", "菜品介绍", "菜品id", "菜名", "菜品推荐标签", "吃过的人数", "是否有主食？",
                         "菜品图片链接", "缩略图链接", "is_collect", "是否新菜品",
                         "is_shelves", "厨房id", "max_discount_dish_preorder", "new_dish", "菜品价格", "推荐", "sequence",
                         "special_dish", "special_limit", "special_notice",
                         "special_price特价", "主食", "staple_num", "主食每份价格", "stock", "stock_notice", "tmr_only", "toast",
                         "type菜品类型", "activity_tags"]
        # Keys read directly from each dish dict.
        self.taglist = ["box_fee", "collect_cnt", "description", "dish_id", "dish_name", "dish_recommend_tag", "eat_num",
                        "has_staple", "image_url", "thumbnail_image_url", "is_collect", "is_new",
                        "is_shelves", "kitchen_id", "max_discount_dish_preorder", "new_dish", "price", "recommend", "sequence",
                        "special_dish", "special_limit", "special_notice",
                        "special_price", "staple_name", "staple_num", "staple_price", "stock", "stock_notice", "tmr_only",
                        "toast", "type"]
        # Form payload of the request; 'kitchen_id' is overwritten in main().
        self.body = {'_cityid': '', 'user_coordinate': '', '_platform': 'Android', '_device': 'awift09:00:27:d6:07:6a',
                     '_osversion': '5.1.1', '_screen': '720x1280', '_version': '3.9.7', 'kitchen_id': '313589',
                     'channel': 'huijiachifan', '_time': '2019-05-23 21:45:52', '_build': '58', 'date_type': '0',
                     'coordinate': ''}
        self.url = "https://user.mapi.jiashuangkuaizi.com/Kitchen/dishList"
        # Dish fields that are stringified as a whole.
        self.item_type_is_list = ['activity_tags']

    @staticmethod
    def _clean(value):
        """Return ``str(value)`` with newlines, CRs and ASCII commas made CSV-safe."""
        return re.sub(r'\n|,|\r', '，', str(value))

    def to_read(self, response):
        """Flatten ``response["data"]["dish_list"]`` into one row per dish.

        Row layout: the category title, then one sanitised string per
        ``taglist`` key, then one sanitised string per ``item_type_is_list``
        key.  A key missing from a dish is written as ``'0'``.
        """
        result_list = []
        for title_dishlist in response["data"]["dish_list"]:
            title = title_dishlist.get("title")
            # A category without a "list" entry simply contributes no rows.
            for dish in title_dishlist.get("list") or []:
                row = [title]
                for key in self.taglist + self.item_type_is_list:
                    # Values are stringified so commas/newlines can be replaced.
                    row.append(self._clean(dish[key]) if key in dish else '0')
                result_list.append(row)
        return result_list

    def main(self):
        """Fetch the dishes of every kitchen id in the id file and append them to the CSV."""
        out_addr = CrawlerTool.get_context(CrawlerTool.data_context)

        CrawlerTool.create_dirs(out_addr)
        out_addr += CrawlerTool.filename_dict.get("bj_dish")
        # The header is the first row of the output.
        CrawlerTool.save_csv(self.tag_name, out_addr)

        # Read the kitchen id list.
        resource = CrawlerTool.resource_context+CrawlerTool.filename_dict.get("kid")
        id_list = CrawlerTool.read_csv(resource)

        # Request, parse and store the dishes of each kitchen.
        for row in id_list:
            if not row:
                continue  # blank line in the id file
            # read_csv yields each row as a list; the id is its first column.
            self.body["kitchen_id"] = row[0]
            response = CrawlerTool.get_response('POST', self.url, self.body)
            json_data = CrawlerTool.response_convert('JSON', response)
            if not json_data["data"]:
                continue  # nothing returned for this kitchen
            for dish_row in self.to_read(json_data):
                CrawlerTool.save_csv(dish_row, out_addr)


class KitchenComment:
    """Crawl every comment page of every kitchen id and save it as CSV."""

    def __init__(self):
        self.url = "https://user.mapi.jiashuangkuaizi.com/UComment/getListByKitchenIdTags"
        # CSV header: total comment count, one column per taglist key, then
        # the nested items in the order produced by read_response()
        # (praise: name, id, is_praise; awful: name, id, is_praise; ...).
        self.tag_name = ["卖方评论总数", "买方年龄段", "买方头像", "评论的id", "评论内容", "评论时间", "送餐服务星数", "送餐类型", "买家id", "评论晒图数量", "评论图片链接",
                         "评论图片缩略图", "也是买家id", "is_myself",
                         "is_praised", "厨房id", "厨房图片链接", "厨房名称", "菜品星数", "买家昵称", '买家性别', "吃过次数", "工作", "order_id",
                         "order_no", "买家电话", "平台回复", "评论点赞数", "reply_display", 'send_type', 'sender_type',
                         '点赞菜品名称', '点赞菜品id', '点赞菜品is_praise', '点踩菜品名称', '点踩菜品id', '点踩菜品is_praise', '商家回复内容', '商家回复时间',
                         '商家回复中的sender_type', '送餐评论标签', '订单评论标签']
        # Keys read directly from each comment dict.
        self.taglist = ["age", "avatar_url", "comment_id", "content", "create_time", "express_star", "express_type", "id",
                        "image_cnt", "image_url", "thumbnail_image_url", "user_id", "is_myself",
                        "is_praised", "kitchen_id", "kitchen_image_url", "kitchen_name", "star", "nickname", 'sex',
                        "ordinal", "occupation", "order_id", "order_no", "phone", "platform", "praise_num", "reply_display",
                        'send_type', 'sender_type']
        # Form payload of the request; 'kitchen_id' and 'page' are overwritten in main().
        self.body = {'_cityid': '', 'user_coordinate': '', '_platform': 'Android', '_device': 'awift18:00:33:d6:07:6a',
                     '_osversion': '5.1.1', '_screen': '720x1280', '_version': '3.9.7', 'channel': 'huijiachifan',
                     '_time': '2019-05-26 00:20:52', '_build': '58', 'coordinate': '', 'size': '100',
                     'page': '1', 'tag_id': '0', 'tag_type': '0', 'kitchen_id': '216471', 'content': '0'}
        # Comment fields whose value is a list of dicts, and the keys taken from each.
        self.item_type_is_list = ['praise', 'awful', 'children', 'express_tag', 'order_tag']
        self.praise_item = ['dish_name', 'dish_id', 'is_praise']
        self.awful_item = ['dish_name', 'dish_id', 'is_praise']
        self.children_item = ['content', 'create_time', 'sender_type']
        self.express_item = ['distr_tag']
        self.order_item = ['ordered_tag']
        self.listordict_name_keyvalue = {'praise': self.praise_item, 'awful': self.awful_item, 'children': self.children_item,
                                         'express_tag': self.express_item, 'order_tag': self.order_item}

    def read_response(self, commentlist, *args):
        """Flatten every comment dict of ``commentlist`` into one row.

        Row layout: the extra ``args`` values unchanged, then one sanitised
        string per ``taglist`` key, then one stringified list per key of each
        list-typed field.

        Raises:
            KeyError: If a comment lacks one of the ``taglist`` keys.
        """
        result_list = []
        for comment in commentlist:
            row = list(args)
            for tag in self.taglist:
                # Values are stringified so commas/newlines can be replaced.
                row.append(re.sub(r'\n|,|\r', '，', str(comment[tag])))
            for listname in self.item_type_is_list:
                for itemname in self.listordict_name_keyvalue.get(listname):
                    row.append(CrawlerTool.add_liststr_in_otherlist(comment, listname, itemname))
            result_list.append(row)
        return result_list

    def main(self):
        """Fetch all comment pages of every kitchen id and append them to the CSV."""
        out_addr = CrawlerTool.get_context(CrawlerTool.data_context)

        CrawlerTool.create_dirs(out_addr)
        out_addr += CrawlerTool.filename_dict.get("kcomment")
        # The header is the first row of the output.
        CrawlerTool.save_csv(self.tag_name, out_addr)

        # Read the kitchen id file.
        resource = CrawlerTool.resource_context+CrawlerTool.filename_dict.get("kid")
        id_list = CrawlerTool.read_csv(resource)

        count = 0
        for row in id_list:
            if not row:
                continue  # blank line in the id file
            # read_csv yields each row as a list; the id is its first column.
            kitchen_id = row[0]
            self.body['kitchen_id'] = kitchen_id
            response = CrawlerTool.get_response('POST', self.url, data=self.body)
            response = CrawlerTool.response_convert('JSON', response)
            print(response)
            if not response["data"]:
                continue  # no comments for this kitchen
            total = response['data']['total']
            totalpage = response['data']['totalPage']
            print('商家id：', kitchen_id, '  总评论数：', total, '  总页数：', totalpage)
            for page in range(1, totalpage + 1):
                self.body['page'] = page
                response = CrawlerTool.get_response('POST', self.url, data=self.body)
                commentlist = CrawlerTool.response_convert('JSON', response)['data']['list']
                print(commentlist)
                for comment in self.read_response(commentlist, total):
                    CrawlerTool.save_csv(comment, out_addr)
            count += 1
            # Start the next kitchen from its first page again.
            self.body['page'] = 1
            print('已爬完', count, '个商家全部评论')


class UserComment:
    """Crawl the comments written by every user id and save them as CSV."""

    def __init__(self):
        self.url = 'https://user.mapi.jiashuangkuaizi.com/UComment/getUserComments'
        # Keys read directly from each comment dict.
        self.taglist = ["content", "create_time", "star", "comment_id", 'ordinal', "order_no", "praise_num", "image_url",
                        "thumbnail_image_url", "is_myself", "is_praised", "reply_display", 'send_type', 'platform']
        # CSV header: one column per taglist key, then the nested items in the
        # order produced by read_response() (praise: name, id, is_praise; ...).
        self.tagname = ['评论内容', '评论时间', '评论星数', '评论id', 'ordinal', '订单编号', '评论点赞数', '评论图片链接', '评论缩略图链接', 'is_myself',
                        'is_praised', 'reply_display', 'send_type', '平台回复', '点赞菜品名称', '点赞菜品id', 'is_praise', '商家回复', '回复时间',
                        'sender_type', '用户id', '用户昵称', '用户头像', '厨房id', '厨房图片链接', '厨房名称', '平均星数', '月销量', '商家籍贯']
        # Form payload of the request; 'user_id' is overwritten in main().
        self.body = {'_cityid': '', 'user_coordinate': '', '_platform': 'Android', '_device': 'awift18:00:33:d6:07:6a',
                     '_osversion': '5.1.1', '_screen': '720x1280', '_version': '3.9.7', 'channel': 'huijiachifan',
                     '_time': '2019-05-26 00:20:52', '_build': '58', 'coordinate': '', 'size': '6000',
                     'page': '', 'user_id': '101471'}
        # Comment fields whose value is a list of dicts / a single dict, and
        # the keys taken from each.
        self.item_type_is_list = ['praise', 'children']
        self.item_type_is_dict = ['user_info', 'kitchen']
        self.praise_item = ['dish_name', 'dish_id', 'is_praise']
        self.children_item = ['content', 'create_time', 'sender_type']
        self.userinfo_item = ['user_id', 'nickname', 'avatar_url']
        self.kitchen_item = ['kitchen_id', 'kitchen_image_url', 'kitchen_name', 'avg_star', 'month_sale', 'native_place']
        self.listordict_name_keyvalue = {'praise': self.praise_item, 'children': self.children_item, 'user_info': self.userinfo_item,
                                         'kitchen': self.kitchen_item}

    @staticmethod
    def _clean(value):
        """Return ``str(value)`` with newlines, CRs and ASCII commas made CSV-safe."""
        return re.sub(r'\n|,|\r', '，', str(value))

    def read_response(self, commentlist):
        """Flatten every comment dict of ``commentlist`` into one row of strings.

        Row layout: one sanitised string per ``taglist`` key, then one
        stringified list per key of each list-typed field, then the sanitised
        values of each dict-typed field (free text such as the nickname must
        not be able to break the comma-separated output).

        Raises:
            KeyError: If a comment lacks a ``taglist`` key or a dict-typed field.
        """
        result_list = []
        for comment in commentlist:
            row = [self._clean(comment[tag]) for tag in self.taglist]
            for listname in self.item_type_is_list:
                for itemname in self.listordict_name_keyvalue.get(listname):
                    row.append(CrawlerTool.add_liststr_in_otherlist(comment, listname, itemname))
            for dictname in self.item_type_is_dict:
                for itemname in self.listordict_name_keyvalue.get(dictname):
                    row.append(self._clean(comment[dictname][itemname]))
            result_list.append(row)
        return result_list

    def main(self):
        """Fetch the comments of every user id in the id file and append them to the CSV."""
        out_addr = CrawlerTool.get_context(CrawlerTool.data_context)

        CrawlerTool.create_dirs(out_addr)
        out_addr += CrawlerTool.filename_dict.get("ucomment")
        # The header is the first row of the output.
        CrawlerTool.save_csv(self.tagname, out_addr)

        # Read the user id file.
        resource = CrawlerTool.resource_context+CrawlerTool.filename_dict.get("uid")
        userid_list = CrawlerTool.read_csv(resource)

        count = 0
        for row in userid_list:
            if not row:
                continue  # blank line in the id file
            # read_csv yields each row as a list; the id is its first column.
            self.body['user_id'] = row[0]
            response = CrawlerTool.get_response('POST', self.url, data=self.body)
            response = CrawlerTool.response_convert('JSON', response)
            comments = response['data']['list']
            # A dict in place of the comment list is skipped, as before.
            if isinstance(comments, dict):
                continue
            for comment in self.read_response(comments):
                CrawlerTool.save_csv(comment, out_addr)
            count += 1
            print('已爬取', count, '个用户的评论')


class KitchenTag:
    """Crawl the detail tags of every kitchen id and save them as CSV."""

    def __init__(self):
        # Form payload of the request; 'kitchen_id' is overwritten in main().
        self.body = {'_cityid': '', 'user_coordinate': '', '_platform': 'Android', '_device': 'awift18:00:33:d6:07:6a',
                     '_osversion': '5.1.1', '_screen': '720x1280', '_version': '3.9.7', 'channel': 'huijiachifan',
                     '_time': '2019-05-26 00:20:52', '_build': '58', 'coordinate': '',
                     'kitchen_id': '45810'}
        self.url = 'https://user.mapi.jiashuangkuaizi.com//UKitchen/detailTags'
        # CSV header: kitchen id, taglist keys, auth_msg_list items, comment_tag items.
        self.tagname = ["厨房id", "评论总数", "description描述", "图标", "inner_description", "状态", "标题", "类型",
                        "tag_count_all", "tag_count_content", "tag_id", "tag_name标签名", "tag_type"]
        # Keys read directly from the response data.
        self.taglist = ['comment_num']
        # Data fields whose value is a list of dicts, and the keys taken from each.
        self.item_type_is_list = ['auth_msg_list', 'comment_tag']
        self.auth_item_list = ['description', 'icon', 'inner_description', 'status', 'title', 'type']
        self.tag_item_list = ['tag_count_all', 'tag_count_content', 'tag_id', 'tag_name', 'tag_type']
        self.listordict_name_keyvalue = {'auth_msg_list': self.auth_item_list, 'comment_tag': self.tag_item_list}

    def read_response(self, response):
        """Flatten ``response['data']`` into a single row.

        Row layout: one value per ``taglist`` key, then one stringified list
        per key of each list-typed field.
        """
        data = response.get('data')
        result_list = [data.get(tag) for tag in self.taglist]
        for listname in self.item_type_is_list:
            for itemname in self.listordict_name_keyvalue.get(listname):
                result_list.append(CrawlerTool.add_liststr_in_otherlist(data, listname, itemname))
        return result_list

    def main(self):
        """Fetch the tags of every kitchen id in the id file and append them to the CSV."""
        out_addr = CrawlerTool.get_context(CrawlerTool.data_context)

        CrawlerTool.create_dirs(out_addr)
        out_addr += CrawlerTool.filename_dict.get("ktag")

        # The header is the first row of the output.
        CrawlerTool.save_csv(self.tagname, out_addr)
        resource = CrawlerTool.resource_context+CrawlerTool.filename_dict.get("kid")
        id_list = CrawlerTool.read_csv(resource)
        for row in id_list:
            if not row:
                continue  # blank line in the id file
            # read_csv yields each row as a list; the id is its first column.
            # Using the row itself would write "['45810']" into the id column.
            kitchen_id = row[0]
            self.body['kitchen_id'] = kitchen_id
            response = CrawlerTool.get_response("POST", self.url, data=self.body)
            payload = CrawlerTool.response_convert("JSON", response)
            # Same empty-data guard as the other crawlers in this file.
            if not payload.get('data'):
                continue
            result_list = self.read_response(payload)
            result_list.insert(0, kitchen_id)
            CrawlerTool.save_csv(result_list, out_addr)


# Mount Google Drive: the resource/data paths in CrawlerTool live under
# /content/drive/My Drive/.
from google.colab import drive
drive.mount('/content/drive')           
            
# Disabled scheduling loop: together with the commented-out sleep at the
# bottom it would repeat the crawl with a fixed interval.
#while True:
    #interval = 3600*24  # crawl once a day

# Disabled crawlers; uncomment to run them as well.
#getKitchen = GetKitchen()
#getKitchen.main()

#dishList = DishList()
#dishList.main()

#kitchenComment = KitchenComment()
#kitchenComment.main()

# Crawlers that are currently run: user comments first, then kitchen tags.
userComment = UserComment()
userComment.main()

kitchenTag = KitchenTag()
kitchenTag.main()

    #time.sleep(interval)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
['评论内容', '评论时间', '评论星数', '评论id', 'ordinal', '订单编号', '评论点赞数', '评论图片链接', '评论缩略图链接', 'is_myself', 'is_praised', 'reply_display', 'send_type', '平台回复', '点赞菜品id', '点赞菜品名称', 'is_praise', '商家回复', '回复时间', 'sender_type', '用户id', '用户昵称', '用户头像', '厨房id', '厨房图片链接', '厨房名称', '平均星数', '月销量', '商家籍贯']
请求成功
请求成功
['', '2018/07/30', '5', '5697746', '2', '532621324063428', '0', '', '', '0', '0', 'None', '1', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '2072427', '脚本0127', 'http://image.jiashuangkuaizi.com/images/user/2072427/1501295405_51992.jpg', '279044', 'http://image.jiashuangkuaizi.com/abyss/160101/kitchener/2017-11-29/11018-89872C64875F6493.jpg', 'Tiramisu', '-1', '16', '黑龙江 佳木斯人']
['', '2018/07/27', '5', '5693596', '1', '532146765067034', '0', '', '', '0', '0', 'None', '1', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '2072427', '脚本0127', 'http://image.jiashuangkuaizi.com/images/