In [51]:
import re
import pandas as pd
import yaml
from metadata import get_china_province_code, get_china_city_code, get_china_area_name

def read_report(fp):
    """Load a YAML report file and return the parsed document.

    Args:
        fp: Path of the YAML file to read.

    Returns:
        Whatever object the YAML document deserializes to.
    """
    # The reports contain Chinese text, so decode explicitly as UTF-8 rather
    # than relying on the platform's default locale encoding.
    with open(fp, "r", encoding="utf-8") as f:
        # NOTE(review): yaml.Loader can construct arbitrary Python objects.
        # That is acceptable for trusted, locally authored reports; switch to
        # yaml.safe_load if these files can ever come from an untrusted source.
        return yaml.load(f, Loader=yaml.Loader)

def parse_int(text):
    """Return the first run of digits found in *text*, as a string.

    Args:
        text: The value to scan. Usually a string, but a non-string value
            (for example an ``int`` produced by YAML for an unquoted number)
            is accepted and converted with ``str`` first.

    Returns:
        The matched digits as a ``str``, or ``None`` when *text* is falsy or
        contains no digits.
    """
    if not text:
        return None
    # str() keeps plain numeric values from raising TypeError in re.search.
    match = re.search(r"\d+", str(text))
    return match.group() if match else None

def parse_list(text, keys):
    """Extract every "<CJK name><digits>" pair from *text* as a list of dicts.

    Args:
        text: String to scan; a falsy value yields ``None``.
        keys: Sequence whose first item is the dict key used for the name and
            whose second item is the dict key used for the digit string.

    Returns:
        ``None`` when *text* is falsy; otherwise a list (possibly empty) with
        one dict per match, in order of appearance. Both values are strings.
    """
    if not text:
        return None
    return [
        {keys[0]: match.group(1), keys[1]: match.group(2)}
        for match in re.finditer("([\u4e00-\u9fa5]+)(\\d+)", text)
    ]

def normalize_list(area_list, area_key):
    """Replace each entry's ``area_key`` value in place via the metadata lookups.

    For ``area_key == "province"`` the value becomes the result of
    ``get_china_area_name``; for any other key it becomes the result of
    ``get_china_city_code``. In both cases the first argument is
    ``get_china_province_code`` applied to the entry's current value.

    Args:
        area_list: Iterable of dicts, each containing ``area_key``.
        area_key: Name of the dict key to rewrite.

    Returns:
        None; the dicts in *area_list* are mutated.
    """
    # NOTE(review): in the non-province case the province code is looked up
    # from what is presumably a city name - confirm this is intended.
    resolve = get_china_area_name if area_key == "province" else get_china_city_code
    for entry in area_list:
        raw_name = entry[area_key]
        entry[area_key] = resolve(get_china_province_code(raw_name), raw_name)

def parse_report(report):
    """Convert one parsed report mapping into a tidy pandas DataFrame.

    The report is read through the keys "时间" (required), "省", "确诊", "疑似",
    "治愈", "死亡" and the matching "...详情" detail keys. When "省" is present
    the detail entries are treated as cities of that province; otherwise they
    are treated as provinces.

    Args:
        report: Mapping produced by ``read_report``.

    Returns:
        A DataFrame with the columns date, country, countryCode, province,
        provinceCode, city, cityCode, confirmed, suspected, cured, dead.
        It holds one row per area found in the detail sections plus, when any
        total is present, one row carrying the report-level totals. Missing
        province/city identifiers are filled with "". Rows are sorted by
        date, countryCode, provinceCode and cityCode. The frame is empty when
        the report carries neither totals nor details.

    Raises:
        KeyError: If the report has no "时间" entry.
    """
    date = str(report["时间"])
    province = report.get("省")
    # Detail sections list cities for a provincial report, provinces otherwise.
    area_key = "city" if province else "province"

    confirmed = parse_int(report.get("确诊"))
    suspected = parse_int(report.get("疑似"))
    cured = parse_int(report.get("治愈"))
    dead = parse_int(report.get("死亡"))

    detail_lists = [
        parse_list(report.get("确诊详情"), [area_key, "confirmed"]),
        parse_list(report.get("疑似详情"), [area_key, "suspected"]),
        parse_list(report.get("治愈详情"), [area_key, "cured"]),
        parse_list(report.get("死亡详情"), [area_key, "dead"]),
    ]
    # Drop sections that are missing (None) or contained no matches ([]).
    detail_lists = [rows for rows in detail_lists if rows]

    # Resolve the report-level province once instead of once per detail row.
    province_code = province_name = None
    if province:
        province_code = get_china_province_code(province)
        province_name = get_china_area_name(province_code, province)

    data = None
    if confirmed or suspected or cured or dead:
        data = {
            "confirmed": confirmed,
            "suspected": suspected,
            "cured": cured,
            "dead": dead
        }
        if province:
            data["provinceCode"] = province_code
            data["province"] = province_name

    for rows in detail_lists:
        for x in rows:
            if province:
                x["provinceCode"] = province_code
                x["province"] = province_name
                x["cityCode"] = get_china_city_code(province_code, x["city"])
                x["city"] = get_china_area_name(x["cityCode"], x["city"])
            else:
                x["provinceCode"] = get_china_province_code(x["province"])
                x["province"] = get_china_area_name(x["provinceCode"], x["province"])

    # Merge on every identifier column rather than on the area name alone.
    # Joining on the name only duplicated the code columns under a "_2"
    # suffix, which (a) left areas that appear only in a later list without
    # their codes and (b) collides with itself once a third list is merged.
    id_columns = ["provinceCode", "province"]
    if province:
        id_columns += ["cityCode", "city"]

    df = None
    for rows in detail_lists:
        frame = pd.DataFrame(rows)
        if df is None:
            df = frame
        else:
            df = pd.merge(df, frame, on=id_columns, how="outer", sort=False)

    columns = ["date","country","countryCode","province","provinceCode","city","cityCode","confirmed","suspected","cured","dead"]
    frames = []
    if df is not None:
        frames.append(df.reindex(columns=columns))
    if data is not None:
        # The totals row goes last, as before; sorting below moves it.
        frames.append(pd.DataFrame([data], columns=columns))
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    # Indexes are intentionally not reset, matching the previous output.
    df = pd.concat(frames) if frames else pd.DataFrame(columns=columns)

    df["date"] = date
    df["country"] = "中国"
    df["countryCode"] = "CN"
    # Assign the filled columns back: chained `df[col].fillna(..., inplace=True)`
    # does not modify the frame under pandas copy-on-write.
    key_columns = ["province", "provinceCode", "city", "cityCode"]
    df[key_columns] = df[key_columns].fillna("")
    df = df.sort_values(["date", "countryCode", "provinceCode", "cityCode"])
    return df

# Parse one provincial report, export it as CSV, and display the result.
report = read_report("report/2020年1月27日湖北省.yaml")
report_data = parse_report(report)
# NOTE(review): `date` is not defined in this cell; presumably it comes from
# an earlier notebook cell - confirm, otherwise this line raises NameError.
csv_path = f"""report_data/{date}{report.get("省", "")}.csv"""
report_data.to_csv(csv_path, index=False)
report_data

Unnamed: 0,date,country,countryCode,province,provinceCode,city,cityCode,confirmed,suspected,cured,dead
0,2020-01-27,中国,CN,湖北省,420000,,,2714,,47.0,100.0
0,2020-01-27,中国,CN,湖北省,420000,武汉市,420100.0,1590,,,85.0
1,2020-01-27,中国,CN,湖北省,420000,黄石市,420200.0,53,,,1.0
2,2020-01-27,中国,CN,湖北省,420000,十堰市,420300.0,65,,,
4,2020-01-27,中国,CN,湖北省,420000,宜昌市,420500.0,51,,,1.0
3,2020-01-27,中国,CN,湖北省,420000,襄阳市,420600.0,70,,,
7,2020-01-27,中国,CN,湖北省,420000,鄂州市,420700.0,57,,,
6,2020-01-27,中国,CN,湖北省,420000,荆门市,420800.0,114,,,3.0
8,2020-01-27,中国,CN,湖北省,420000,孝感市,420900.0,173,,,1.0
5,2020-01-27,中国,CN,湖北省,420000,荆州市,421000.0,71,,,2.0
