In [6]:
import os
import re
import pandas as pd
import yaml
from metadata import get_china_province_code, get_china_city_code, get_china_area_name

def read_report(fp):
    with open(fp, "r") as f:
        return yaml.load(f, Loader=yaml.Loader)

def parse_int(text):
    if text:
        result = re.search("\\d+", text)
        if result:
            return result.group()

def parse_list(text, keys):
    if text:
        result = []
        for i in re.finditer("([\u4e00-\u9fa5]+)(\\d+)", text):
            result.append({
                keys[0]: i.group(1),
                keys[1]: i.group(2)
            })
        return result

def normalize_list(area_list, area_key):
    if area_key == "province":
        for x in area_list:
            name = x[area_key]
            x[area_key] = get_china_area_name(get_china_province_code(name), name)
    else:
        for x in area_list:
            name = x[area_key]
            x[area_key] = get_china_city_code(get_china_province_code(name), name)

def parse_report(report):
    date = str(report["时间"])
    area_key = "province"
    province = report.get("省")
    confirmed = parse_int(report.get("确诊"))
    suspected = parse_int(report.get("疑似"))
    cured = parse_int(report.get("治愈"))
    dead = parse_int(report.get("死亡"))
    area_key = "city" if province else "province"
    confirmed_list = parse_list(report.get("确诊详情"), [area_key, "confirmed"])
    suspected_list = parse_list(report.get("疑似详情"), [area_key, "suspected"])
    cured_list = parse_list(report.get("治愈详情"), [area_key, "cured"])
    dead_list = parse_list(report.get("死亡详情"), [area_key, "dead"])

    data = None
    if confirmed or suspected or cured or dead:
        data = {
            "confirmed": confirmed,
            "suspected": suspected,
            "cured": cured,
            "dead": dead
        }
        if province:
            data["provinceCode"] = get_china_province_code(province)
            data["province"] = get_china_area_name(data["provinceCode"], province)

    for data_list in [confirmed_list, suspected_list, cured_list, dead_list]:
        if data_list:
            for x in data_list:
                if province:
                    x["provinceCode"] = get_china_province_code(province)
                    x["province"] = get_china_area_name(x["provinceCode"], province)
                    x["cityCode"] = get_china_city_code(x["provinceCode"], x["city"])
                    x["city"] = get_china_area_name(x["cityCode"], x["city"])
                else:
                    x["provinceCode"] = get_china_province_code(x["province"])
                    x["province"] = get_china_area_name(x["provinceCode"], x["province"])

    df_list = [pd.DataFrame(x) for x in [confirmed_list, suspected_list, cured_list, dead_list] if x]
    df = None
    for index, x in enumerate(df_list):
        if df is None:
            df = x
        else:
            df = pd.merge(df, x, on=area_key, how="outer", suffixes=["", f"""_{index}"""], sort=False, copy=False)

    columns = ["date","country","countryCode","province","provinceCode","city","cityCode","confirmed","suspected","cured","dead"]
    if df is None:
        df = pd.DataFrame([data], columns=columns)
    else:
        df = pd.DataFrame(df, columns=columns)
        df = df.append([data])
    df["date"] = date
    df["country"] = "中国"
    df["countryCode"] = "CN"
    df["province"].fillna("", inplace=True)
    df["provinceCode"].fillna("", inplace=True)
    df["city"].fillna("", inplace=True)
    df["cityCode"].fillna("", inplace=True)
    df.sort_values(["date", "countryCode", "provinceCode", "cityCode"], inplace=True)
    return df

for r in os.listdir("Report"):
    report = read_report(os.path.join("Report", r))
    report_data = parse_report(report)
    report_data.to_csv(f"""ReportData/{report.get("时间")}{report.get("省", "")}.csv""", index=False)


In [4]:
import os
import pandas as pd

df_list = [pd.read_csv(os.path.join("ReportData", x)) for x in os.listdir("ReportData")]

Unnamed: 0,date,country,countryCode,province,provinceCode,city,cityCode,confirmed,suspected,cured,dead
0,2020-01-24,中国,CN,湖北省,420000,,,729,,32.0,39.0
1,2020-01-24,中国,CN,湖北省,420000,武汉市,420100.0,572,,,38.0
2,2020-01-24,中国,CN,湖北省,420000,十堰市,420300.0,5,,,
3,2020-01-24,中国,CN,湖北省,420000,宜昌市,420500.0,1,,,1.0
4,2020-01-24,中国,CN,湖北省,420000,鄂州市,420700.0,1,,,
5,2020-01-24,中国,CN,湖北省,420000,荆门市,420800.0,21,,,
6,2020-01-24,中国,CN,湖北省,420000,孝感市,420900.0,26,,,
7,2020-01-24,中国,CN,湖北省,420000,荆州市,421000.0,10,,,
8,2020-01-24,中国,CN,湖北省,420000,黄冈市,421100.0,64,,,
9,2020-01-24,中国,CN,湖北省,420000,随州市,421300.0,5,,,


In [6]:
import os

sorted(os.listdir("ReportData"))

['2020-01-20.csv',
 '2020-01-20湖北省.csv',
 '2020-01-21.csv',
 '2020-01-21湖北省.csv',
 '2020-01-22.csv',
 '2020-01-22湖北省.csv',
 '2020-01-23.csv',
 '2020-01-23湖北省.csv',
 '2020-01-24.csv',
 '2020-01-24湖北省.csv',
 '2020-01-25.csv',
 '2020-01-25湖北省.csv',
 '2020-01-26.csv',
 '2020-01-26湖北省.csv',
 '2020-01-27.csv',
 '2020-01-27湖北省.csv',
 '2020-01湖北省武汉市.csv']

In [1]:
import pandas as pd

csv_file = "ReportData/2020-01湖北省武汉市.csv"
dtype = {"date": str, "provinceCode": str, "cityCode": str}
columns = ["date","country","countryCode","province","provinceCode","city","cityCode","confirmed","suspected","cured","dead"]
df = pd.read_csv(csv_file, dtype=dtype)
df_country = pd.DataFrame(df.groupby(["date","country","countryCode"],as_index=False).sum(), columns=columns)
df_province = pd.DataFrame(df.groupby(["date","country","countryCode","province","provinceCode"],as_index=False).sum(),columns=columns)
df = pd.concat([df, df_country, df_province])
df["country"].fillna("", inplace=True)
df["countryCode"].fillna("", inplace=True)
df["province"].fillna("", inplace=True)
df["provinceCode"].fillna("", inplace=True)
df["city"].fillna("", inplace=True)
df["cityCode"].fillna("", inplace=True)
df["confirmed"] = df["confirmed"].fillna(0).astype(int)
df["suspected"] = df["suspected"].fillna(0).astype(int)
df["cured"] = df["cured"].fillna(0).astype(int)
df["dead"] = df["dead"].fillna(0).astype(int)
df.drop_duplicates(
    subset=["date", "country", "province", "city"], inplace=True)
df.sort_values(["date", "countryCode", "provinceCode", "cityCode", "city"], inplace=True)
df 
# df.to_csv(csv_file, index=False, encoding='utf-8')