In [8]:
import pandas as pd
import numpy as np

In [9]:
df = pd.read_csv("CityData.csv")

df

Unnamed: 0,countryName,provinceName,cityName,city_confirmedCount,city_suspectedCount,city_curedCount,city_deadCount,updateTime
0,中国,黑龙江省,哈尔滨,231,8,195,4,2020/4/18
1,中国,黑龙江省,双鸭山,52,0,49,3,2020/4/18
2,中国,黑龙江省,绥化,47,0,43,4,2020/4/18
3,中国,黑龙江省,鸡西,46,0,46,0,2020/4/18
4,中国,黑龙江省,齐齐哈尔,43,0,42,1,2020/4/18
...,...,...,...,...,...,...,...,...
25535,中国,贵州省,铜仁,1,0,0,0,2020/1/24
25536,中国,贵州省,黔南州,1,0,0,0,2020/1/24
25537,中国,甘肃省,兰州,1,0,0,0,2020/1/24
25538,中国,甘肃省,白银,1,0,0,0,2020/1/24


In [10]:
# merge and sum data with the same province and the same updateTime, dropping countryName and cityName
df = (
    df.drop("countryName", axis=1)
    .groupby(["updateTime", "provinceName"])
    .sum()
    .drop("cityName", axis=1)
)

df = df.reset_index()
df["updateTime"] = pd.to_datetime(df["updateTime"])
df = df.sort_values(by=["provinceName", "updateTime"])
df

Unnamed: 0,updateTime,provinceName,city_confirmedCount,city_suspectedCount,city_curedCount,city_deadCount
81,2020-01-27,上海市,53,0,3,0
111,2020-01-28,上海市,66,0,3,0
140,2020-01-29,上海市,96,0,5,1
169,2020-01-30,上海市,112,0,0,1
199,2020-01-31,上海市,165,0,5,1
...,...,...,...,...,...,...
1686,2020-04-14,黑龙江省,493,0,469,13
1702,2020-04-15,黑龙江省,501,8,469,13
1715,2020-04-16,黑龙江省,505,8,469,13
1732,2020-04-17,黑龙江省,508,8,469,13


In [11]:
# calculate cumulated confirmed

# Step 1: Create a complete date range for all provinces
all_dates = pd.date_range(
    start=df["updateTime"].min(), end=df["updateTime"].max()
)
all_provinces = df["provinceName"].unique()

# Create a complete DataFrame with all combinations of dates and provinces
complete_df = pd.DataFrame(
    {
        "updateTime": np.tile(all_dates, len(all_provinces)),
        "provinceName": np.repeat(all_provinces, len(all_dates)),
    }
)

# Step 2: Merge the complete DataFrame with the original DataFrame
df = pd.merge(complete_df, df, on=["updateTime", "provinceName"], how="left")

# Step 3: Fill missing values with zeros or appropriate values
df["city_confirmedCount"] = df["city_confirmedCount"].fillna(0)
df["city_suspectedCount"] = df["city_suspectedCount"].fillna(0)
df["city_deadCount"] = df["city_deadCount"].fillna(0)
df["city_curedCount"] = df["city_curedCount"].fillna(0)

# Step 4: Calculate the adjusted confirmed count
df["confirmed_adjusted"] = df["city_confirmedCount"] - df["city_deadCount"] - df["city_curedCount"]

# Step 5: Calculate the cumulative sum
df["cumulated_confirmed"] = df.groupby("provinceName")["confirmed_adjusted"].cumsum()

df

Unnamed: 0,updateTime,provinceName,city_confirmedCount,city_suspectedCount,city_curedCount,city_deadCount,confirmed_adjusted,cumulated_confirmed
0,2020-01-24,上海市,0.0,0.0,0.0,0.0,0.0,0.0
1,2020-01-25,上海市,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-01-26,上海市,0.0,0.0,0.0,0.0,0.0,0.0
3,2020-01-27,上海市,53.0,0.0,3.0,0.0,50.0,50.0
4,2020-01-28,上海市,66.0,0.0,3.0,0.0,63.0,113.0
...,...,...,...,...,...,...,...,...
2661,2020-04-14,黑龙江省,493.0,0.0,469.0,13.0,11.0,8642.0
2662,2020-04-15,黑龙江省,501.0,8.0,469.0,13.0,19.0,8661.0
2663,2020-04-16,黑龙江省,505.0,8.0,469.0,13.0,23.0,8684.0
2664,2020-04-17,黑龙江省,508.0,8.0,469.0,13.0,26.0,8710.0


In [12]:
df = df.drop("confirmed_adjusted", axis=1)
df

Unnamed: 0,updateTime,provinceName,city_confirmedCount,city_suspectedCount,city_curedCount,city_deadCount,cumulated_confirmed
0,2020-01-24,上海市,0.0,0.0,0.0,0.0,0.0
1,2020-01-25,上海市,0.0,0.0,0.0,0.0,0.0
2,2020-01-26,上海市,0.0,0.0,0.0,0.0,0.0
3,2020-01-27,上海市,53.0,0.0,3.0,0.0,50.0
4,2020-01-28,上海市,66.0,0.0,3.0,0.0,113.0
...,...,...,...,...,...,...,...
2661,2020-04-14,黑龙江省,493.0,0.0,469.0,13.0,8642.0
2662,2020-04-15,黑龙江省,501.0,8.0,469.0,13.0,8661.0
2663,2020-04-16,黑龙江省,505.0,8.0,469.0,13.0,8684.0
2664,2020-04-17,黑龙江省,508.0,8.0,469.0,13.0,8710.0


In [13]:
df["updateTime"] = df["updateTime"].astype(str)

In [14]:
result = {}
for updateTime, group in df.groupby("updateTime"):
    result[updateTime] = group[["provinceName", "cumulated_confirmed"]].values.tolist()

import json

with open("data.json", "w") as outfile:
    json.dump(result, outfile, ensure_ascii=False)