In [1]:
import spacy

import pandas as pd
import os
from itertools import islice
from collections import defaultdict
from functools import reduce
# Load SpaCy English model
# ! python -m spacy download en_core_web_trf
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("sentencizer")

  from .autonotebook import tqdm as notebook_tqdm


<spacy.pipeline.sentencizer.Sentencizer at 0x27491039a80>

In [2]:
# Helper function to window iterate
def window(seq, n=2):
    """
    Returns a sliding window (of width n) over data from the iterable"
    "   s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ... 
    """
    it = iter(seq)
    result = tuple(islice(it, n))
    if len(result) == n:
        yield result
    for elem in it:
        result = result[1:] + (elem,)
        yield result

## Reading in Data

In [3]:
pwd = os.path.dirname("")
nhc_path = os.path.abspath(os.path.join(pwd, "latest_nhc_data.csv"))
out_file = os.path.abspath(os.path.join(pwd, "test_parse.xlsx"))

# Read data
nhc_data = pd.read_csv(nhc_path)


In [10]:
nhc_data

Unnamed: 0,date,url,read_time,content
0,2022-04-07,http://en.nhc.gov.cn/2022-04/07/c_85911.htm,2022-04-08T03:32:46Z,"On April 6, 31 provincial-level regions and th..."
1,2022-04-06,http://en.nhc.gov.cn/2022-04/06/c_85910.htm,2022-04-06T04:00:00Z,"On April 5, 31 provincial-level regions and th..."
2,2022-04-05,http://en.nhc.gov.cn/2022-04/05/c_85909.htm,2022-04-06T04:00:00Z,"On April 4, 31 provincial-level regions and th..."
3,2022-04-04,http://en.nhc.gov.cn/2022-04/04/c_85908.htm,2022-04-06T04:00:00Z,"On April 3, 31 provincial-level regions and th..."
4,2022-04-03,http://en.nhc.gov.cn/2022-04/03/c_85907.htm,2022-04-06T04:00:00Z,"On April 2, 31 provincial-level regions and th..."
...,...,...,...,...
719,2020-02-03,http://en.nhc.gov.cn/2020-02/03/c_76225.htm,2022-03-23T04:00:00Z,"On Feb 2, 31 provincial-level regions on the C..."
720,2020-02-01,http://en.nhc.gov.cn/2020-02/01/c_76084.htm,2022-03-23T04:00:00Z,Security inspector measures a passenger's temp...
721,2020-01-31,http://en.nhc.gov.cn/2020-01/31/c_76065.htm,2022-03-23T04:00:00Z,A medical worker in hazmat suit speaks with su...
722,2020-01-30,http://en.nhc.gov.cn/2020-01/30/c_76048.htm,2022-03-23T04:00:00Z,Chinese health authorities announced on Jan 30...


## Trying to parse with SpaCy

In [5]:
content_to_parse = nhc_data["content"][0]

# Using English language model 
parsed_content = nlp(content_to_parse)

# print([(ent.text, ent.label_) for ent in parsed_content.ents])
parsed_split = list(parsed_content.sents)
parsed_split


[On April 6, 31 provincial-level regions and the Xinjiang Production and Construction Corps on the Chinese mainland reported 1,323 new cases of confirmed infections (39 imported cases, 15 in Fujian province, 11 in Sichuan province, 7 in Shanghai municipality, 1 in Beijing municipality, 1 in Liaoning province, 1 in Jiangxi province, 1 in Hunan province, 1 in Guangdong province and 1 in Yunnan province, including 10 confirmed cases converting from asymptomatic cases, 9 in Sichuan province and 1 in Guangdong province; 1,284 indigenous cases, 890 in Jilin province including 766 in Changchun, 112 in Jilin city, 11 in Baicheng and 1 in Siping, 322 in Shanghai municipality including 150 in Pudong New Area, 31 in Changning district, 28 in Putuo district, 27 in Jing’an district, 21 in Minhang district, 19 in Xuhui district, 12 in Yangpu district, 9 in Huangpu district, 9 in Baoshan district, 6 in Jiading district, 4 in Qingpu district, 3 in Jinshan district, 2 in Songjiang district and 1 in Hon

In [6]:

def get_case_counts(parsed_data: list):
    """
    Given a list of Docs, identify the localities and their case counts
    and return as a list of dicts
    """
    out = []

    for sent in parsed_data:
        sub_out = []
        dd_out = defaultdict(list)

        for tup in window(sent.ents, 2):
            prev_token = tup[0]
            ent = tup[1]
            
            numberish = prev_token.text.replace(",", "").isdigit()

            # If we find a province / city name
            if ent.label_ in ("ORG", "GPE", "LOC"):
                # and the previous token was number-like
                if prev_token[0].like_num and numberish:
                    # Append it to the list, with key equal to the location
                    # and the value
                    sub_out.append({ent.text: prev_token.text})
                

        # Grow each key (province / city) if there are multiple entries
        for dic in sub_out:
            for k, v in dic.items():
                # Convert text to int to avoid oddities with extend
                dd_out[k].extend([int(v.replace(",", ""))])
        out.append(dd_out)



    return out

# Pull case counts
case_counts = get_case_counts(parsed_split)

case_counts[0]

defaultdict(list,
            {'Fujian province': [15, 7, 5],
             'Sichuan province': [11, 6],
             'Shanghai': [7, 322, 15, 4],
             'Beijing': [1, 1],
             'Liaoning province': [1],
             'Jiangxi province': [1],
             'Hunan province': [1],
             'Guangdong province': [1],
             'Yunnan province': [1],
             'Sichuan': [9, 10],
             'Guangdong': [1],
             'Jilin province': [890, 53],
             'Changchun': [766],
             'Jilin city': [112],
             'Siping': [1],
             'Pudong': [150],
             'Changning district': [31],
             'Putuo': [28],
             'Jing’an district': [27],
             'Minhang': [21],
             'Xuhui': [19],
             'Yangpu': [12],
             'Huangpu district': [9],
             'Baoshan': [9],
             'Jiading': [6],
             'Qingpu': [4],
             'Jinshan': [3],
             'Songjiang': [2],
             'Hongkou'

### "flatten" counts

By extending the list, we captured multiple entries for case counts per locality.  
That's a helpful structure if we decided later to disaggregate counts, but we just want the total.

Easy sum operation, but will the counts align?

In [7]:
def flatten_cases(cases: dict):
    out = cases.copy()
    for k, v in out.items():
        out.update({k: sum(v)})
    return out


flat_cases = [flatten_cases(a) for a in case_counts if len(a) > 0]
frames = [pd.DataFrame(a, index = ["cases"]) for a in flat_cases]

frames

[       Fujian province  Sichuan province  Shanghai  Beijing  \
 cases               27                17       348        2   
 
        Liaoning province  Jiangxi province  Hunan province  \
 cases                  1                 1               1   
 
        Guangdong province  Yunnan province  Sichuan  ...  Tongliao  Shenyang  \
 cases                   1                1       19  ...         1         1   
 
        Fuyang  Zhoukou  Qianxinan Bouyei  Shandong province  Hainan province  \
 cases       1        1                 1                  2                2   
 
        Inner Mongolia  Anhui  Henan province  
 cases               1      1               1  
 
 [1 rows x 69 columns],
        Shanghai  Pudong  Minhang  Jiading  Xuhui  Huangpu district  Putuo  \
 cases     19660    8296     2387     1402   1087              1041   1008   
 
        Songjiang  Hongkou  Baoshan  ...  Foshan  Sichuan  Leshan  Chengdu  \
 cases        781      669      653  ...       4        

### Tidying

Pivot and combine the the list of tables into a single data frame.  

(with the idea being that we now have columns for confirmed and asymptomatic by locality)

In [8]:

def transpose_and_combine(dfs: list, metrics: list=["confirmed", "asymptomatic"]):
    """
    Given a list of Dfs in an expected order,
    transpose and column-bind
    """
    out_list = []
    for item in zip(dfs, metrics):
        df = item[0].T.reset_index()
        df.columns = ["Location", item[1]]
        out_list.append(df)
    
    out = reduce(lambda left,right: pd.merge(left, right, on='Location'), out_list)

    return out

transpose_and_combine(frames).to_csv("2020_04_06_extracted.csv")

In [11]:
def extract_case_counts(raw_text: str, date: str):
    # Using English language model 
    parsed_content = nlp(raw_text)

    parsed_split = list(parsed_content.sents)
    
    case_counts = get_case_counts(parsed_split)
    flat_cases = [flatten_cases(a) for a in case_counts if len(a) > 0]
    frames = [pd.DataFrame(a, index = ["cases"]) for a in flat_cases]

    out = transpose_and_combine(frames)

    return (date, out)

extracted = list(map(extract_case_counts, nhc_data["content"], nhc_data["date"]))

TypeError: reduce() of empty sequence with no initial value

In [None]:
with pd.ExcelWriter(out_file) as writer:
    for day in extracted:
        day[1].to_excel(writer, sheet_name=day[0], index=False)