In [21]:
import spacy
from spacy.matcher import Matcher

import pandas as pd
import os
from itertools import islice, chain
from collections import defaultdict

# Load the small English spaCy pipeline into the module-level `nlp` object
# that the later cells call. The model package must already be installed;
# if it is not, run the following once:
# ! python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")
# Add the "sentencizer" component to the pipeline. As the last expression in
# the cell, the component object returned by add_pipe is what the cell displays.
# NOTE(review): en_core_web_sm presumably already sets sentence boundaries via
# its parser - confirm whether this extra component changes `doc.sents` at all.
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x26106153f40>

In [2]:
# Helper function to window iterate
def window(seq, n=2):
    """
    Yield overlapping tuples of width n taken from the iterable `seq`.

        s -> (s0, s1, ..., s[n-1]), (s1, s2, ..., sn), ...

    Nothing is yielded when `seq` holds fewer than n items.
    """
    source = iter(seq)
    frame = tuple(islice(source, n))
    if len(frame) != n:
        # Fewer than n items: `source` is already exhausted, nothing to emit
        return
    yield frame
    for item in source:
        # Slide by one: drop the oldest element, append the newest
        frame = (*frame[1:], item)
        yield frame

## Reading in Data

In [16]:
# os.path.dirname("") evaluates to "", so joining with `pwd` leaves the file
# names unchanged and os.path.abspath resolves them against the current
# working directory of the kernel.
pwd = os.path.dirname("")
# Absolute path of the input CSV
nhc_path = os.path.abspath(os.path.join(pwd, "latest_nhc_data.csv"))
# Absolute path of an Excel output file (not written anywhere in the cells
# visible here)
out_file = os.path.abspath(os.path.join(pwd, "test_parse.xlsx"))

# Read the CSV into a DataFrame; later cells read its "content" column
nhc_data = pd.read_csv(nhc_path)


## Trying to parse with spaCy

In [4]:
# Take the entry labelled 0 from the "content" column
# (presumably the first row, assuming the default integer index - TODO confirm)
content_to_parse = nhc_data["content"][0]

# Run the text through the English pipeline loaded above
parsed_content = nlp(content_to_parse)

# Debugging aid: list every entity with its label
# print([(ent.text, ent.label_) for ent in parsed_content.ents])
# Materialise the sentences into a list so they can be indexed and reused
parsed_split = list(parsed_content.sents)
# Last expression of the cell: displays the list of sentences
parsed_split


[On April 5, 31 provincial-level regions and the Xinjiang Production and Construction Corps on the Chinese mainland reported 1,415 new cases of confirmed infections (32 imported cases, 14 in Sichuan province, 5 in Guangdong province, 4 in Shanghai municipality, 4 in Guangxi Zhuang autonomous region, 1 in Tianjin municipality, 1 in Fujian province, 1 in Shandong province, 1 in Chongqing municipality and 1 in Yunnan province, including 17 confirmed cases converting from asymptomatic cases, 13 in Sichuan province, 2 in Guangxi Zhuang autonomous region, 1 in Guangdong province and 1 in Yunnan province; 1,383 indigenous cases, 973 in Jilin province including 817 in Changchun, 136 in Jilin city, 11 in Baicheng, 5 in Siping and 4 in Baishan, 311 in Shanghai municipality including 162 in Pudong New Area, 23 in Xuhui district, 23 in Minhang district, 16 in Huangpu district, 14 in Songjiang district, 12 in Jing’an district, 11 in Yangpu district, 11 in Jiading district, 8 in Hongkou district, 7 

In [37]:

def _parse_count(text: str):
    """Return `text` as an int (thousands separators allowed), or None if it is not a plain integer."""
    try:
        return int(text.replace(",", ""))
    except ValueError:
        return None


def get_case_counts(parsed_data: list):
    """
    Identify the localities mentioned in each sentence and their case counts.

    For every pair of consecutive entities in a sentence, if the second one is
    labelled as a place/organisation ("ORG", "GPE" or "LOC") and the first
    token of the preceding entity is number-like, the preceding entity's text
    is recorded as a count for that place.

    Parameters
    ----------
    parsed_data : list
        Sentences (e.g. ``list(doc.sents)``); each item must expose ``.ents``,
        a sequence of entity spans with ``.text``, ``.label_`` and token
        indexing.

    Returns
    -------
    list
        One ``defaultdict(list)`` per input sentence, mapping the place text
        to the list of integer counts found for it, in order of appearance.

    Notes
    -----
    A number-like entity whose text is not a plain integer (for example
    "five", "3.5" or "1/2") is skipped instead of raising ``ValueError``.
    """
    out = []

    for sent in parsed_data:
        counts_by_place = defaultdict(list)
        ents = list(sent.ents)

        # Walk consecutive (previous entity, current entity) pairs
        for prev_ent, ent in zip(ents, ents[1:]):
            # Only a province / city / organisation name can receive a count
            if ent.label_ not in ("ORG", "GPE", "LOC"):
                continue
            # The entity right before it must start with a number-like token
            if not prev_ent[0].like_num:
                continue
            # like_num also matches words and decimals; keep integers only
            count = _parse_count(prev_ent.text)
            if count is None:
                continue
            # Grow the list when a place has several entries in the sentence
            counts_by_place[ent.text].append(count)

        out.append(counts_by_place)

    return out

# Pull case counts: one mapping of place -> list of counts per sentence
case_counts = get_case_counts(parsed_split)

# Display the mapping extracted from the first sentence
case_counts[0]

defaultdict(list,
            {'Sichuan province': [14, 13, 2],
             'Guangdong province': [5, 1, 2],
             'Shanghai': [4, 311, 40],
             'Guangxi': [4, 2],
             'Tianjin': [1, 3, 2],
             'Fujian province': [1, 7],
             'Shandong province': [1],
             'Chongqing': [1],
             'Yunnan province': [1, 1],
             'Jilin province': [973, 65],
             'Changchun': [817],
             'Jilin city': [136],
             'Baicheng': [11],
             'Siping': [5],
             'Baishan': [4],
             'Pudong': [162],
             'Xuhui': [23],
             'Minhang': [23],
             'Huangpu district': [16],
             'Songjiang': [14],
             'Jing’an district': [12],
             'Yangpu': [11],
             'Jiading': [11],
             'Hongkou': [8],
             'Putuo': [7],
             'Baoshan': [7],
             'Changning district': [6],
             'Qingpu': [6],
             'Fengxian': [2