# Find a better way to clean locations, with less code

In [2]:
import json
import numpy as np
import networkx as nx
import pandas as pd
import re

from pathlib import Path

data_dir = Path('/Users/chena/covid_ui/data').resolve() # Resolve any symlinks --> absolute path
data_dir

PosixPath('/Volumes/GoogleDrive/My Drive/covid_data')

In [3]:
patient_meta_files = sorted((data_dir / 'patient_meta').glob('*.tsv'))
print('Collecting {} patient metadata files...'.format(len(patient_meta_files)), end='', flush=True)
patient_meta_df = pd.DataFrame()
for f in patient_meta_files:
    _df = pd.read_csv(f, sep='\t', skiprows=2)
    patient_meta_df = pd.concat([patient_meta_df, _df], ignore_index=True)

# Save dataframe
patient_meta_df.to_csv(data_dir / 'patient_meta.csv', index=False)
print('done', flush=True)

# Location data is stored in one column, "region / country / division / location"
location_df = (
    patient_meta_df['Location'].str.split('/', expand=True)
    .iloc[:, :4] # Only take 4 columns
    # Rename columns
    .rename(columns={0: 'region', 1: 'country', 2: 'division', 3: 'location'})
    .applymap(lambda x: x.strip() if x else x)
    # Placeholder for missing values, so that it will still 
    # be caught by groupby() later on
    .fillna(-1)
)
# Re-add metadata columns
location_df['name'] = patient_meta_df['Virus name']
location_df['gisaid_id'] = patient_meta_df['Accession ID']
location_df['sample_date'] = patient_meta_df['Collection date']

# Convert sample_date to datetime
location_df['sample_date'] = pd.to_datetime(location_df['sample_date'], yearfirst=True)

Collecting 14 patient metadata files...done


In [4]:
location_df

Unnamed: 0,region,country,division,location,name,gisaid_id,sample_date
0,Asia,China,Hubei,Wuhan,hCoV-19/Wuhan/IVDC-HB-01/2019,EPI_ISL_402119,2019-12-30
1,Asia,China,Hubei,Wuhan,hCoV-19/Wuhan/IVDC-HB-04/2020,EPI_ISL_402120,2020-01-01
2,Asia,China,Hubei,Wuhan,hCoV-19/Wuhan/IVDC-HB-05/2019,EPI_ISL_402121,2019-12-30
3,Asia,China,Hubei,Wuhan,hCoV-19/Wuhan/IPBCAMS-WH-01/2019,EPI_ISL_402123,2019-12-24
4,Asia,China,Hubei,Wuhan,hCoV-19/Wuhan/WIV04/2019,EPI_ISL_402124,2019-12-30
...,...,...,...,...,...,...,...
37392,Europe,United Kingdom,England,-1,hCoV-19/England/NORW-EB900/2020,EPI_ISL_457455,2020-05-08
37393,Europe,United Kingdom,England,-1,hCoV-19/England/NORW-EB91F/2020,EPI_ISL_457456,2020-05-08
37394,Europe,United Kingdom,England,-1,hCoV-19/England/NORW-EB93D/2020,EPI_ISL_457457,2020-05-08
37395,Europe,United Kingdom,England,-1,hCoV-19/England/NORW-EB94C/2020,EPI_ISL_457458,2020-05-08


In [8]:
location_df.loc[
    (location_df['country'] == 'South Africa') &
    (location_df['division'] == 'EC'),
    :
]

Unnamed: 0,region,country,division,location,name,gisaid_id,sample_date
15691,Africa,South Africa,EC,-1,hCoV-19/South Africa/R05475/2020,EPI_ISL_435059,2020-03-20


In [22]:
# Make rules, which are mappings of input region/country/division/location -> output region/country/division/location

rules = [
    (
        {'country': 'South Africa', 'division': 'EC'}, # Input
        {'division': 'Eastern Cape'}  # Output
    )
]

In [11]:
from functools import reduce

In [15]:
reduce(lambda x, y: (x**2) + (y**2), [1, 2, 3, 4, 5])

1373609

In [18]:
reduce(lambda x, y: (x**2) + (y**2), [3,])

3

In [20]:
for rule in rules:
    # print(rule)
    input_rule = rule[0]
    output_rule = rule[1]
    
    # Get matching entries for the input rule
    # by creating a logical mask
    # Start out with matching everything
    loc_mask = pd.Series(np.repeat(True, len(location_df)))
    for key in input_rule.keys():
        vals = input_rule[key]
        # Make it a list if it's just a single value
        if type(vals) is not list:
            vals = [vals]
            
        # Turn each value into a logical mask
        vals = [location_df[key] == v for v in vals]
        # Combine logical masks with logical ORs, and merge into the master mask with AND
        loc_mask = (loc_mask & reduce(lambda x, y: (x | y), vals)) 
        
        # Set the output rules on the matching entries from loc_mask
        for okey in output_rule.keys():
            location_df.loc[loc_mask, okey] = output_rule[okey]
    
    print(location_df.loc[loc_mask, :])
    
    

Empty DataFrame
Columns: [region, country, division, location, name, gisaid_id, sample_date]
Index: []


In [23]:
location_df.loc[
    (location_df['country'] == 'South Africa') &
    (location_df['division'] == 'EC'),
    :
]

Unnamed: 0,region,country,division,location,name,gisaid_id,sample_date


In [24]:
location_df.loc[
    (location_df['country'] == 'South Africa') &
    (location_df['division'] == 'Eastern Cape'),
    :
]

Unnamed: 0,region,country,division,location,name,gisaid_id,sample_date
534,Africa,South Africa,Eastern Cape,-1,hCoV-19/South Africa/R03006/2020,EPI_ISL_417186,2020-03-07
7909,Africa,South Africa,Eastern Cape,-1,hCoV-19/South Africa/KRISP-02/2020,EPI_ISL_421572,2020-03-23
7910,Africa,South Africa,Eastern Cape,-1,hCoV-19/South Africa/KRISP-06/2020,EPI_ISL_421573,2020-03-31
7911,Africa,South Africa,Eastern Cape,-1,hCoV-19/South Africa/KRISP-07/2020,EPI_ISL_421574,2020-04-01
7912,Africa,South Africa,Eastern Cape,-1,hCoV-19/South Africa/KRISP-011/2020,EPI_ISL_421575,2020-04-01
7913,Africa,South Africa,Eastern Cape,-1,hCoV-19/South Africa/KRISP-012/2020,EPI_ISL_421576,2020-04-01
11279,Africa,South Africa,Eastern Cape,-1,hCoV-19/South Africa/R02827/2020,EPI_ISL_430297,2020-03-06
15690,Africa,South Africa,Eastern Cape,-1,hCoV-19/South Africa/R02606/2020,EPI_ISL_435058,2020-03-11
15691,Africa,South Africa,Eastern Cape,-1,hCoV-19/South Africa/R05475/2020,EPI_ISL_435059,2020-03-20
16329,Africa,South Africa,Eastern Cape,-1,hCoV-19/South Africa/KRISP-04/2020,EPI_ISL_436684,2020-03-31
