In [1]:
# JSON 2 FASTA
import re
import json
import gzip
from Bio import SeqIO

in_fp = '/valhalla/al_tmp/provision.json'
out_fp = '/valhalla/gisaid/test.fasta'
regex = re.compile('[^a-zA-Z]')
print(f"Loading JSON...")
data = [json.loads(line) for line in open(in_fp, 'r')]
print(f"Converting to dict...")
seqs_dict = {sample['covv_virus_name'].replace('hCoV-19/', '').replace(' ', ''): 
             regex.sub('', sample['sequence'].replace('\n', '')) for sample in data}
print(f"Converting to FASTA...")
with open(out_fp, 'w') as f:
    f.write(''.join(f'>{idx}\n{seq}\n' for idx, seq in seqs_dict.items()))
print(f"FASTA output generated and saved in {out_fp}")

Loading JSON...
Converting to dict...
Converting to FASTA...
FASTA output generated and saved in /valhalla/gisaid/test.fasta


## Location normalization

In [1]:
import sys
sys.path.append('../')
import gc
import re
import json
import fiona
import pandas as pd
import geopandas as gpd
import visualize as bv
import data as bd

In [2]:
metacols = ['covv_virus_name', 'covsurver_prot_mutations', 'covv_location',
             'covv_lineage', 'covv_collection_date', 'covv_accession_id',
             'pangolin_lineages_version', 'covv_clade', 'covv_subm_date']
in_fp = '/valhalla/gisaid/2021-02-17.json'
# df = pd.read_json('/valhalla/gisaid/2021-02-17.json', lines=True)
data = [json.loads(line) for line in open(in_fp, 'r')]

In [5]:
len(data)

550092

## Raw sequence parsing

In [6]:
out_fp = '/valhalla/gisaid/raw_sequences_2021-02-17.fasta'

In [11]:
def dict2fasta(seqs: dict, fasta_fp: str, wrap=80):
    with open(fasta_fp, 'w') as f:
        for gid, gseq in seqs.items():
            f.write('>{}\n'.format(gid))
            for i in range(0, len(gseq), wrap):
                f.write('{}\n'.format(gseq[i:i + wrap])) 
    return 0

In [12]:
print(f"Converting to dict...")
regex = re.compile('[^a-zA-Z]')
seqs_dict = {sample['covv_virus_name'].replace('hCoV-19/', '').replace(' ', ''): 
             regex.sub('', sample['sequence'].replace('\n', '')) for sample in data}
print(f"Converting to FASTA...")
dict2fasta(seqs_dict, out_fp)
# with open(out_fp, 'w') as f:
#     f.write(''.join(f'>{idx}\n{seq}\n' for idx, seq in seqs_dict.items()))
print(f"FASTA output generated and saved in {out_fp}")

Converting to dict...
Converting to FASTA...
FASTA output generated and saved in /valhalla/gisaid/sequences_2021-02-17.fasta


In [7]:
del data

In [9]:
gc.collect()

88

## Metadata parsing

In [19]:
COLS = [
'covv_virus_name', 
'covv_location', 
'covv_subm_date',
'covv_clade',
'covv_lineage',
'pangolin_lineages_version',
'covv_accession_id'
        ]

In [20]:
df = pd.DataFrame(data, columns=COLS)

In [21]:
df.columns

Index(['covv_virus_name', 'covv_location', 'covv_subm_date', 'covv_clade',
       'covv_lineage', 'pangolin_lineages_version', 'covv_accession_id'],
      dtype='object')

In [23]:
assert df['covv_accession_id'].shape[0]==df['covv_accession_id'].unique().shape[0], f'ERROR: gisaid accession ids not unique'

In [25]:
df['covv_accession_id'].unique().shape

(550092,)

### Location Normalization

In [26]:
df.rename(columns={
                   'covv_virus_name': 'strain', 
                   'covv_location': 'location', 
                   'covv_subm_date': 'date_submitted',
                   'covv_clade': 'clade',
                   'covv_lineage': 'pango_lineage',
                   'pangolin_lineages_version': 'pango_version',
                   'covv_accession_id': 'accession_id'
                  }, inplace=True)

In [31]:
def fetch_gadm_names(filename, usecols, **kwargs):
    with fiona.open(filename, **kwargs) as source:
        for feature in source:
            f = {k: feature[k] for k in ['id', 'geometry']}
            f['properties'] = {k: feature['properties'][k] for k in usecols}
            yield f

In [34]:
gadm_fp = '/home/al/data/geojsons/gadm36.shp'
gadm = gpd.read_file('/home/al/data/geojsons/gadm36.shp')
gadm_cols = [f'NAME_{i}' for i in range(5)]
gadm = gadm[gadm_cols]
# gpd.GeoDataFrame.from_features(fetch_gadm_names(gadm_fp), gadm_cols)

In [35]:
gadm.columns

Index(['NAME_0', 'NAME_1', 'NAME_2', 'NAME_3', 'NAME_4'], dtype='object')

In [37]:
res = pd.DataFrame(df['location'].str.split('/').tolist(), 
             columns=['region',
                    'country', 
                    'division', 
                    'location', 
                    'city', 
                    'town'])
df['country'] = res['country'].str.strip()
df['division'] = res['division'].str.strip()
df['location'] = res['location'].str.strip()

In [38]:
df.loc[~df['location'].isna()][['country', 'division', 'location']]

Unnamed: 0,country,division,location
12,Australia,New South Wales,Sydney
58,Netherlands,Noord Holland,Diemen
128,Australia,New South Wales,Sydney
131,Australia,New South Wales,Sydney
134,Australia,New South Wales,Sydney
...,...,...,...
550087,Poland,Wielkopolskie,Rogierowko
550088,Poland,Wielkopolskie,Skorzewo
550089,Poland,Wielkopolskie,Poznan
550090,Poland,Wielkopolskie,Poznan


In [65]:
# res['country'].value_counts().iloc[:40]

### Admin0 Normalization

In [39]:
df['country_normed'] = df['country'].copy()
df['country_normed'].fillna('None', inplace=True)
df.loc[df['country_normed']=='USA', 'country_normed'] = 'United States'
df.loc[df['country_normed'].str.contains('Congo'), 'country_normed'] = 'Republic of Congo'
df.loc[df['country_normed'].str.contains('Cote dIvoire'), 'country_normed'] = "Côte d'Ivoire"
df.loc[df['country_normed'].str.contains('North Macedonia'), 'country_normed'] = "Macedonia"
df.loc[df['country_normed'].str.contains('Curacao'), 'country_normed'] = "Curaçao"
df.loc[df['country_normed'].str.contains('Saint Martin'), 'country_normed'] = "Saint-Martin"
df.loc[df['country_normed'].str.contains('Trinidad'), 'country_normed'] = 'Trinidad and Tobago'
df.loc[df['country_normed'].str.contains('Czech republic'), 'country_normed'] = 'Czech Republic'
df.loc[df['country_normed'].str.contains('St Eustatius'), 'country_normed'] = 'Netherlands'
df.loc[df['country_normed'].str.contains('Saint Barthelemy'), 'country_normed'] = 'Saint-Barthélemy'
df.loc[df['country_normed'].str.contains('Palestine'), 'country_normed'] = "Palestina"
df.loc[df['country_normed'].str.contains("Germany /"), 'country_normed'] = "Germany"
df.loc[df['country_normed'].str.contains("France /Nouvelle-Aquitaine"), 'division'] = "Nouvelle-Aquitaine"
df.loc[df['country_normed']=="France /Nouvelle-Aquitaine", 'country_normed'] = "France"
df.loc[df['country_normed'].str.contains("France /Nouvelle-Aquitaine/ Limoges"), 'division'] = "Nouvelle-Aquitaine"
df.loc[df['country_normed'].str.contains("France /Nouvelle-Aquitaine/ Limoges"), 'location'] = "Limoges"
df.loc[df['country_normed']=="France /Nouvelle-Aquitaine/ Limoges", 'country_normed'] = "France"
df.loc[df['country_normed']=="Kenya /", 'country_normed'] = "Kenya"
df.loc[df['country_normed']=="Switzerland/ Schwyz", 'division'] = "Schwyz"
df.loc[df['country_normed']=="Switzerland/ Schwyz", 'country_normed'] = "Switzerland"
df.loc[df['country_normed']=="USA /Wisconsin", 'division'] = "Wisconsin"
df.loc[df['country_normed']=="USA /Wisconsin", 'country_normed'] = "United States"
df.loc[df['country_normed']=="Jonavos apskritis", 'country_normed'] = "Lithuania"
df.loc[df['country_normed']=="Thailand /Singburi", 'division'] = "Singburi"
df.loc[df['country_normed']=="Thailand /Singburi", 'country_normed'] = "Thailand"
df.loc[df['country_normed']=="Norway /", 'country_normed'] = "Norway"
df.loc[df['country_normed']=="Morocoo", 'country_normed'] = "Morocco"

In [40]:
# print(sorted(gadm_0))

In [43]:
gisaid_0 = set(df['country_normed'].unique())
gadm_0 = set(gadm['NAME_0'].unique())
print(len(gisaid_0))
print(len(gadm_0))
print(f'Number of countries captured in GADM: {len(gisaid_0 & gadm_0)}')
print(f'Countries in GISAID not captured in GADM: {gisaid_0 - gadm_0}')
#TODO: fix Morocoo

153
256
Number of countries captured in GADM: 151
Countries in GISAID not captured in GADM: {'Crimea', 'Caribbean'}


In [44]:
missing_countries = ['Crimea', 'Caribbean']
samples_missing_country = df[df['country'].isin(missing_countries)]
print(f'Number of samples missing country-level geo-information: {samples_missing_country.shape[0]}')

Number of samples missing country-level geo-information: 19


### Admin1 Normalization

In [45]:
df.loc[df['division'].isna(), 'division'] = 'None'
df['division_normed'] = df['division'].copy()

In [46]:
# print(sorted(gadm_1))

In [61]:
# TODO: Spain, rest of EU!
country = 'Brazil'
if country:
    gisaid_1 = set(df[df['country']==country]['division_normed'].unique())
else:
    gisaid_1 = set(df['division_normed'].unique())
gadm_1 = set(gadm[~gadm['NAME_1'].isna()]['NAME_1'].unique())
print(len(gisaid_1))
print(len(gadm_1))
print(len(gisaid_1&gadm_1))
print(sorted(gisaid_1 - gadm_1))

30
3487
28
['None', 'Rondonia']


In [54]:
df['division_normed'] = df['division'].copy()
df.loc[df['division_normed']=='USA', 'division_normed'] = 'United States'
df.loc[df['division_normed'].str.contains('Georgia /'), 'division_normed'] = 'Georgia'
df.loc[df['division_normed'].str.contains('Antwerp'), 'division_normed'] = 'Vlaanderen'
df.loc[df['division_normed'].str.contains('Andalu'), 'division_normed'] = 'Andalucía'
df.loc[df['division_normed'].str.contains('Cairo'), 'division_normed'] = 'Al Qahirah'
df.loc[df['division_normed'].str.contains('Northern territory'), 'division_normed'] = 'Northern Territory'
df.loc[df['division_normed'].str.contains('Fayoum'), 'division_normed'] = 'Al Fayyum'
df.loc[df['division_normed'].str.contains('Musca'), 'division_normed'] = 'Muscat'
df.loc[df['division_normed'].str.contains('Kalyoubia'), 'division_normed'] = 'Al Qalyubiyah'
df.loc[df['division_normed'].str.contains('Buraymi'), 'division_normed'] = 'Al Buraymi'
df.loc[df['division_normed'].str.contains('Buraimi'), 'division_normed'] = 'Al Buraymi'
df.loc[df['division_normed'].str.contains('Dakhiliyah'), 'division_normed'] = 'Ad Dakhliyah'
df.loc[df['division_normed'].str.contains('Dhahirah'), 'division_normed'] = 'Al Dhahira'
df.loc[df['division_normed'].str.contains('North Batinah'), 'division_normed'] = 'Al Batinah North'
df.loc[df['division_normed'].str.contains('South Batinah'), 'division_normed'] = 'Al Batinah South'
df.loc[df['division_normed'].str.contains('North Sharqiyah'), 'division_normed'] = 'Ash Sharqiyah North'
df.loc[df['division_normed'].str.contains('Wuhan'), 'division_normed'] = 'Hubei'
df.loc[df['division_normed'].str.contains('Quebec'), 'division_normed'] = 'Québec'
df.loc[df['division_normed'].str.contains('Toronto'), 'division_normed'] = 'Ontario'
df.loc[df['division_normed'].str.contains('Coahuila de Zaragoza'), 'division_normed'] = 'Coahuila'
df.loc[df['division_normed'].str.contains('Mexico City'), 'division_normed'] = 'México'
df.loc[df['division_normed'].str.contains('Michoacan'), 'division_normed'] = 'Michoacán'
df.loc[df['division_normed'].str.contains('Nuevo Leon'), 'division_normed'] = 'Nuevo León'
df.loc[df['division_normed'].str.contains('Queretaro'), 'division_normed'] = 'Querétaro'
df.loc[df['division_normed'].str.contains('SanLuisPotosi'), 'division_normed'] = 'San Luis Potosí'
df.loc[df['division_normed'].str.contains('San Luis Potosi'), 'division_normed'] = 'San Luis Potosí'
df.loc[df['division_normed'].str.contains('State of Mexico'), 'division_normed'] = 'México'
df.loc[df['division_normed'].str.contains('Yucatan'), 'division_normed'] = 'Yucatán'
df.loc[df['division_normed'].str.contains('Bethlehem'), 'division_normed'] = 'West Bank'
df.loc[df['division_normed'].str.contains('Hebron'), 'division_normed'] = 'West Bank'
df.loc[df['division_normed'].str.contains('Jenin'), 'division_normed'] = 'West Bank'
df.loc[df['division_normed'].str.contains('Jericho'), 'division_normed'] = 'West Bank'
df.loc[df['division_normed'].str.contains('Ramallah'), 'division_normed'] = 'West Bank'
df.loc[df['division_normed'].str.contains('Tulkarem'), 'division_normed'] = 'West Bank'
df.loc[df['division_normed'].str.contains('Nablus'), 'division_normed'] = 'West Bank'
df.loc[df['division_normed'].str.contains('Sharja'), 'division_normed'] = 'Sharjah'
df.loc[df['division_normed'].str.contains('Copenhagen'), 'division_normed'] = 'Hovedstaden'
df.loc[df['division_normed'].str.contains('Sjaelland'), 'division_normed'] = 'Sjælland'
df.loc[df['division_normed'].str.contains('Cape Town'), 'division_normed'] = 'Western Cape'
df.loc[df['division_normed'].str.contains('Western Cape'), 'division_normed'] = 'Western Cape'
df.loc[df['division_normed'].str.contains('Amapa'), 'division_normed'] = 'Amapá'
df.loc[df['division_normed'].str.contains('Ceara'), 'division_normed'] = 'Ceará'
df.loc[df['division_normed'].str.contains('Goias'), 'division_normed'] = 'Goiás'
df.loc[df['division_normed'].str.contains('Maranhao'), 'division_normed'] = 'Maranhão'
df.loc[df['division_normed'].str.contains('Paraiba'), 'division_normed'] = 'Paraíba'
df.loc[df['division_normed'].str.contains('Parana'), 'division_normed'] = 'Paraná'
df.loc[df['division_normed'].str.contains('Piaui'), 'division_normed'] = 'Piauí'
df.loc[df['division_normed'].str.contains('Sao Paulo'), 'division_normed'] = 'São Paulo'
df.loc[df['division_normed'].str.contains('Aragon'), 'division_normed'] = 'Aragón'
df.loc[df['division_normed'].str.contains('Asturias'), 'division_normed'] = 'Principado de Asturias'
df.loc[df['division_normed'].str.contains('Balear Islands'), 'division_normed'] = 'Islas Baleadf'
df.loc[df['division_normed'].str.contains('Balear_Islands'), 'division_normed'] = 'Islas Baleadf'
df.loc[df['division_normed'].str.contains('Illes Balears'), 'division_normed'] = 'Islas Baleadf'
df.loc[df['division_normed'].str.contains('Canary Islands'), 'division_normed'] = 'Canaries'
df.loc[df['division_normed'].str.contains('Canary_Islands'), 'division_normed'] = 'Canaries'
df.loc[df['division_normed'].str.contains('Castilla La Mancha'), 'division_normed'] = 'Castilla-La Mancha'
df.loc[df['division_normed'].str.contains('Castilla la Mancha'), 'division_normed'] = 'Castilla-La Mancha'
df.loc[df['division_normed'].str.contains('Castilla y Leon'), 'division_normed'] = 'Castilla y León'
df.loc[df['division_normed'].str.contains('Ceuta'), 'division_normed'] = 'Ceuta y Melilla'
df.loc[df['division_normed'].str.contains('Melilla'), 'division_normed'] = 'Ceuta y Melilla'
df.loc[df['division_normed'].str.contains('Comunitat Valenciana'), 'division_normed'] = 'Comunidad Valenciana'
df.loc[df['division_normed'].str.contains('Comunitat_Valenciana'), 'division_normed'] = 'Comunidad Valenciana'
df.loc[df['division_normed'].str.contains('La_Rioja'), 'division_normed'] = 'La Rioja'
df.loc[df['division_normed'].str.contains('Madrid'), 'division_normed'] = 'Comunidad de Madrid'
df.loc[df['division_normed'].str.contains('Murcia'), 'division_normed'] = 'Región de Murcia'
df.loc[df['division_normed'].str.contains('Navarra'), 'division_normed'] = 'Comunidad Foral de Navarra'
df.loc[df['division_normed'].str.contains('Catalunya'), 'division_normed'] = 'Cataluña'
df.loc[df['division_normed'].str.contains('Catalonia'), 'division_normed'] = 'Cataluña'
df.loc[df['division_normed'].str.contains('Baden-Wuerttemberg'), 'division_normed'] = 'Baden-Württemberg'
df.loc[df['division_normed'].str.contains('Baden-Wurttemberg'), 'division_normed'] = 'Baden-Württemberg'
df.loc[df['division_normed'].str.contains('Bavaria'), 'division_normed'] = 'Bayern'
df.loc[df['division_normed'].str.contains('Hesse'), 'division_normed'] = 'Hessen'
df.loc[df['division_normed'].str.contains('Lower Saxony'), 'division_normed'] = 'Niedersachsen'
df.loc[df['division_normed'].str.contains('Mecklenburg-Western Pomerania'), 'division_normed'] = 'Mecklenburg-Vorpommern'
df.loc[df['division_normed'].str.contains('Rhineland-Palatinate'), 'division_normed'] = 'Rheinland-Pfalz'
df.loc[df['division_normed'].str.contains('Saxony'), 'division_normed'] = 'Sachsen'
df.loc[df['division_normed'].str.contains('Saxony-Anhalt'), 'division_normed'] = 'Sachsen-Anhalt'
df.loc[df['division_normed'].str.contains('North Rhine-Westphalia'), 'division_normed'] = 'Nordrhein-Westfalen'
df.loc[df['division_normed'].str.contains('Thuringia'), 'division_normed'] = 'Thüringen'

In [62]:
df.loc[df['country'].str.contains('Emirates'), 'country'].unique()

array(['United Arab Emirates'], dtype=object)

In [63]:
# print(sorted(gadm_2))

In [64]:
corrections = bd.COUNTY_CORRECTIONS

In [65]:
# res['location'] = res['location'].str.replace(',', '').str[:-2]
df.loc[df['location'].isna(), 'location'] = 'None'
df['location_normed'] = df['location'].copy()

In [66]:
for key, val in corrections.items():
    df.loc[:, 'location_normed'] = df['location_normed'].str.replace(key, val)
df.loc[:, 'location_normed'] = df['location_normed'].str.replace('County', '').str.replace('county', '').str.replace(',', '')
df.loc[:, 'location_normed'] = df['location_normed'].str.strip().apply(bv.check_state, args=(False,)).str.strip()
df.loc[df['location_normed'].str.contains('Anchorage-Mat-Su'), 'location_normed'] = 'Anchorage'
df.loc[df['location_normed'].str.contains('Anchorage-Mat Su'), 'location_normed'] = 'Anchorage'
df.loc[df['location_normed'].str.contains('BRA'), 'location_normed'] = 'Brazos'
df.loc[df['location_normed'].str.contains('BR'), 'location_normed'] = 'Brewster'
df.loc[df['location_normed'].str.contains('Belgrade'), 'location_normed'] = 'Gallatin'
df.loc[df['location_normed'].str.contains('Bozeman'), 'location_normed'] = 'Gallatin'
df.loc[df['location_normed'].str.contains('Big Sky'), 'location_normed'] = 'Gallatin'
df.loc[df['location_normed'].str.contains('Belton'), 'location_normed'] = 'Bell'
df.loc[df['location_normed'].str.contains('Brentwood'), 'location_normed'] = 'Contra Costa'
df.loc[df['location_normed'].str.contains('Chicago'), 'location_normed'] = 'Cook'
df.loc[df['location_normed'].str.contains('Colombus'), 'location_normed'] = 'Franklin'
df.loc[df['location_normed'].str.contains('DuBois'), 'location_normed'] = 'Fremont'
df.loc[df['location_normed'].str.contains('DuPage'), 'location_normed'] = 'Dupage'
df.loc[df['location_normed'].str.contains('Eau claire'), 'location_normed'] = 'Eau Claire'
df.loc[df['location_normed'].str.contains('Ennis'), 'location_normed'] = 'Ellis'
df.loc[df['location_normed'].str.contains('Fond Du Lac'), 'location_normed'] = 'Fond du Lac'
df.loc[df['location_normed'].str.contains('Fond du lac'), 'location_normed'] = 'Fond du Lac'
df.loc[df['location_normed'].str.contains('Fonddu Lac'), 'location_normed'] = 'Fond du Lac'
df.loc[df['location_normed'].str.contains('Frisco'), 'location_normed'] = 'Collin'
df.loc[df['location_normed'].str.contains('Hawai'), 'location_normed'] = 'Hawaii'
df.loc[df['location_normed'].str.contains('Holland'), 'location_normed'] = 'Ottawa'
df.loc[df['location_normed'].str.contains('Honolul'), 'location_normed'] = 'Honolulu'
df.loc[df['location_normed'].str.contains('Indianapolis'), 'location_normed'] = 'Marion'
df.loc[df['location_normed'].str.contains('Interior'), 'location_normed'] = 'Fairbanks North Star'
df.loc[df['location_normed'].str.contains('Ithaca'), 'location_normed'] = 'Tompkins'
df.loc[df['location_normed'].str.contains('Kaua'), 'location_normed'] = 'Kauai'
df.loc[df['location_normed'].str.contains('Las Vegas'), 'location_normed'] = 'Clark'
df.loc[df['location_normed'].str.contains('Mau'), 'location_normed'] = 'Hawaii'
df.loc[df['location_normed'].str.contains('Mcculloch'), 'location_normed'] = 'McCulloch'
df.loc[df['location_normed'].str.contains('Mchenry'), 'location_normed'] = 'McHenry'
df.loc[df['location_normed'].str.contains('Mclennan'), 'location_normed'] = 'McLennan'
df.loc[df['location_normed'].str.contains('Moris'), 'location_normed'] = 'Morris'
df.loc[df['location_normed'].str.contains('New York'), 'location_normed'] = 'New York'
df.loc[df['location_normed'].str.contains('New York City'), 'location_normed'] = 'New York'
df.loc[df['location_normed'].str.contains('New Hyde Park'), 'location_normed'] = 'Nassau'
df.loc[df['location_normed'].str.contains('New Orleans'), 'location_normed'] = 'Orleans'
df.loc[df['location_normed'].str.contains('New Rochelle'), 'location_normed'] = 'Westchester'
df.loc[df['location_normed'].str.contains('Northern'), 'location_normed'] = 'Fairbanks North Star'
df.loc[df['location_normed'].str.contains('Omaha'), 'location_normed'] = 'Douglas'
df.loc[df['location_normed'].str.contains('Ostego'), 'location_normed'] = 'Allegan'
df.loc[df['location_normed'].str.contains('Phoenix'), 'location_normed'] = 'Maricopa'
df.loc[df['location_normed'].str.contains('San Bernadino'), 'location_normed'] = 'San Bernardino'
df.loc[df['location_normed'].str.contains('Seattle'), 'location_normed'] = 'King'
df.loc[df['location_normed'].str.contains('St. Bernard'), 'location_normed'] = 'Saint Bernard'
df.loc[df['location_normed'].str.contains('St. Clair'), 'location_normed'] = 'Saint Clair'
df.loc[df['location_normed'].str.contains('St. Lawrence'), 'location_normed'] = 'Saint Lawrence'
df.loc[df['location_normed'].str.contains('St. Louis'), 'location_normed'] = 'Saint Louis'
df.loc[df['location_normed'].str.contains('St. Tammany'), 'location_normed'] = 'Saint Tammany'
df.loc[df['location_normed'].str.contains('Staten Island'), 'location_normed'] = 'Richmond'
df.loc[df['location_normed'].str.contains('Thurson'), 'location_normed'] = 'Thurston'
df.loc[df['location_normed'].str.contains('Tucson'), 'location_normed'] = 'Pima'
df.loc[df['location_normed'].str.contains('West Yellowstone'), 'location_normed'] = 'Gallatin'
df.loc[df['location_normed'].str.contains('Adam'), 'location_normed'] = 'Adams'
df.loc[df['location_normed'].str.contains('Alachu'), 'location_normed'] = 'Alachua'
df.loc[df['location_normed'].str.contains('Du Bois'), 'location_normed'] = 'Dubois'
df.loc[df['location_normed'].str.contains('DeSoto'), 'location_normed'] = 'Desoto'
df.loc[df['location_normed'].str.contains('PdfID'), 'location_normed'] = 'Pdfidio'
df.loc[df['location_normed'].str.contains('LaSalle'), 'location_normed'] = 'La Salle'
df.loc[df['location_normed'].str.contains('CAMER'), 'location_normed'] = 'Cameron'
df.loc[df['location_normed'].str.contains('CAST'), 'location_normed'] = 'Castro'
df.loc[df['location_normed'].str.contains('CROS'), 'location_normed'] = 'Crosby'
df.loc[df['location_normed'].str.contains('ECT'), 'location_normed'] = 'Ector'
df.loc[df['location_normed'].str.contains('GALVEST'), 'location_normed'] = 'Galveston'
df.loc[df['location_normed'].str.contains('JEFFERS'), 'location_normed'] = 'Jefferson'
df.loc[df['location_normed'].str.contains('KAUFM'), 'location_normed'] = 'Kaufman'
df.loc[df['location_normed'].str.contains('KLEBE'), 'location_normed'] = 'Kleberg'
df.loc[df['location_normed'].str.contains('LAVA'), 'location_normed'] = 'Lavaca'
df.loc[df['location_normed'].str.contains('MCLENN'), 'location_normed'] = 'Mclennan'
df.loc[df['location_normed'].str.contains('St.Clair'), 'location_normed'] = 'Saint Clair'
df.loc[df['location_normed'].str.contains('TARRA'), 'location_normed'] = 'Tarrant'
df.loc[df['location_normed'].str.contains('WALL'), 'location_normed'] = 'Waller'
df.loc[df['location_normed'].str.contains('WICHI'), 'location_normed'] = 'Wichita'

In [68]:
# TODO: Mexico, China, Jordan, Canada
country = 'USA'
if country:
    gisaid_2 = set(df[df['country']==country]['location_normed'].unique())
else:
    gisaid_2 = set(df['location_normed'].unique())
gadm_2 = set(gadm[(~gadm['NAME_2'].isna())&(gadm['NAME_0']=='United States')]['NAME_2'].unique())
print(len(gisaid_2))
print(len(gadm_2))
print(len(gisaid_2&gadm_2))
print(sorted(gisaid_2 - gadm_2))

506
1840
484
['', 'Ambulance', 'Gulf Coast', 'HA', 'M', 'Mclennan', 'Nan', 'Napa Solano Yolo Marin Counties', 'None', 'Out Of State', 'Out of state', 'Out-Of-State', 'Out-of-state', 'PRESID', 'Ponce', 'South West', 'Southeast', 'Southwest', 'UNKNO', 'Unknown', 'Western Alaska', 'unknown']


In [69]:
# df.loc[df['location_normed'].str.contains('WICHI')]

In [71]:
locs_missing = ['', 'GulfCoast', 'Nan', 'Napa,Solano,Yolo,MarinCounties', 
                'Out-Of-State', 'Out-of-state', 'OutOfState', 
                'Outofstate', 'Ponce', 'SouthWest', 'Southeast', 
                'Southwest', 'Unknown', 'WesternAlaska', 'unknown']
samples_missing_county = df.loc[(df['location_normed'].isin(locs_missing))&(df['country']=='USA')]
print(f'Number of samples missing country-level geo-information: {samples_missing_county.shape[0]}')

Number of samples missing country-level geo-information: 183


In [72]:
df['location'].value_counts()

None                                                                                                                                                                                                                                                                                                                                                                                                                       462238
Houston                                                                                                                                                                                                                                                                                                                                                                                                                     12615
Santa Clara County                                                                                                                                                  

In [73]:
df['division'].value_counts()

England                  190952
None                      37951
Wales                     26387
Hovedstaden               17171
Texas                     15897
                          ...  
Callenelle                    1
Montignies-sur-sambre         1
Ghlin                         1
Rishpon                       1
Hertain                       1
Name: division, Length: 2359, dtype: int64

In [74]:
df['country'].value_counts()

United Kingdom                      235806
USA                                 106640
Denmark                              41985
Australia                            17368
Japan                                17253
                                     ...  
Andorra                                  1
Antigua and Barbuda                      1
Albania                                  1
Saint Vincent and the Grenadines         1
Trinidad and Tobago                      1
Name: country, Length: 158, dtype: int64

In [76]:
meta_fp = '/valhalla/gisaid/meta_2021-02-17.tsv.gz'
df.to_csv(meta_fp, sep='\t', index=False, compression='gzip')

In [77]:
df.columns

Index(['strain', 'location', 'date_submitted', 'clade', 'pango_lineage',
       'pango_version', 'accession_id', 'country', 'division',
       'country_normed', 'division_normed', 'location_normed'],
      dtype='object')