In [1]:
# JSON 2 FASTA
import re
import json
import gzip
from Bio import SeqIO

in_fp = '/valhalla/al_tmp/provision.json'
out_fp = '/valhalla/gisaid/test.fasta'
regex = re.compile('[^a-zA-Z]')
print(f"Loading JSON...")
data = [json.loads(line) for line in open(in_fp, 'r')]
print(f"Converting to dict...")
seqs_dict = {sample['covv_virus_name'].replace('hCoV-19/', '').replace(' ', ''): 
             regex.sub('', sample['sequence'].replace('\n', '')) for sample in data}
print(f"Converting to FASTA...")
with open(out_fp, 'w') as f:
    f.write(''.join(f'>{idx}\n{seq}\n' for idx, seq in seqs_dict.items()))
print(f"FASTA output generated and saved in {out_fp}")

Loading JSON...
Converting to dict...
Converting to FASTA...
FASTA output generated and saved in /valhalla/gisaid/test.fasta


## Location normalization

In [2]:
import pandas as pd
import geopandas as gpd
import visualize as bv
import data as bd

In [3]:
metacols = ['covv_virus_name', 'covsurver_prot_mutations', 'covv_location',
             'covv_lineage', 'covv_collection_date', 'covv_accession_id',
             'pangolin_lineages_version', 'covv_clade', 'covv_subm_date']
df = pd.read_json('/valhalla/gisaid/feed_2021-02-14.json', lines=True)

In [10]:
COLS = [
'covv_virus_name', 
'covv_location', 
'covv_subm_date',
'covv_clade',
'covv_lineage',
'pangolin_lineages_version',
'covv_accession_id'
        ]

In [13]:
COLS

['covv_virus_name',
 'covv_location',
 'covv_subm_date',
 'covv_clade',
 'covv_lineage',
 'pangolin_lineages_version',
 'covv_accession_id']

In [14]:
df.columns

Index(['strain', 'covsurver_prot_mutations', 'location', 'pango_lineage',
       'covv_collection_date', 'accession_id', 'sequence', 'pango_version',
       'clade', 'date_submitted'],
      dtype='object')

In [15]:
df = df[COLS].copy()

In [16]:
df.columns

Index(['strain', 'covsurver_prot_mutations', 'location', 'pango_lineage',
       'covv_collection_date', 'accession_id', 'sequence', 'pango_version',
       'clade', 'date_submitted'],
      dtype='object')

In [5]:
df['accession_id']

0         EPI_ISL_426900
1         EPI_ISL_426901
2         EPI_ISL_426902
3         EPI_ISL_426903
4         EPI_ISL_426904
               ...      
527604    EPI_ISL_984623
527605    EPI_ISL_984624
527606    EPI_ISL_984625
527607    EPI_ISL_984626
527608    EPI_ISL_984627
Name: covv_accession_id, Length: 527609, dtype: object

In [17]:
df['accession_id'].unique().shape

(527609,)

In [9]:
df.rename(columns={
                   'covv_virus_name': 'strain', 
                   'covv_location': 'location', 
                   'covv_subm_date': 'date_submitted',
                   'covv_clade': 'clade',
                   'covv_lineage': 'pango_lineage',
                   'pangolin_lineages_version': 'pango_version',
                   'covv_accession_id': 'accession_id'
                  }, inplace=True)

In [18]:
gadm = gpd.read_file('/home/al/data/geojsons/gadm36.shp')

In [19]:
gadm.columns

Index(['UID', 'GID_0', 'ID_0', 'NAME_0', 'GID_1', 'ID_1', 'NAME_1',
       'VARNAME_1', 'NL_NAME_1', 'HASC_1', 'CC_1', 'TYPE_1', 'ENGTYPE_1',
       'VALIDFR_1', 'VALIDTO_1', 'REMARKS_1', 'GID_2', 'ID_2', 'NAME_2',
       'VARNAME_2', 'NL_NAME_2', 'HASC_2', 'CC_2', 'TYPE_2', 'ENGTYPE_2',
       'VALIDFR_2', 'VALIDTO_2', 'REMARKS_2', 'GID_3', 'ID_3', 'NAME_3',
       'VARNAME_3', 'NL_NAME_3', 'HASC_3', 'CC_3', 'TYPE_3', 'ENGTYPE_3',
       'VALIDFR_3', 'VALIDTO_3', 'REMARKS_3', 'GID_4', 'ID_4', 'NAME_4',
       'VARNAME_4', 'CC_4', 'TYPE_4', 'ENGTYPE_4', 'VALIDFR_4', 'VALIDTO_4',
       'REMARKS_4', 'GID_5', 'ID_5', 'NAME_5', 'CC_5', 'TYPE_5', 'ENGTYPE_5',
       'REGION', 'VARREGION', 'zone', 'geometry'],
      dtype='object')

In [20]:
gadmcols = [f'NAME_{i}' for i in range(5)]
gadm_locs = gadm[gadmcols]

In [21]:
res = pd.DataFrame(df['location'].str.split('/').tolist(), 
             columns=['region',
                    'country', 
                    'division', 
                    'location', 
                    'city', 
                    'town'])
res['country'] = res['country'].str.strip()
res['division'] = res['division'].str.strip()
res['location'] = res['location'].str.strip()

In [22]:
res.loc[~res['location'].isna()][['country', 'division', 'location']]

Unnamed: 0,country,division,location
12,Australia,New South Wales,Sydney
58,Netherlands,Noord Holland,Diemen
128,Australia,New South Wales,Sydney
131,Australia,New South Wales,Sydney
134,Australia,New South Wales,Sydney
...,...,...,...
527584,USA,California,San Joaquin County
527585,USA,California,San Joaquin County
527600,Brazil,Amazonas,Manaus
527601,Brazil,Amazonas,Manaus


In [65]:
# res['country'].value_counts().iloc[:40]

In [None]:
res['country_normed'] = res['country'].copy()
res['country_normed'].fillna('None', inplace=True)
res.loc[res['country_normed']=='USA', 'country_normed'] = 'United States'
res.loc[res['country_normed'].str.contains('Congo'), 'country_normed'] = 'Republic of Congo'
res.loc[res['country_normed'].str.contains('Cote dIvoire'), 'country_normed'] = "Côte d'Ivoire"
res.loc[res['country_normed'].str.contains('North Macedonia'), 'country_normed'] = "Macedonia"
res.loc[res['country_normed'].str.contains('Curacao'), 'country_normed'] = "Curaçao"
res.loc[res['country_normed'].str.contains('Saint Martin'), 'country_normed'] = "Saint-Martin"
res.loc[res['country_normed'].str.contains('Trinidad'), 'country_normed'] = 'Trinidad and Tobago'
res.loc[res['country_normed'].str.contains('Czech republic'), 'country_normed'] = 'Czech Republic'
res.loc[res['country_normed'].str.contains('St Eustatius'), 'country_normed'] = 'Netherlands'
res.loc[res['country_normed'].str.contains('Saint Barthelemy'), 'country_normed'] = 'Saint-Barthélemy'
res.loc[res['country_normed'].str.contains('Palestine'), 'country_normed'] = "Palestina"
res.loc[res['country_normed'].str.contains("Germany /"), 'country_normed'] = "Germany"
res.loc[res['country_normed'].str.contains("France /Nouvelle-Aquitaine"), 'division'] = "Nouvelle-Aquitaine"
res.loc[res['country_normed']=="France /Nouvelle-Aquitaine", 'country_normed'] = "France"
res.loc[res['country_normed'].str.contains("France /Nouvelle-Aquitaine/ Limoges"), 'division'] = "Nouvelle-Aquitaine"
res.loc[res['country_normed'].str.contains("France /Nouvelle-Aquitaine/ Limoges"), 'location'] = "Limoges"
res.loc[res['country_normed']=="France /Nouvelle-Aquitaine/ Limoges", 'country_normed'] = "France"
res.loc[res['country_normed']=="Kenya /", 'country_normed'] = "Kenya"
res.loc[res['country_normed']=="Switzerland/ Schwyz", 'division'] = "Schwyz"
res.loc[res['country_normed']=="Switzerland/ Schwyz", 'country_normed'] = "Switzerland"
res.loc[res['country_normed']=="USA /Wisconsin", 'division'] = "Wisconsin"
res.loc[res['country_normed']=="USA /Wisconsin", 'country_normed'] = "United States"
res.loc[res['country_normed']=="Jonavos apskritis", 'country_normed'] = "Lithuania"
res.loc[res['country_normed']=="Thailand /Singburi", 'division'] = "Singburi"
res.loc[res['country_normed']=="Thailand /Singburi", 'country_normed'] = "Thailand"
res.loc[res['country_normed']=="Norway /", 'country_normed'] = "Norway"
res.loc[res['country_normed']=="Morocoo", 'country_normed'] = "Morocco"

In [23]:
# print(sorted(gadm_0))

NameError: name 'gadm_0' is not defined

In [26]:
gisaid_0 = set(res['country_normed'].unique())
gadm_0 = set(gadm_locs['NAME_0'].unique())
print(len(gisaid_0))
print(len(gadm_0))
print(f'Number of countries captured in GADM: {len(gisaid_0 & gadm_0)}')
print(f'Countries in GISAID not captured in GADM: {gisaid_0 - gadm_0}')
#TODO: fix Morocoo

152
256
Number of countries captured in GADM: 150
Countries in GISAID not captured in GADM: {'Caribbean', 'Crimea'}


In [27]:
missing_countries = ['Crimea', 'Caribbean']
samples_missing_country = res[res['country'].isin(missing_countries)]
print(f'Number of samples missing country-level geo-information: {samples_missing_country.shape[0]}')

Number of samples missing country-level geo-information: 19


In [28]:
res.loc[res['division'].isna(), 'division'] = 'None'
res['division_normed'] = res['division'].copy()

In [62]:
# print(sorted(gadm_1))

In [63]:
# TODO: Spain, rest of EU!
country = ''
if country:
    gisaid_1 = set(res[res['country']==country]['division_normed'].unique())
else:
    gisaid_1 = set(res['division_normed'].unique())
gadm_1 = set(gadm_locs[~gadm_locs['NAME_1'].isna()]['NAME_1'].unique())
print(len(gisaid_1))
print(len(gadm_1))
print(len(gisaid_1&gadm_1))
# print(sorted(gisaid_1 - gadm_1))

2287
3487
701


In [52]:
res['division_normed'] = res['division'].copy()
res.loc[res['division_normed']=='USA', 'division_normed'] = 'United States'
res.loc[res['division_normed'].str.contains('Georgia /'), 'division_normed'] = 'Georgia'
res.loc[res['division_normed'].str.contains('Antwerp'), 'division_normed'] = 'Vlaanderen'
res.loc[res['division_normed'].str.contains('Andalu'), 'division_normed'] = 'Andalucía'
res.loc[res['division_normed'].str.contains('Cairo'), 'division_normed'] = 'Al Qahirah'
res.loc[res['division_normed'].str.contains('Northern territory'), 'division_normed'] = 'Northern Territory'
res.loc[res['division_normed'].str.contains('Fayoum'), 'division_normed'] = 'Al Fayyum'
res.loc[res['division_normed'].str.contains('Musca'), 'division_normed'] = 'Muscat'
res.loc[res['division_normed'].str.contains('Kalyoubia'), 'division_normed'] = 'Al Qalyubiyah'
res.loc[res['division_normed'].str.contains('Buraymi'), 'division_normed'] = 'Al Buraymi'
res.loc[res['division_normed'].str.contains('Buraimi'), 'division_normed'] = 'Al Buraymi'
res.loc[res['division_normed'].str.contains('Dakhiliyah'), 'division_normed'] = 'Ad Dakhliyah'
res.loc[res['division_normed'].str.contains('Dhahirah'), 'division_normed'] = 'Al Dhahira'
res.loc[res['division_normed'].str.contains('North Batinah'), 'division_normed'] = 'Al Batinah North'
res.loc[res['division_normed'].str.contains('South Batinah'), 'division_normed'] = 'Al Batinah South'
res.loc[res['division_normed'].str.contains('North Sharqiyah'), 'division_normed'] = 'Ash Sharqiyah North'
res.loc[res['division_normed'].str.contains('Wuhan'), 'division_normed'] = 'Hubei'
res.loc[res['division_normed'].str.contains('Quebec'), 'division_normed'] = 'Québec'
res.loc[res['division_normed'].str.contains('Toronto'), 'division_normed'] = 'Ontario'
res.loc[res['division_normed'].str.contains('Coahuila de Zaragoza'), 'division_normed'] = 'Coahuila'
res.loc[res['division_normed'].str.contains('Mexico City'), 'division_normed'] = 'México'
res.loc[res['division_normed'].str.contains('Michoacan'), 'division_normed'] = 'Michoacán'
res.loc[res['division_normed'].str.contains('Nuevo Leon'), 'division_normed'] = 'Nuevo León'
res.loc[res['division_normed'].str.contains('Queretaro'), 'division_normed'] = 'Querétaro'
res.loc[res['division_normed'].str.contains('SanLuisPotosi'), 'division_normed'] = 'San Luis Potosí'
res.loc[res['division_normed'].str.contains('San Luis Potosi'), 'division_normed'] = 'San Luis Potosí'
res.loc[res['division_normed'].str.contains('State of Mexico'), 'division_normed'] = 'México'
res.loc[res['division_normed'].str.contains('Yucatan'), 'division_normed'] = 'Yucatán'
res.loc[res['division_normed'].str.contains('Bethlehem'), 'division_normed'] = 'West Bank'
res.loc[res['division_normed'].str.contains('Hebron'), 'division_normed'] = 'West Bank'
res.loc[res['division_normed'].str.contains('Jenin'), 'division_normed'] = 'West Bank'
res.loc[res['division_normed'].str.contains('Jericho'), 'division_normed'] = 'West Bank'
res.loc[res['division_normed'].str.contains('Ramallah'), 'division_normed'] = 'West Bank'
res.loc[res['division_normed'].str.contains('Tulkarem'), 'division_normed'] = 'West Bank'
res.loc[res['division_normed'].str.contains('Nablus'), 'division_normed'] = 'West Bank'
res.loc[res['division_normed'].str.contains('Sharja'), 'division_normed'] = 'Sharjah'
res.loc[res['division_normed'].str.contains('Copenhagen'), 'division_normed'] = 'Hovedstaden'
res.loc[res['division_normed'].str.contains('Sjaelland'), 'division_normed'] = 'Sjælland'
res.loc[res['division_normed'].str.contains('Cape Town'), 'division_normed'] = 'Western Cape'
res.loc[res['division_normed'].str.contains('Western Cape'), 'division_normed'] = 'Western Cape'
res.loc[res['division_normed'].str.contains('Amapa'), 'division_normed'] = 'Amapá'
res.loc[res['division_normed'].str.contains('Ceara'), 'division_normed'] = 'Ceará'
res.loc[res['division_normed'].str.contains('Goias'), 'division_normed'] = 'Goiás'
res.loc[res['division_normed'].str.contains('Maranhao'), 'division_normed'] = 'Maranhão'
res.loc[res['division_normed'].str.contains('Paraiba'), 'division_normed'] = 'Paraíba'
res.loc[res['division_normed'].str.contains('Parana'), 'division_normed'] = 'Paraná'
res.loc[res['division_normed'].str.contains('Piaui'), 'division_normed'] = 'Piauí'
res.loc[res['division_normed'].str.contains('Sao Paulo'), 'division_normed'] = 'São Paulo'
res.loc[res['division_normed'].str.contains('Aragon'), 'division_normed'] = 'Aragón'
res.loc[res['division_normed'].str.contains('Asturias'), 'division_normed'] = 'Principado de Asturias'
res.loc[res['division_normed'].str.contains('Balear Islands'), 'division_normed'] = 'Islas Baleares'
res.loc[res['division_normed'].str.contains('Balear_Islands'), 'division_normed'] = 'Islas Baleares'
res.loc[res['division_normed'].str.contains('Illes Balears'), 'division_normed'] = 'Islas Baleares'
res.loc[res['division_normed'].str.contains('Canary Islands'), 'division_normed'] = 'Canaries'
res.loc[res['division_normed'].str.contains('Canary_Islands'), 'division_normed'] = 'Canaries'
res.loc[res['division_normed'].str.contains('Castilla La Mancha'), 'division_normed'] = 'Castilla-La Mancha'
res.loc[res['division_normed'].str.contains('Castilla la Mancha'), 'division_normed'] = 'Castilla-La Mancha'
res.loc[res['division_normed'].str.contains('Castilla y Leon'), 'division_normed'] = 'Castilla y León'
res.loc[res['division_normed'].str.contains('Ceuta'), 'division_normed'] = 'Ceuta y Melilla'
res.loc[res['division_normed'].str.contains('Melilla'), 'division_normed'] = 'Ceuta y Melilla'
res.loc[res['division_normed'].str.contains('Comunitat Valenciana'), 'division_normed'] = 'Comunidad Valenciana'
res.loc[res['division_normed'].str.contains('Comunitat_Valenciana'), 'division_normed'] = 'Comunidad Valenciana'
res.loc[res['division_normed'].str.contains('La_Rioja'), 'division_normed'] = 'La Rioja'
res.loc[res['division_normed'].str.contains('Madrid'), 'division_normed'] = 'Comunidad de Madrid'
res.loc[res['division_normed'].str.contains('Murcia'), 'division_normed'] = 'Región de Murcia'
res.loc[res['division_normed'].str.contains('Navarra'), 'division_normed'] = 'Comunidad Foral de Navarra'
res.loc[res['division_normed'].str.contains('Catalunya'), 'division_normed'] = 'Cataluña'
res.loc[res['division_normed'].str.contains('Catalonia'), 'division_normed'] = 'Cataluña'
res.loc[res['division_normed'].str.contains('Baden-Wuerttemberg'), 'division_normed'] = 'Baden-Württemberg'
res.loc[res['division_normed'].str.contains('Baden-Wurttemberg'), 'division_normed'] = 'Baden-Württemberg'
res.loc[res['division_normed'].str.contains('Bavaria'), 'division_normed'] = 'Bayern'
res.loc[res['division_normed'].str.contains('Hesse'), 'division_normed'] = 'Hessen'
res.loc[res['division_normed'].str.contains('Lower Saxony'), 'division_normed'] = 'Niedersachsen'
res.loc[res['division_normed'].str.contains('Mecklenburg-Western Pomerania'), 'division_normed'] = 'Mecklenburg-Vorpommern'
res.loc[res['division_normed'].str.contains('Rhineland-Palatinate'), 'division_normed'] = 'Rheinland-Pfalz'
res.loc[res['division_normed'].str.contains('Saxony'), 'division_normed'] = 'Sachsen'
res.loc[res['division_normed'].str.contains('Saxony-Anhalt'), 'division_normed'] = 'Sachsen-Anhalt'
res.loc[res['division_normed'].str.contains('North Rhine-Westphalia'), 'division_normed'] = 'Nordrhein-Westfalen'
res.loc[res['division_normed'].str.contains('Thuringia'), 'division_normed'] = 'Thüringen'

In [53]:
res.loc[res['country'].str.contains('Emirates'), 'country'].unique()

array(['United Arab Emirates'], dtype=object)

In [54]:
# print(sorted(gadm_2))

In [55]:
corrections = bd.COUNTY_CORRECTIONS

In [56]:
# res['location'] = res['location'].str.replace(',', '').str[:-2]
res.loc[res['location'].isna(), 'location'] = 'None'
res['location_normed'] = res['location'].copy()

In [57]:
for key, val in corrections.items():
    res.loc[:, 'location_normed'] = res['location_normed'].str.replace(key, val)
res.loc[:, 'location_normed'] = res['location_normed'].str.replace('County', '').str.replace('county', '').str.replace(',', '')
res.loc[:, 'location_normed'] = res['location_normed'].str.strip().apply(bv.check_state, args=(False,)).str.strip()
res.loc[res['location_normed'].str.contains('Anchorage-Mat-Su'), 'location_normed'] = 'Anchorage'
res.loc[res['location_normed'].str.contains('Anchorage-Mat Su'), 'location_normed'] = 'Anchorage'
res.loc[res['location_normed'].str.contains('BRA'), 'location_normed'] = 'Brazos'
res.loc[res['location_normed'].str.contains('BR'), 'location_normed'] = 'Brewster'
res.loc[res['location_normed'].str.contains('Belgrade'), 'location_normed'] = 'Gallatin'
res.loc[res['location_normed'].str.contains('Bozeman'), 'location_normed'] = 'Gallatin'
res.loc[res['location_normed'].str.contains('Big Sky'), 'location_normed'] = 'Gallatin'
res.loc[res['location_normed'].str.contains('Belton'), 'location_normed'] = 'Bell'
res.loc[res['location_normed'].str.contains('Brentwood'), 'location_normed'] = 'Contra Costa'
res.loc[res['location_normed'].str.contains('Chicago'), 'location_normed'] = 'Cook'
res.loc[res['location_normed'].str.contains('Colombus'), 'location_normed'] = 'Franklin'
res.loc[res['location_normed'].str.contains('DuBois'), 'location_normed'] = 'Fremont'
res.loc[res['location_normed'].str.contains('DuPage'), 'location_normed'] = 'Dupage'
res.loc[res['location_normed'].str.contains('Eau claire'), 'location_normed'] = 'Eau Claire'
res.loc[res['location_normed'].str.contains('Ennis'), 'location_normed'] = 'Ellis'
res.loc[res['location_normed'].str.contains('Fond Du Lac'), 'location_normed'] = 'Fond du Lac'
res.loc[res['location_normed'].str.contains('Fond du lac'), 'location_normed'] = 'Fond du Lac'
res.loc[res['location_normed'].str.contains('Fonddu Lac'), 'location_normed'] = 'Fond du Lac'
res.loc[res['location_normed'].str.contains('Frisco'), 'location_normed'] = 'Collin'
res.loc[res['location_normed'].str.contains('Hawai'), 'location_normed'] = 'Hawaii'
res.loc[res['location_normed'].str.contains('Holland'), 'location_normed'] = 'Ottawa'
res.loc[res['location_normed'].str.contains('Honolul'), 'location_normed'] = 'Honolulu'
res.loc[res['location_normed'].str.contains('Indianapolis'), 'location_normed'] = 'Marion'
res.loc[res['location_normed'].str.contains('Interior'), 'location_normed'] = 'Fairbanks North Star'
res.loc[res['location_normed'].str.contains('Ithaca'), 'location_normed'] = 'Tompkins'
res.loc[res['location_normed'].str.contains('Kaua'), 'location_normed'] = 'Kauai'
res.loc[res['location_normed'].str.contains('Las Vegas'), 'location_normed'] = 'Clark'
res.loc[res['location_normed'].str.contains('Mau'), 'location_normed'] = 'Hawaii'
res.loc[res['location_normed'].str.contains('Mcculloch'), 'location_normed'] = 'McCulloch'
res.loc[res['location_normed'].str.contains('Mchenry'), 'location_normed'] = 'McHenry'
res.loc[res['location_normed'].str.contains('Mclennan'), 'location_normed'] = 'McLennan'
res.loc[res['location_normed'].str.contains('Moris'), 'location_normed'] = 'Morris'
res.loc[res['location_normed'].str.contains('New York'), 'location_normed'] = 'New York'
res.loc[res['location_normed'].str.contains('New York City'), 'location_normed'] = 'New York'
res.loc[res['location_normed'].str.contains('New Hyde Park'), 'location_normed'] = 'Nassau'
res.loc[res['location_normed'].str.contains('New Orleans'), 'location_normed'] = 'Orleans'
res.loc[res['location_normed'].str.contains('New Rochelle'), 'location_normed'] = 'Westchester'
res.loc[res['location_normed'].str.contains('Northern'), 'location_normed'] = 'Fairbanks North Star'
res.loc[res['location_normed'].str.contains('Omaha'), 'location_normed'] = 'Douglas'
res.loc[res['location_normed'].str.contains('Ostego'), 'location_normed'] = 'Allegan'
res.loc[res['location_normed'].str.contains('Phoenix'), 'location_normed'] = 'Maricopa'
res.loc[res['location_normed'].str.contains('San Bernadino'), 'location_normed'] = 'San Bernardino'
res.loc[res['location_normed'].str.contains('Seattle'), 'location_normed'] = 'King'
res.loc[res['location_normed'].str.contains('St. Bernard'), 'location_normed'] = 'Saint Bernard'
res.loc[res['location_normed'].str.contains('St. Clair'), 'location_normed'] = 'Saint Clair'
res.loc[res['location_normed'].str.contains('St. Lawrence'), 'location_normed'] = 'Saint Lawrence'
res.loc[res['location_normed'].str.contains('St. Louis'), 'location_normed'] = 'Saint Louis'
res.loc[res['location_normed'].str.contains('St. Tammany'), 'location_normed'] = 'Saint Tammany'
res.loc[res['location_normed'].str.contains('Staten Island'), 'location_normed'] = 'Richmond'
res.loc[res['location_normed'].str.contains('Thurson'), 'location_normed'] = 'Thurston'
res.loc[res['location_normed'].str.contains('Tucson'), 'location_normed'] = 'Pima'
res.loc[res['location_normed'].str.contains('West Yellowstone'), 'location_normed'] = 'Gallatin'
res.loc[res['location_normed'].str.contains('Adam'), 'location_normed'] = 'Adams'
res.loc[res['location_normed'].str.contains('Alachu'), 'location_normed'] = 'Alachua'
res.loc[res['location_normed'].str.contains('Du Bois'), 'location_normed'] = 'Dubois'
res.loc[res['location_normed'].str.contains('DeSoto'), 'location_normed'] = 'Desoto'
res.loc[res['location_normed'].str.contains('PRESID'), 'location_normed'] = 'Presidio'
res.loc[res['location_normed'].str.contains('LaSalle'), 'location_normed'] = 'La Salle'
res.loc[res['location_normed'].str.contains('CAMER'), 'location_normed'] = 'Cameron'
res.loc[res['location_normed'].str.contains('CAST'), 'location_normed'] = 'Castro'
res.loc[res['location_normed'].str.contains('CROS'), 'location_normed'] = 'Crosby'
res.loc[res['location_normed'].str.contains('ECT'), 'location_normed'] = 'Ector'
res.loc[res['location_normed'].str.contains('GALVEST'), 'location_normed'] = 'Galveston'
res.loc[res['location_normed'].str.contains('JEFFERS'), 'location_normed'] = 'Jefferson'
res.loc[res['location_normed'].str.contains('KAUFM'), 'location_normed'] = 'Kaufman'
res.loc[res['location_normed'].str.contains('KLEBE'), 'location_normed'] = 'Kleberg'
res.loc[res['location_normed'].str.contains('LAVA'), 'location_normed'] = 'Lavaca'
res.loc[res['location_normed'].str.contains('MCLENN'), 'location_normed'] = 'Mclennan'
res.loc[res['location_normed'].str.contains('St.Clair'), 'location_normed'] = 'Saint Clair'
res.loc[res['location_normed'].str.contains('TARRA'), 'location_normed'] = 'Tarrant'
res.loc[res['location_normed'].str.contains('WALL'), 'location_normed'] = 'Waller'
res.loc[res['location_normed'].str.contains('WICHI'), 'location_normed'] = 'Wichita'

In [58]:
# TODO: Mexico, China, Jordan, Canada
country = 'USA'
if country:
    gisaid_2 = set(res[res['country']==country]['location_normed'].unique())
else:
    gisaid_2 = set(res['location_normed'].unique())
gadm_2 = set(gadm_locs[(~gadm_locs['NAME_2'].isna())&(gadm_locs['NAME_0']=='United States')]['NAME_2'].unique())
print(len(gisaid_2))
print(len(gadm_2))
print(len(gisaid_2&gadm_2))
print(sorted(gisaid_2 - gadm_2))

504
1840
484
['', 'Gulf Coast', 'HA', 'M', 'Mclennan', 'Nan', 'Napa Solano Yolo Marin Counties', 'None', 'Out Of State', 'Out of state', 'Out-Of-State', 'Out-of-state', 'Ponce', 'South West', 'Southeast', 'Southwest', 'UNKNO', 'Unknown', 'Western Alaska', 'unknown']


In [59]:
res.loc[res['location_normed'].str.contains('WICHI')]

Unnamed: 0,region,country,division,location,city,town,country_normed,division_normed,location_normed


In [61]:
locs_missing = ['', 'GulfCoast', 'Nan', 'Napa,Solano,Yolo,MarinCounties', 
                'Out-Of-State', 'Out-of-state', 'OutOfState', 
                'Outofstate', 'Ponce', 'SouthWest', 'Southeast', 
                'Southwest', 'Unknown', 'WesternAlaska', 'unknown']
samples_missing_county = res.loc[(res['location_normed'].isin(locs_missing))&(res['country']=='USA')]
print(f'Number of samples missing country-level geo-information: {samples_missing_county.shape[0]}')

Number of samples missing country-level geo-information: 183


In [112]:
res['location'].value_counts()

None                  441241
Houston                12615
Santa Clara County      2246
San Diego               2117
Yakima County           2031
                       ...  
Mazet-Saint-Voy            1
Albrechtice                1
Argenteuil                 1
Bermeo                     1
Nogent l'Artaud            1
Name: location, Length: 3159, dtype: int64

In [20]:
res['division'].value_counts()

England              178277
Wales                 25382
Hovedstaden           17171
Texas                 15897
Scotland              15294
                      ...  
Agri                      1
StrassimZillertal         1
Dambovita                 1
Avissawella               1
Ere                       1
Name: division, Length: 2315, dtype: int64

In [19]:
res['country'].value_counts()

UnitedKingdom                   221370
USA                             105568
Denmark                          41985
Australia                        17361
Japan                            17252
                                 ...  
SaintVincentandtheGrenadines         1
Andorra                              1
TrinidadandTobago                    1
Albania                              1
AntiguaandBarbuda                    1
Name: country, Length: 157, dtype: int64