# Accessing Global.health's Linelist data

Raw data can be downloaded from Global.health by following these instructions:

1. Global.health - Data from all >130 countries covered by Global.health are freely available for download at [Global.health](https://global.health/). Users must first register and agree to the [terms of use](https://global.health/terms-of-use/).


## General info

Authors and acknowledgements
* Aman Ahuja - https://github.com/amanahuja/pandemic_data_notebooks
* This content borrows heavily from Felix Jackson - https://github.com/fojackson8/
* All Global.health team members


In [7]:
import os
import pandas as pd

In [8]:
# Directory holding the downloaded Global.health files.
# Set the PANDEMIC_DATA_DIR environment variable to point at your own copy;
# when it is unset, the original author's local path is used as before.
data_dir = os.environ.get('PANDEMIC_DATA_DIR',
                          '/home/aman/workspace/pandemic_data/datasets1/')

In [9]:
# List the contents of the data directory, one entry per line.
# (IPython shell escape: $data_dir is filled in from the Python variable.)
!ls -1 $data_dir/

citation_data.rtf
data_dictionary_linelist.csv
gh_mexico_linelist.csv
gisaid_metadata.tsv


In [10]:
# Full paths of the two CSV inputs read by this notebook:
# the data dictionary and the line list itself.
datadict_file, linelist_file = (
    os.path.join(data_dir, filename)
    for filename in ('data_dictionary_linelist.csv', 'gh_mexico_linelist.csv')
)


In [11]:
"""
Global.health line list dictionary
----

The Global.health data dictionary describes each column in the line list table.
(In the line list itself, each row is a case record.)
"""

# Load the data dictionary CSV into a python dictionary with help from pandas.
# The first CSV column becomes the index (the line list column name); the file
# is expected to have a single remaining column holding the description, so
# squeezing yields a Series and .to_dict() gives {column name: description}.
# DataFrame.squeeze("columns") replaces read_csv's `squeeze=True` keyword,
# which was deprecated in pandas 1.4 and removed in pandas 2.0.
datadict = (
    pd.read_csv(datadict_file, index_col=0)
    .squeeze("columns")
    .to_dict()
)


# Print the data dictionary for inspection, one truncated line per column.
# str() keeps the slicing safe for non-string cells (an empty description is
# read by pandas as a float NaN, which cannot be sliced).
for k, v in datadict.items():
    print (f"{str(k)[:32]:32} : {str(v)[:40]}")


_id                              : Unique ID of the case
caseReference.additionalSources  : Additional sources that provided data fo
caseReference.sourceEntryId      : The ID of this case that was assigned by
caseReference.sourceId           : Unique ID of the source for this case
caseReference.sourceUrl          : URL of the raw source for this case
caseReference.uploadIds          : An array of UUIDs of uploads where this 
caseReference.verificationStatus : Whether this case was verified by a cura
demographics.ageRange.end        : Maximum age bracket, excluded if differe
demographics.ageRange.start      : Minimum age bracket
demographics.ethnicity           : Ethnicity of the case, Asian, Caucasian,
demographics.gender              : Male, Female, Non-binary/Third gender, O
demographics.nationalities       : Nationalities of the case in English
demographics.occupation          : Primary occupation
events.confirmed.date            : Confirmed case date
events.confirmed.value         

In [15]:
"""
Column selection
----
Choose which line list columns to load and give each a short, convenient name.

columns_map: dict
    Keys are the column names as they appear in the source file,
    values are the names used in the rest of this notebook.
    Any column that is not a key here is left out of the import.
"""

columns_map = dict([
    # case metadata
    ('caseReference.verificationStatus', 'verification_status'),
    # demographics
    ('demographics.ageRange.start', 'age_start'),
    ('demographics.ageRange.end', 'age_end'),
    ('demographics.ethnicity', 'ethnicity'),
    ('demographics.gender', 'sex'),
    # events (JSON string) and sequencing
    ('events', 'events'),
    ('genomeSequences', 'genome_sequences'),
    # location
    ('location.country', 'country'),
    ('location.geometry.latitude', 'lat'),
    ('location.geometry.longitude', 'long'),
    # clinical
    ('preexistingConditions.hasPreexistingConditions', 'has_preexisting_conditions'),
    ('pathogens', 'pathogens'),
    # optional, currently not imported:
    # ('variantOfConcern', 'variant_of_concern'),
])

In [13]:
"""
Line list data
----

Read the line list CSV into a pandas dataframe, loading only the columns
named in columns_map and renaming them to their short names.
"""

linelist = (
    pd.read_csv(
        linelist_file,
        # handy for testing without big loads: add nrows=100 here
        usecols=list(columns_map),
    )
    # switch from the source column names to the short names
    .rename(columns=columns_map)
)

In [14]:
# Summarize the dataframe: number of rows, column names, dtypes and memory usage
linelist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3715249 entries, 0 to 3715248
Data columns (total 12 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   verification_status         object 
 1   age_end                     float64
 2   age_start                   float64
 3   ethnicity                   float64
 4   sex                         object 
 5   events                      object 
 6   genome_sequences            float64
 7   country                     object 
 8   lat                         float64
 9   long                        float64
 10  pathogens                   float64
 11  has_preexisting_conditions  float64
dtypes: float64(8), object(4)
memory usage: 340.1+ MB


In [16]:
# Show the first two records, transposed so each column name is displayed as a row
linelist.head(2).T

Unnamed: 0,0,1
verification_status,UNVERIFIED,UNVERIFIED
age_end,54.0,61.0
age_start,54.0,61.0
ethnicity,,
sex,Female,Female
events,"{""confirmed"":{""value"":"""",""date"":""2021-10-17""}}","{""confirmed"":{""value"":"""",""date"":""2021-10-16""}}"
genome_sequences,,
country,Mexico,Mexico
lat,21.078188,19.19834
long,-86.941117,-99.206221


In [23]:
"""
Print the distinct values found in each object-dtype column
"""

columns = list(linelist.columns)

# the first 100 rows, kept as a small sample
# (not used below in this cell: the checks run on the full linelist)
sample_df = linelist.iloc[:100]


# keep only the columns that hold at least one non-missing value
cols = [name for name in columns if linelist[name].notna().any()]

# of those, the columns stored with dtype object
object_cols = [name for name in cols if linelist[name].dtype == 'O']

for col in object_cols:
    print(f"Unique values in '{col}':\n {linelist[col].unique()}")

Unique values in 'verification_status':
 ['UNVERIFIED' 'VERIFIED']
Unique values in 'sex':
 ['Female' 'Male' nan]
Unique values in 'events':
 ['{"confirmed":{"value":"","date":"2021-10-17"}}'
 '{"confirmed":{"value":"","date":"2021-10-16"}}'
 '{"confirmed":{"value":"","date":"2021-10-17"},"hospitalAdmission":{"value":"Yes","date":"2021-10-17"}}'
 ...
 '{"onsetSymptoms":{"value":"","date":"2020-03-13"},"confirmed":{"value":"","date":"2020-03-30"}}'
 '{"onsetSymptoms":{"value":"","date":"2020-03-28"},"confirmed":{"value":"","date":"2020-03-30"}}'
 '{"onsetSymptoms":{"value":"","date":"2020-03-29"},"confirmed":{"value":"","date":"2020-03-30"}}']
Unique values in 'country':
 ['Mexico']


## Get events data (HIGH MEM USAGE)

Maybe we can use a more computationally efficient approach here? 

In [37]:
"""
Optionally work with a smaller dataset

Reduces linelist to a random sample of at most max_rows rows and prints the
row count before and after.
"""
print (len(linelist))

# Optional filter mask (uncomment to restrict the rows before sampling)
#mask = linelist.verification_status == "VERIFIED"
#mask = mask & (linelist.country == "Mexico")
#print (mask.sum())
#linelist = linelist[mask]

# Truncate
max_rows = 500000
# min() stops sample() from raising ValueError when the dataframe already has
# fewer rows than max_rows; the fixed random_state makes the draw repeatable,
# so re-running the notebook works on the same rows.
linelist = linelist.sample(n=min(max_rows, len(linelist)), random_state=0)

# row count after sampling
print (len(linelist))

500000
500000


In [41]:
import json
import numpy as np

In [42]:
"""
Create new dataframe from the events array
---

WARNING: this can be an expensive operation!

Events array looks like this:
    {"confirmed":{"value":"","date":"2021-10-17"}}

    dict keys are events:
    confirmed, hospitalAdmission, outcome

    dict values are
    'date' and 'value'

This dataframe will make new columns from dates and/or values of the events.
"""


def getdate(x):
    """Return the 'date' of one event dict, or NaN when the event is missing."""
    return np.nan if pd.isnull(x) else x['date']


def getvalue(x):
    """Return the 'value' of one event dict, or NaN when the event is missing."""
    return np.nan if pd.isnull(x) else x['value']


def _parse_events(raw):
    """Decode one `events` cell into a dict.

    A cell that is not a string (e.g. a missing value read as NaN), or whose
    JSON is not an object, is treated as a case with no events ({}).
    """
    if not isinstance(raw, str):
        return {}
    parsed = json.loads(raw)
    return parsed if isinstance(parsed, dict) else {}


# Decode every row once into a plain list of dicts. This replaces
# .apply(json.loads).apply(pd.Series), which builds one Series per row.
parsed_events = [_parse_events(raw) for raw in linelist.events]

# The only columns we need from the events
keepcols = [
    'confirmed_date',
    'admission_date',
    'outcome_value']

# dict.get() returns None for an event type a case does not have, which
# getdate/getvalue turn into NaN. Unlike attribute access on an expanded
# dataframe, this also works when no row at all contains that event type.
events = pd.DataFrame(
    {
        'confirmed_date': [getdate(ev.get('confirmed')) for ev in parsed_events],
        'admission_date': [getdate(ev.get('hospitalAdmission')) for ev in parsed_events],
        'outcome_value': [getvalue(ev.get('outcome')) for ev in parsed_events],
    },
    index=linelist.index,  # same row labels as linelist
)[keepcols]

# the decoded dicts are no longer needed; release them
del parsed_events

In [43]:
# Display the events dataframe (the rendered output abbreviates the middle rows)
events

Unnamed: 0,confirmed_date,admission_date,outcome_value
3062666,2020-08-22,,
2064811,2021-01-13,,
1768911,2021-02-03,,
327977,2021-09-02,,
678381,2021-08-11,2021-08-11,
...,...,...,...
1278103,2021-05-28,,
805357,2021-08-04,,
1988439,2021-01-18,,
87628,2021-09-29,,


In [44]:
"""
Join the events dataframe with the linelist data, so we have one dataframe with new events columns.

Both dataframes carry the same index, so a column-wise concat lines the rows up.
"""

# Drop any event columns left over from an earlier run of this cell first:
# otherwise re-running it would add a second set of identically named columns.
# errors='ignore' makes the drop a no-op on the first run.
linelist = pd.concat(
    [linelist.drop(columns=events.columns, errors='ignore'), events],
    axis=1,
)

In [45]:
"""
Post-processing: adjust column types (and other options if needed)
"""

# current column names of the line list
columns = list(linelist.columns)

## Convert types

# every column whose name contains 'date' gets parsed into pandas datetimes
datetimecols = [name for name in columns if 'date' in name]
print(f"Converting datetime columns: {datetimecols}")

for col in datetimecols:
    linelist[col] = pd.to_datetime(linelist[col])


Converting datetime columns: ['confirmed_date', 'admission_date']


## Inspect dataframe

In [46]:
# Show the first three records, transposed so each column name is displayed as a row
linelist.head(3).T

Unnamed: 0,3062666,2064811,1768911
verification_status,UNVERIFIED,UNVERIFIED,UNVERIFIED
age_end,65.0,39.0,66.0
age_start,65.0,39.0,66.0
ethnicity,,,
sex,Male,Male,Female
events,"{""confirmed"":{""value"":"""",""date"":""2020-08-22""}}","{""confirmed"":{""value"":"""",""date"":""2021-01-13""}}","{""confirmed"":{""value"":"""",""date"":""2021-02-03""}}"
genome_sequences,,,
country,Mexico,Mexico,Mexico
lat,23.468523,19.977498,20.591481
long,-106.305886,-102.753409,-103.347855


## Playing Around