# Prepare

Prepares the raw International Classification of Diseases (ICD) data into tabular `.csv` files.

Optionally, uploads the prepared data into a database.

## Libraries

In [1]:
import numpy as np
import os
import pandas as pd
import subprocess
import yaml

from edotenv import load_edotenv
from pandas.api.types import is_numeric_dtype
from pathlib import Path
from sqlalchemy import create_engine, text, inspect

## Config

Load settings from the `config.yml` file.

In [2]:
# Parse the notebook settings from config.yml
with Path('config.yml').open('r') as stream:
    config = yaml.safe_load(stream)

# Top-level `Version` entry, used later to name the saved .sql files
version = config['Version']

## Save Login (Optional)

To connect to the database, first make sure that your login has been saved successfully with `bin/login.bat` or `bin/login.sh`:

In Windows, run:

```
bin\login
```

In Linux/Mac OS, run:

```
source bin/login.sh
```

If these commands run successfully, a `.env` file will be created.

## Database Connection (Optional)

Connect to the database if provided.

In [3]:
# Load the variables from the `.env` file written by the login script, if any
if Path('../.env').is_file():
    load_edotenv('../.env')

# Create the upload engine whenever the database URL is available. The optional
# upload cells below guard on this environment variable alone, so the engine
# must exist in that case even when the variable was not set via `../.env`.
if 'ICD_UPLOAD_DB_URL' in os.environ:
    uengine = create_engine(os.environ['ICD_UPLOAD_DB_URL'])

**Note**: If this fails, try `bin/login.bat` or `bin/login.sh` again.

## Process Data

Process the datasets by:

1. Reading the raw data from a file
2. Removing columns that are not needed
3. Renaming original columns to desired column names
4. Cleaning values by:
    a. Removing leading dashes from values
    b. Converting data types

In [4]:
# pandas reader to use for each supported (lower-cased) file extension
readers = {'.xls': pd.read_excel, '.xlsx': pd.read_excel, '.csv': pd.read_csv}


def _is_whole_float(value):
    """Return True only for float values without a fractional part."""
    return float.is_integer(value) if isinstance(value, float) else False


# Build one processed DataFrame per entry under the `Data` config key
datasets = {}
for data in config['Data']:

    # Dataset name and raw file location come from the config entry
    name = data['Name']
    file = Path(data['File'])
    print(f'Processing {name}...')

    # 1. Read the raw file with the reader matching its extension
    ext = file.suffix.lower()
    if ext not in readers:
        raise ValueError(f'Extension {ext} not supported.')
    df = readers[ext](file)

    # 2-4. Column handling only applies when the config lists columns
    if 'Columns' in data:
        specs = data['Columns']

        # 2. Keep only the configured columns, in config order
        print(f'Extracting columns {name}...')
        df = df[[spec['Name'] for spec in specs]]

        # 3. Apply the optional `Rename` of each column
        print(f'Renaming columns {name}...')
        renames = {spec['Name']: spec.get('Rename', spec['Name']) for spec in specs}
        df = df.rename(columns=renames)

        # 4. Clean the values of each configured column
        for spec in specs:
            cname = spec.get('Rename', spec['Name'])

            # 4a. Strip leading spaces and dashes when requested
            if spec.get('Remove Leading Dashes', False):
                print(f'Removing leading dashes ({cname})...')
                df[cname] = df[cname].str.lstrip(' -')

            # 4b1. Cast to the configured type
            if 'Type' in spec:
                print(f'Converting data type ({cname})...')
                df[cname] = df[cname].astype(spec['Type'])

            # 4b2. Numeric columns holding only nulls and whole-number floats
            # are stored as nullable integers
            if not is_numeric_dtype(df[cname]):
                continue
            values = df[cname]
            null_or_whole = values.isnull() | values.apply(_is_whole_float)
            if all(null_or_whole):
                print(f'Converting to whole numbers ({cname})...')
                df[cname] = values.astype('Int64').round(0)

    # Register the processed dataset under its name
    datasets[name] = df
    print(f'Processed {name}!')

Processing icd10...
Extracting columns icd10...
Renaming columns icd10...
Converting data type (kind)...
Converting data type (kind_depth)...
Converting data type (chapter)...
Converting data type (code)...
Converting data type (title)...
Processed icd10!
Processing icd11...
Extracting columns icd11...
Renaming columns icd11...
Converting data type (kind)...
Converting data type (kind_depth)...
Converting data type (chapter)...
Converting data type (code)...
Removing leading dashes (title)...
Converting data type (title)...
Converting data type (block_id)...
Converting data type (is_residual)...
Converting data type (is_leaf)...
Converting data type (is_primary_tabulation)...
Converting data type (group1)...
Converting data type (group2)...
Converting data type (group3)...
Converting data type (group4)...
Converting data type (group5)...
Converting data type (browser_url)...
Converting data type (foundation_url)...
Converting data type (linear_url)...
Processed icd11!
Processing icd10t

### Save Data

Save datasets as `.csv` files in `data` folder.

In [5]:
# Make sure the output folders for the datasets exist
for folder in ('../data', '../data/archive'):
    Path(folder).mkdir(exist_ok=True)

# Write every dataset twice: a versioned archive copy and the current copy
for data in config['Data']:
    name, ver = data['Name'], data['Version']
    df = datasets[name]
    for target in (f'../data/archive/{name}_v{ver}.csv', f'../data/{name}.csv'):
        df.to_csv(target, index=False)
    print(f'Saved {name}.csv!')

Saved icd10.csv!
Saved icd11.csv!
Saved icd10to11.csv!
Saved icd11to10.csv!


Preview a dataset.

In [6]:
# Display the first processed dataset (dict keys keep insertion order)
preview = list(datasets)[0]
datasets[preview]

Unnamed: 0,kind,kind_depth,chapter,code,title
0,chapter,1,I,I,Certain infectious and parasitic diseases
1,block,1,I,A00-A09,Intestinal infectious diseases
2,category,1,I,A00,Cholera
3,category,2,I,A00.0,"Cholera due to Vibrio cholerae 01, biovar chol..."
4,category,2,I,A00.1,"Cholera due to Vibrio cholerae 01, biovar eltor"
...,...,...,...,...,...
12592,category,2,XXII,U84.3,Resistance to tuberculostatic drug(s)
12593,category,2,XXII,U84.7,Resistance to multiple antimicrobial drugs
12594,category,2,XXII,U84.8,Resistance to other specified antimicrobial drug
12595,category,2,XXII,U84.9,Resistance to unspecified antimicrobial drugs


## Data Descriptions and Dictionaries

1. Create data descriptions from `config.yml` using the `Description` key for each item under the `Data` key.
2. Also create data dictionaries from `config.yml` using the `Columns` key for each item under the `Data` key.

In [7]:
# Create descriptions and ddict of each dataset.
# Rows/frames are collected in local lists first so that `ddescribe` and
# `ddict` are only ever DataFrames and the cell can be re-run safely.
describe_rows = []
ddict_parts = []
for data in config['Data']:

    # Get processed dataset
    name = data['Name']
    df = datasets[name]

    # 1. Add dataset description to ddescribe
    describe_rows.append({
        'dataset': name,
        'columns': len(df.columns),
        'rows': len(df),
        'description': data['Description']
    })

    # 2a. Calc col stats for ddict (one row per dataset column).
    # `datetime_is_numeric` was removed in pandas 2.0, where datetimes are
    # always summarised numerically, so fall back to the plain call there.
    try:
        stats = df.describe(include='all', datetime_is_numeric=True)
    except TypeError:
        stats = df.describe(include='all')
    dd = stats.transpose().reset_index().rename(columns={'index': 'column'})
    dd.insert(0, 'dataset', name)

    # 2b. Add col descriptions to ddict.
    # `type` defaults to the actual dtype of each dataset column; looking it up
    # by column name avoids aligning on the unrelated row index of `dd`.
    dtype_names = {col: str(dtype) for col, dtype in df.dtypes.items()}
    dd.insert(2, 'type', dd['column'].map(dtype_names))
    dd.insert(3, 'character_length', pd.Series(dtype='int'))
    dd.insert(4, 'description', pd.Series(dtype='str'))

    # `Columns` is optional (the processing cell also treats it as optional)
    for c in data.get('Columns', []):
        cname = c.get('Rename', c['Name'])
        is_col = dd['column'] == cname
        dd.loc[is_col, ['type']] = c.get('Type', str(df[cname].dtype))
        dd.loc[is_col, ['description']] = c.get('Description', None)
        dd.loc[is_col, ['character_length']] = c.get('Character Length', None)
    ddict_parts.append(dd)

# Gather descriptions and ddict
ddescribe = pd.DataFrame(describe_rows)
ddict = pd.concat(ddict_parts)

### Save Descriptions and Dictionary

* `icd_data.csv`: dataset descriptions
* `icd_ddict.csv`: data dictionary describing columns for each dataset

In [8]:
# Make sure the output folder exists
out_dir = Path('../data')
out_dir.mkdir(exist_ok=True)

# Dataset descriptions, ordered by dataset name
ddescribe = ddescribe.sort_values(by=['dataset'])
ddescribe.to_csv(out_dir / 'icd_data.csv', index=False)

# Data dictionary (one row per dataset column)
ddict.to_csv(out_dir / 'icd_ddict.csv', index=False)

Preview data descriptions.

In [9]:
# Display the dataset descriptions (sorted by dataset name when saved above)
ddescribe

Unnamed: 0,dataset,columns,rows,description
0,icd10,5,12597,International Classification of Diseases Revis...
2,icd10to11,12,12597,Mappings for International Classification of D...
1,icd11,17,35459,International Classification of Diseases Revis...
3,icd11to10,7,17799,Mappings for International Classification of D...


Preview data dictionary.

In [10]:
# Display the data dictionary: one row per column of each dataset
ddict

Unnamed: 0,dataset,column,type,character_length,description,count,unique,top,freq,mean,std,min,25%,50%,75%,max
0,icd10,kind,str,,"ICD-10 entity kind. One of chapter, block, or ...",12597.0,4.0,category,11243.0,,,,,,,
1,icd10,kind_depth,Int64,,"ICD-10 depth for kind. For example, a category...",12597.0,,,,1.818846,0.393521,1.0,2.0,2.0,2.0,3.0
2,icd10,chapter,str,,ICD-10 chapter for the entity.,12597.0,22.0,XX,1589.0,,,,,,,
3,icd10,code,str,,ICD-10 code for the entity.,12597.0,12597.0,I,1.0,,,,,,,
4,icd10,title,str,,ICD-10 title of the entity.,12597.0,12536.0,Driver injured in collision with other and uns...,7.0,,,,,,,
0,icd11,kind,str,,"ICD-11 entity kind. One of chapter, block, or ...",35459.0,3.0,category,34079.0,,,,,,,
1,icd11,kind_depth,Int64,,"ICD-11 depth for kind. For example, a category...",35459.0,,,,2.050312,1.045853,1.0,1.0,2.0,3.0,8.0
2,icd11,chapter,str,,ICD-11 chapter for the entity.,35459.0,28.0,X,16964.0,,,,,,,
3,icd11,code,str,,ICD-11 code for the entity. Note that the grou...,35459.0,34080.0,,1380.0,,,,,,,
4,icd11,title,str,,ICD-11 title of the entity.,35459.0,35401.0,Acute myeloid leukaemia without maturation,2.0,,,,,,,


## Datasets Upload (Optional)

Upload each dataset to a new versioned table in the PostgreSQL database. Tables that already exist are skipped.

In [11]:
if 'ICD_UPLOAD_DB_URL' in os.environ:

    for data in config['Data']:

        # Get info from config for dataset. The dataset version is kept in
        # `ver` so that the notebook-level `version` read from config.yml
        # (used later to name the saved .sql files) is not overwritten.
        ver = data['Version']
        name = data['Name']
        table = f'{name}_v{ver}'
        schema = data.get('Schema')

        print(f'Uploading {table}...')

        # Skip upload if table exists. The lookup uses the same schema as the
        # upload; otherwise only the default schema would be checked.
        if inspect(uengine).has_table(table, schema=schema):
            print(f'Table {table} exists - skipping!')
            continue

        # Upload to db depending on whether it has geodata
        df = datasets[name]
        if 'Geometry Column' in data:
            # NOTE(review): `to_postgis` is not a plain pandas DataFrame
            # method; this branch presumably expects a GeoDataFrame - confirm
            # how datasets with a geometry column are loaded.
            df.to_postgis(table, uengine, schema=schema, index=False)
        else:
            df.to_sql(table, uengine, schema=schema, index=False)
        print(f'Uploaded {table}!')
            

Uploading icd10_v1...
Uploaded icd10_v1!
Uploading icd11_v1...
Uploaded icd11_v1!
Uploading icd10to11_v1...
Uploaded icd10to11_v1!
Uploading icd11to10_v1...
Uploaded icd11to10_v1!


## Comments Upload (Optional)

Add table and column comments to uploaded dataset tables by:

1. Generating SQL for dataset table comment
2. Generating SQL for dataset column comments
3. Executing generated SQL statements above

In [12]:
if 'ICD_UPLOAD_DB_URL' in os.environ:

    # Single quotes are doubled so descriptions are valid SQL string literals
    squote = "'"
    dbquotes = "''"

    sql = {}
    for data in config['Data']:

        # Get info from config for dataset. The dataset version is kept in
        # `ver` so that the notebook-level `version` read from config.yml
        # (used to name the saved .sql file) is not overwritten.
        ver = data['Version']
        name = data['Name']
        table = f'{name}_v{ver}'

        # Add schema if avail
        if 'Schema' in data:
            schema = data['Schema']
            table = f'{schema}.{table}'

        # Get ddict for dataset
        dd = ddict[ddict['dataset'] == name]

        # 1. Create sql for table comment
        description = ddescribe[ddescribe['dataset'] == name]['description']
        description = description.tolist()[0].replace(squote, dbquotes)
        comment_query = f"COMMENT ON TABLE {table} IS '{description}';"

        # 2. Create sql for column comments (one statement per ddict row)
        ncols = dd.shape[0]
        col_query = [f"COMMENT ON COLUMN {table}.{r['column']} IS '{str(r['description']).replace(squote, dbquotes)}';" for i, r in dd.iterrows()]
        col_query = '\n'.join(col_query)

        # 3a. Add table and col comment statements
        query = f'--- {table} table comment\n' + comment_query \
            + f'\n\n--- {table} column comments (n={ncols})\n' + col_query
        sql[name] = query

    # 3b. Combine and execute comment statements.
    # Both statements go through text(): passing a raw string to execute() is
    # rejected by SQLAlchemy 2.0.
    comment_sql = '\n\n'.join(sql.values())
    with uengine.connect() as connection:
        connection.execute(text(comment_sql))
        connection.execute(text('COMMIT;'))

Save the comments SQL to the file `icd_comments_v<version>.sql` in the `downloads/database` folder.

In [13]:
if 'ICD_UPLOAD_DB_URL' in os.environ:

    # Make sure the folders for database outputs exist
    for folder in ('../downloads', '../downloads/database'):
        Path(folder).mkdir(exist_ok=True)

    # Write the generated table/column comment statements
    comments_path = Path(f'../downloads/database/icd_comments_v{version}.sql')
    comments_path.write_text(comment_sql)

## View Uploads (Optional)

Create views for the uploaded tables with accompanying view/column comments.

In [14]:
if 'ICD_UPLOAD_DB_URL' in os.environ:

    # Single quotes are doubled so descriptions are valid SQL string literals
    squote = "'"
    dbquotes = "''"

    sql = {}
    for data in config['Data']:

        # Get info from config for dataset. The dataset version is kept in
        # `ver` so that the notebook-level `version` read from config.yml
        # (used to name the saved .sql file) is not overwritten.
        ver = data['Version']
        name = data['Name']
        table = f'{name}_v{ver}'

        # Add schema if avail
        if 'Schema' in data:
            schema = data['Schema']
            table = f'{schema}.{table}'

        # Get ddict for dataset
        dd = ddict[ddict['dataset'] == name]

        # 1. Create view query: the view is named after the dataset and
        # selects everything from its versioned table
        view_query = f'CREATE OR REPLACE VIEW {name} AS (SELECT * FROM {table});'

        # 2. Create sql for view comment
        description = ddescribe[ddescribe['dataset'] == name]['description']
        description = description.tolist()[0].replace(squote, dbquotes)
        comment_query = f"COMMENT ON VIEW {name} IS '{description}';"

        # 3. Create sql for column comments (one statement per ddict row)
        ncols = dd.shape[0]
        col_query = [f"COMMENT ON COLUMN {name}.{r['column']} IS '{str(r['description']).replace(squote, dbquotes)}';" for i, r in dd.iterrows()]
        col_query = '\n'.join(col_query)

        # 3a. Add view, comment, and column statements
        sql[name] = f'--- {name} view\n\n' + view_query \
            + f'\n\n--- {name} view comment\n\n' \
            + comment_query \
            + f'\n\n--- {name} view column comments (n={ncols})\n\n' \
            + col_query

    # 3b. Combine and execute view and comment statements.
    # Both statements go through text(): passing a raw string to execute() is
    # rejected by SQLAlchemy 2.0.
    view_sql = '\n\n'.join(sql.values())
    with uengine.connect() as connection:
        connection.execute(text(view_sql))
        connection.execute(text('COMMIT;'))

Save the views SQL to the file `icd_views_v<version>.sql` in the `downloads/database` folder.

In [15]:
if 'ICD_UPLOAD_DB_URL' in os.environ:

    # Write the generated view and comment statements
    views_path = Path(f'../downloads/database/icd_views_v{version}.sql')
    views_path.write_text(view_sql)

## Close Database Connections

In [16]:
# Dispose of the upload engine now that the optional database steps are done;
# guarded by the same environment check used by the upload cells.
if 'ICD_UPLOAD_DB_URL' in os.environ:
    uengine.dispose()