In [None]:
import pandas as pd
import xml.etree.ElementTree as ET

## DrugBank drug status

In [None]:
# Load the DrugBank XML dump and flatten it into one record per drug.
ns = '{http://www.drugbank.ca}'
tree = ET.parse('../data/external/drugbank_517.xml')

# ElementPath expressions, built once instead of once per drug.
id_path = ns + 'drugbank-id[@primary="true"]'
name_path = ns + 'name'
group_path = f'{ns}groups/{ns}group'
inchikey_path = (f'{ns}calculated-properties/{ns}'
                 f'property[{ns}kind="InChIKey"]/{ns}value')

# Iterating the root element visits each of its direct children in turn.
rows = []
for drug in tree.getroot():
    # All group labels of the drug collapsed into one '|'-separated string.
    group_labels = [group.text for group in drug.findall(group_path)]
    rows.append((drug.findtext(id_path),
                 drug.findtext(name_path),
                 '|'.join(group_labels),
                 drug.findtext(inchikey_path)))

# findtext() returns None when no calculated InChIKey is present; those
# drugs are dropped here.
drugbank = (pd.DataFrame(rows, columns=['drugbank_id', 'name', 'groups',
                                        'inchikey'])
            .dropna(subset=['inchikey']))

# Keep only the part of the InChIKey before the first '-'.
inchikey_parts = drugbank['inchikey'].str.split('-')
drugbank['inchikey_nostereo'] = inchikey_parts.str[0]

## Match observed drugs

In [None]:
# Download the observed-drugs spreadsheet as CSV, reading only its
# 'InChIKey' column.
observed_url = ('https://docs.google.com/spreadsheets/d/'
                '1S5cdfuA8S5bW_0LVjd1gPxCxtwdhV4EQLV85raHWs9s/'
                'export?format=csv')
observed_raw = pd.read_csv(observed_url, usecols=['InChIKey'])

# Normalise the column name and discard rows without an InChIKey.
observed = observed_raw.rename(columns={'InChIKey': 'inchikey'}).dropna()

# Keep only the part of the InChIKey before the first '-'.
observed_parts = observed['inchikey'].str.split('-')
observed['inchikey_nostereo'] = observed_parts.str[0]

In [None]:
# Select the DrugBank records whose truncated InChIKey matches any observed
# drug, ordered by DrugBank ID.
#
# A single vectorized membership test replaces the previous per-key
# filter-and-concat loop, which
#   * rescanned `drugbank` once for every observed row,
#   * raised ValueError ("No objects to concatenate") when `observed` was
#     empty, and
#   * emitted the same DrugBank record several times when more than one
#     observed row shared a truncated key.
# Each matching DrugBank record now appears exactly once.
is_observed = drugbank['inchikey_nostereo'].isin(
    observed['inchikey_nostereo'])
observed_status = (drugbank[is_observed]
                   .sort_values('drugbank_id')
                   .reset_index(drop=True))

In [None]:
# Write the matched DrugBank records to the processed-data folder; the
# positional index is not written to the file.
status_csv_path = '../data/processed/drugs_status.csv'
observed_status.to_csv(status_csv_path, index=False)