# All of Us PPI Codebook to FHIR
This notebook:
 * validates PPI definitions from a spreadsheet
 * constructs a CodeSystem
 * constructs appropriate ValueSets

In [None]:
# Canonical URLs and metadata stamped onto every generated FHIR resource.
PMI_SYSTEM = "http://terminology.pmi-ops.org/CodeSystem/ppi"
# The %s placeholder is filled with a question's code when a ValueSet URL is built.
PMI_VALUESET_BASE = "http://terminology.pmi-ops.org/ValueSet/ppi-%s"
PMI_VERSION = "0.0.1"
PMI_PUBLISHER = "Precision Medicine Initiative: All of Us Research Program"
LAST_CHANGE_DATE = "2017-02-23"

# CSV export URL template for one tab (gid) of one Google spreadsheet (sid).
SHEET_URL = "https://docs.google.com/spreadsheets/d/%(sid)s/export?format=csv&gid=%(gid)s"

# Alternate spreadsheet ID, currently disabled:
#SHEET_ID = "1b1lmf2KywIVx3-WJBnsR21ImF6Bl345n5VMSdUNBwQI"


SHEET_ID = "1TNqJ1ekLFHF4vYA2SNCb-4NL8QgoJrfuJsxnUuXd-is"
# Module name -> gid of the spreadsheet tab holding that module's codes.
# The module name is also used as the CSV file name under ./ppi_sheets.
sheets = {
  "PMI": "1791570240",
  "ConsentPII": "328858697",
  "OverallHealth": "448841041",
  "PersonalHabits": "1838424427",
  "Sociodemographic": "551947635",
}

## Download PPI Modules as individual CSV files

*Note: This will overwrite the contents of "./ppi_sheets"*

In [None]:
!mkdir -p ppi_sheets

for name, gid in sheets.iteritems():
    target = SHEET_URL%{"sid": SHEET_ID, "gid": gid}
    !wget --quiet "$target" -O ppi_sheets/"$name".csv

In [None]:
import csv
from collections import namedtuple

# A (system, code) pair identifying one concept; used as a dictionary key.
Coding = namedtuple('Coding', ['system', 'code'])

class CodebookEntry(object):
    """One codebook row, normalized in place and checked for common problems.

    The row is a dict as produced by ``csv.DictReader``. It must contain the
    'PMI Code' and 'Parent code' keys; 'PMI System' and 'Display' are read
    lazily by the properties below.

    Validation never raises for bad data: every problem found is appended as
    a human-readable string to ``CodebookEntry.issues``.
    """
    # Class-level list shared by every instance: it collects the issues found
    # across all rows. Callers reset it by assigning CodebookEntry.issues = [].
    issues = []

    def __init__(self, term):
        """Normalize ``term`` in place (trim whitespace, blank -> None) and validate it."""
        for key in list(term):
            value = term[key]
            # csv.DictReader yields None for cells missing from a short row and
            # a list (under the None key) for surplus cells, so only trim
            # values that actually are strings.
            if hasattr(value, 'strip'):
                value = value.strip()
            term[key] = value or None
        self._dict = term
        self.make_valid(term)

    def make_valid(self, term):
        """Record issues for ``term`` and repair what can be repaired in place.

        Spaces are removed from 'PMI Code' and 'Parent code'; absent 'Type'
        and 'Topic' keys are filled with 'Unknown' (a missing 'Topic' is
        accepted silently for codes starting with "PMI").
        """
        if not term['PMI Code']:
            self.issues.append("PMI Code is not defined in: %s"%term)
        if "'" in (term['PMI Code'] or ""):
            self.issues.append("Invalid character in code '%s'"%term['PMI Code'])
        if term['Parent code'] and ' ' in term['Parent code']:
            self.issues.append("unexpected space in parent code '%s' of code '%s'"%(term['Parent code'], term['PMI Code']))
            term['Parent code'] = term['Parent code'].replace(" ", "")
        if term['PMI Code'] and ' ' in term['PMI Code']:
            self.issues.append("unexpected space in code  '%s'"%term['PMI Code'])
            term['PMI Code'] = term['PMI Code'].replace(" ", "")
        # NOTE(review): these two checks only fire when the column is absent
        # from the row; a blank cell is stored as None and passes unnoticed.
        if 'Type' not in term:
            self.issues.append("No type is defined for code '%s'"%term['PMI Code'])
            term['Type'] = 'Unknown'
        # The code may be None (already reported above), so guard startswith.
        if 'Topic' not in term and not (term['PMI Code'] or "").startswith("PMI"):
            self.issues.append("No topic is defined for '%s'"%term['PMI Code'])
            term['Topic'] = 'Unknown'

    @property
    def concept_type(self):
        """Value of the 'Type' column ('Unknown' when the column was absent)."""
        return self._dict['Type']

    @property
    def concept_topic(self):
        """Value of the 'Topic' column, or None when the row has no topic.

        "PMI"-prefixed codes are allowed to come from rows without a 'Topic'
        key at all, hence the tolerant lookup.
        """
        return self._dict.get('Topic')

    @property
    def display(self):
        """Value of the 'Display' column."""
        return self._dict['Display']

    @property
    def coding(self):
        """Coding built from the row's 'PMI System' and (normalized) 'PMI Code'."""
        return Coding(self._dict['PMI System'], self._dict["PMI Code"])

    @property
    def parent_coding(self):
        """Coding built from the row's 'PMI System' and 'Parent code' (code may be None)."""
        return Coding(self._dict['PMI System'], self._dict["Parent code"])

In [None]:
# Start from an empty issue list so re-running this cell does not accumulate
# issues from a previous run.
CodebookEntry.issues = []
terms = []
for name in sheets:
    with open("ppi_sheets/%s.csv"%name, "rb") as csvfile:
        reader = csv.DictReader(csvfile)
        terms += [row for row in reader]

# Coding -> CodebookEntry for every row read.
terms_by_coding = {}
# Parent Coding -> list of child CodebookEntry objects.
terms_by_parent = {}

for term in terms:
    assert 'Parent code' in term
    assert 'PMI Code' in term
    entry = CodebookEntry(term)
    # The index is keyed by Coding tuples, so the duplicate check has to use
    # the normalized coding; comparing the raw code string never matches.
    # Rows without a code are already reported as issues and are exempt here.
    assert entry.coding.code is None or entry.coding not in terms_by_coding, \
        "Redefined! %s" % (entry.coding,)
    terms_by_coding[entry.coding] = entry

for term in terms_by_coding.values():
    # parent_coding is a namedtuple and therefore always truthy; in practice
    # only the membership test decides whether this branch runs.
    if term.parent_coding and term.parent_coding not in terms_by_coding:
        # Codes named after a sheet are expected to have no known parent.
        if term.coding.code not in sheets:
            CodebookEntry.issues.append("Parent of '%s' is '%s' but does not exist"%(
                    term._dict['PMI Code'], term._dict['Parent code']))
        # Unknown parent: treat the term as top-level.
        term._dict['Parent code'] = None
        # TODO: Remove this
        if term.coding.code.startswith("PMI"):
            term._dict['Parent code'] = "PMI"
    if term.parent_coding not in terms_by_parent:
        terms_by_parent[term.parent_coding] = []
    terms_by_parent[term.parent_coding].append(term)

# Every Question is expected to have at least one child term (its answers).
for term in terms_by_coding.values():
    if term.concept_type == "Question" and term.coding not in terms_by_parent:
        CodebookEntry.issues.append("Term '%s' has type=Question, but no answers assocaited with it"%(
                term._dict['PMI Code']))

In [None]:
def strip_empty_concepts(concept):
    """Drop the 'concept' key from ``concept`` when it is None or absent.

    The dict is modified in place and also returned, so the call can be used
    inline. A 'concept' value that is not None (including an empty list) is
    left untouched.
    """
    if concept.get('concept') is None:
        # The default makes this safe for dicts that never had the key.
        concept.pop('concept', None)
    return concept

def concepts_with_parent(parent=None):
    """Build the nested concept list for all terms whose parent code is ``parent``.

    Looks up the children of ``Coding(PMI_SYSTEM, parent)`` in
    ``terms_by_parent`` and recurses into each child's own code. Each returned
    dict carries 'code', 'display' and the two 'property' entries; a nested
    'concept' list is present only for terms that have children. With the
    default ``parent=None`` the top-level terms are returned.
    """
    nodes = []
    for entry in terms_by_parent.get(Coding(PMI_SYSTEM, parent), []):
        node = {
            'code': entry.coding.code,
            'display': entry.display,
            'property': [
                {'code': 'concept-type', 'valueCode': entry.concept_type},
                {'code': 'concept-topic', 'valueCode': entry.concept_topic},
            ],
            # An empty child list becomes None so it is stripped just below.
            'concept': concepts_with_parent(entry.coding.code) or None,
        }
        nodes.append(strip_empty_concepts(node))
    return nodes
    
def make_pmi_codesystem():
    """Return the PMI codebook as a FHIR CodeSystem resource (a plain dict).

    Metadata comes from the module-level PMI_* constants, 'count' is the
    number of entries in ``terms_by_coding`` and 'concept' is the hierarchy
    produced by ``concepts_with_parent()``.
    """
    return {
        'resourceType': 'CodeSystem',
        'url': PMI_SYSTEM,
        'version': PMI_VERSION,
        'name': 'pmi-codebook',
        'title': "Codebook for PMI's All of Us Research Program Participant-Provided Information",
        'status': 'draft',
        'date': LAST_CHANGE_DATE,
        'publisher': PMI_PUBLISHER,
        'description': """
# PMI Codebook
This `CodeSystem` defines the concepts used in PPI modules.
TODO: add detail here...
    """.strip(),
        'caseSensitive': True,
        'hierarchyMeaning': 'grouped-by',
        'compositional': False,
        'content': 'complete',
        'count': len(terms_by_coding),
        # The concepts populate both properties with `valueCode`, so the
        # declared property type has to be 'code' for the resource to be
        # internally consistent.
        'property': [{
            'code': 'concept-type',
            'description': 'indicates whether this PPI concept is a Topic, Question, or Answer',
            'type': 'code'
        },{
            'code': 'concept-topic',
            'description': 'indicates the topic for this PPI concept',
            'type': 'code'
        }],
        'concept': concepts_with_parent()
    }

def make_include_for(codebook_terms):
    """Build one ValueSet ``compose.include`` element listing the given terms.

    All terms must share a single code system; this is asserted, so an empty
    list or a mix of systems raises AssertionError. Each term contributes its
    code and display text, in input order.
    """
    distinct_systems = {entry.coding.system for entry in codebook_terms}
    assert len(distinct_systems) == 1

    system = codebook_terms[0].coding.system
    listed = []
    for entry in codebook_terms:
        listed.append({'code': entry.coding.code, 'display': entry.display})
    return {'system': system, 'concept': listed}

def make_pmi_valueset(question_entry):
    """Return a FHIR ValueSet resource (a plain dict) for one question term.

    The value set includes the question's own child terms plus the children
    of the "PMI" code. Both lookups index ``terms_by_parent`` directly, so a
    KeyError is raised if either parent has no children.
    """
    question_code = question_entry.coding.code
    valueset = {
        'resourceType': 'ValueSet',
        'url': PMI_VALUESET_BASE % question_code,
        'version': PMI_VERSION,
        'name': 'values-for-%s' % question_code,
        'title': 'Values for %s' % question_entry.display,
        'status': 'draft',
        'date': LAST_CHANGE_DATE,
        'publisher': PMI_PUBLISHER,
    }

    own_answers = make_include_for(terms_by_parent[question_entry.coding])
    pmi_children = make_include_for(terms_by_parent[Coding(PMI_SYSTEM, "PMI")])
    valueset['compose'] = {'include': [own_answers, pmi_children]}
    return valueset

In [None]:
# Quick summary of what was loaded: number of terms, number of recorded
# issues, and the codings that ended up without a parent (top level).
print("# terms: %s" % len(terms_by_coding))
print("# issues: %s" % len(CodebookEntry.issues))
print("Top-level concepts %s" % "\n  ".join(
    str(entry.coding) for entry in terms_by_parent[Coding(PMI_SYSTEM, None)]))

In [None]:
import json

# Common path prefix of the three files written below.
OUTPUT_FILE = "ppi_sheets/CodeSystem"

# Build the CodeSystem once and reuse it for both the stand-alone file and
# the bundle, instead of walking the whole hierarchy twice.
codesystem = make_pmi_codesystem()

# json.dump emits text, so the files are opened in text mode.
with open(OUTPUT_FILE+".json", "w") as json_file:
    json.dump(codesystem, json_file, indent=2)

with open(OUTPUT_FILE+".issues.json", "w") as json_file:
    json.dump(CodebookEntry.issues, json_file, indent=2)

# One ValueSet per Question that actually has child terms; questions without
# any were already reported as issues when the terms were indexed.
valuesets = [
    make_pmi_valueset(term)
    for term in terms_by_coding.values()
    if term.concept_type == "Question" and term.coding in terms_by_parent
]

# Bundle with the CodeSystem first, followed by every ValueSet.
bundle = {
    'resourceType': "Bundle",
    'entry': [{
            'resource': r
        } for r in [codesystem] + valuesets]
}

with open(OUTPUT_FILE+".bundle.json", "w") as json_file:
    json.dump(bundle, json_file, indent=2)