# PDC to CRDCH Transformation Workflow

This notebook demonstrates the conversion of data from one of the CRDC nodes, specifically the PDC, into CRDCH instance data. The notebook reads in the node data as JSON and outputs it in the LinkML format.

In [1]:
import sys

# LinkML is a modeling language with built in generators
# that can automagically generate data in many output formats
# like JSON-LD, Python dataclasses modules, etc.
!{sys.executable} -m pip install linkml

# Dataframes library to visualize node data in a tabular format
!{sys.executable} -m pip install pandas

# Utilities to visualize data in LinkML YAML format
!{sys.executable} -m pip install linkml-runtime

Collecting argparse>=1.4.0
  Using cached argparse-1.4.0-py2.py3-none-any.whl (23 kB)


Installing collected packages: argparse
Successfully installed argparse-1.4.0




## Load and Visualize PDC data

In [2]:
import json
import pandas

with open('head-and-mouth/pdc-head-and-mouth.json') as file:
    pdc_head_and_mouth = json.load(file)
    
pandas.DataFrame(pdc_head_and_mouth)

Unnamed: 0,case_id,case_submitter_id,days_to_lost_to_followup,demographics,diagnoses,disease_type,externalReferences,index_date,lost_to_followup,primary_site,project_submitter_id,samples
0,0232701d-6d00-440c-af6c-5899fbbf4142,OSCC_13,,"[{'cause_of_death': None, 'days_to_birth': Non...","[{'age_at_diagnosis': None, 'ajcc_clinical_m':...",Oral Squamous Cell Carcinoma,[],,,Head and Neck,Oral Squamous Cell Carcinoma - Chang Gung Univ...,[{'aliquots': [{'aliquot_id': '426a2696-f073-4...
1,0e943de7-c277-48f2-8fa9-b2e836b03c2c,OSCC_25,,"[{'cause_of_death': None, 'days_to_birth': Non...","[{'age_at_diagnosis': None, 'ajcc_clinical_m':...",Oral Squamous Cell Carcinoma,[],,,Head and Neck,Oral Squamous Cell Carcinoma - Chang Gung Univ...,[{'aliquots': [{'aliquot_id': '38404eb4-20a6-4...
2,1104505a-9890-49ce-8d7d-7a8070261324,OSCC_23,,"[{'cause_of_death': None, 'days_to_birth': Non...","[{'age_at_diagnosis': None, 'ajcc_clinical_m':...",Oral Squamous Cell Carcinoma,[],,,Head and Neck,Oral Squamous Cell Carcinoma - Chang Gung Univ...,[{'aliquots': [{'aliquot_id': '15218d5b-fc40-4...
3,195cd133-0d53-402d-b31c-3d4fe0481858,OSCC_37,,"[{'cause_of_death': None, 'days_to_birth': Non...","[{'age_at_diagnosis': None, 'ajcc_clinical_m':...",Oral Squamous Cell Carcinoma,[],,,Head and Neck,Oral Squamous Cell Carcinoma - Chang Gung Univ...,[{'aliquots': [{'aliquot_id': '47e8d70c-646d-4...
4,1df726a4-8520-4474-8c00-d238a7384be1,OSCC_06,,"[{'cause_of_death': None, 'days_to_birth': Non...","[{'age_at_diagnosis': None, 'ajcc_clinical_m':...",Oral Squamous Cell Carcinoma,[],,,Head and Neck,Oral Squamous Cell Carcinoma - Chang Gung Univ...,[{'aliquots': [{'aliquot_id': '333e24c9-ec45-4...
...,...,...,...,...,...,...,...,...,...,...,...,...
143,df6bef95-c233-4b10-b321-36ef4e79b5d4,OSCC_40,,"[{'cause_of_death': None, 'days_to_birth': Non...","[{'age_at_diagnosis': None, 'ajcc_clinical_m':...",Oral Squamous Cell Carcinoma,[],,,Head and Neck,Oral Squamous Cell Carcinoma - Chang Gung Univ...,[{'aliquots': [{'aliquot_id': 'a3402806-a9ec-4...
144,e11e9155-4ac6-43dc-b8e5-1be822cd2dab,OSCC_47,,"[{'cause_of_death': None, 'days_to_birth': Non...","[{'age_at_diagnosis': None, 'ajcc_clinical_m':...",Oral Squamous Cell Carcinoma,[],,,Head and Neck,Oral Squamous Cell Carcinoma - Chang Gung Univ...,[{'aliquots': [{'aliquot_id': '9d1789f8-d629-4...
145,ea7c9fbd-8353-4f3c-9fea-2fba79140536,OSCC_56,,"[{'cause_of_death': None, 'days_to_birth': Non...","[{'age_at_diagnosis': None, 'ajcc_clinical_m':...",Oral Squamous Cell Carcinoma,[],,,Head and Neck,Oral Squamous Cell Carcinoma - Chang Gung Univ...,[{'aliquots': [{'aliquot_id': '9c36e4e9-a971-4...
146,f581075d-1b69-4812-9fe4-2bde4aad8bf2,OSCC_38,,"[{'cause_of_death': None, 'days_to_birth': Non...","[{'age_at_diagnosis': None, 'ajcc_clinical_m':...",Oral Squamous Cell Carcinoma,[],,,Head and Neck,Oral Squamous Cell Carcinoma - Chang Gung Univ...,[{'aliquots': [{'aliquot_id': '4059003c-b576-4...


## Transform PDC Case into CRDCH Research Subject

In [3]:
# from ccdh import ccdhmodel as ccdh

In [4]:
import crdch_model.crdch_model as ccdh

In [5]:


def create_stage_observation(type, value):
    """ Create a CCDHCancerStageObservation from a type of observation and a codeable concept."""
    # As with the body site example above, we need to map PDC values into the values
    # allowed under the CRDCH model.
    stage_mappings = {
        'not reported': 'Not Reported',
        'unknown': 'Unknown',
        'stage i': 'Stage I',
        'stage ii': 'Stage II',
        'stage iii': 'Stage III',
        'stage iva': 'Stage IVA',
        'stage ivb': 'Stage IVB',
        'stage ivc': 'Stage IVC',
    }
    
    if value in stage_mappings:
        return ccdh.CancerStageObservation(
            observation_type=type,
            valueCodeableConcept=stage_mappings[value]
        )
    
    return ccdh.CancerStageObservation(
        observation_type=type,
        valueCodeableConcept=value
    )

def create_stage_from_pdc(diagnosis):
    # Create an observation set
    # Difference 1: what will method_type be here?
    obs = ccdh.CancerStageObservationSet()
    
    # Add observations for every type of observation in the PDC diagnosis.
    if diagnosis.get('tumor_stage') is not None:
        obs.observations.append(create_stage_observation('Overall', diagnosis.get('tumor_stage')))
        
    if diagnosis.get('ajcc_clinical_stage') is not None:
        obs.observations.append(create_stage_observation('Clinical Overall', diagnosis.get('ajcc_clinical_stage')))
        
    if diagnosis.get('ajcc_clinical_t') is not None:
        obs.observations.append(create_stage_observation('Clinical Tumor (T)', diagnosis.get('ajcc_clinical_t')))
        
    if diagnosis.get('ajcc_clinical_n') is not None:
        obs.observations.append(create_stage_observation('Clinical Node (N)', diagnosis.get('ajcc_clinical_n')))
        
    if diagnosis.get('ajcc_clinical_m') is not None:
        obs.observations.append(create_stage_observation('Clinical Metastasis (M)', diagnosis.get('ajcc_clinical_m')))
    
    if diagnosis.get('ajcc_pathologic_stage') is not None:
        obs.observations.append(create_stage_observation('Pathological Overall', diagnosis.get('ajcc_pathologic_stage')))
        
    if diagnosis.get('ajcc_pathologic_t') is not None:
        obs.observations.append(create_stage_observation('Pathological Tumor (T)', diagnosis.get('ajcc_pathologic_t')))
        
    if diagnosis.get('ajcc_pathologic_n') is not None:
        obs.observations.append(create_stage_observation('Pathological Node (N)', diagnosis.get('ajcc_pathologic_n')))
        
    if diagnosis.get('ajcc_pathologic_m') is not None:
        obs.observations.append(create_stage_observation('Pathological Metastasis (M)', diagnosis.get('ajcc_pathologic_m')))
    
    return obs

# Test transform with the diagnosis from the first loaded case.
# Note that the resulting CancerStageObservationSet contains descriptions for the concepts included in it.
example_observation_set = create_stage_from_pdc(pdc_head_and_mouth[131]['diagnoses'][0])
example_observation_set

TypeError: crdch_model.crdch_model.CodeableConcept() argument after ** must be a mapping, not str

In [None]:
from linkml_runtime.dumpers import yaml_dumper

print(yaml_dumper.dumps(example_observation_set))

## Transform PDC Diagnosis into CRDCH Diagnosis

In [None]:
def create_body_site(site_name):
    """ Create a CCDH BodySite based on the name of a site in the human body."""
    
    # Accept 'None'.
    if site_name is None:
        return None
    
    # Some body sites are not currently included in the CCDH model. We will need to translate these sites
    # into values that *are* included in the CCDH model.
    site_mappings = {
        'Larynx, NOS': ccdh.EnumCCDHBodySiteSite.Larynx
    }
    
    # Map values if needed. Otherwise, pass them through unmapped.
    if site_name in site_mappings:
        return ccdh.BodySite(site=(site_mappings[site_name]))
    
    return ccdh.BodySite(site=site_name)

def transform_sample_to_specimen(sample):
    """
    A method for transforming a PDC Sample into CCDH Specimen.
    """
    specimen = ccdh.Specimen(id = sample.get('sample_id'))
    specimen.source_material_type = sample.get('sample_type')
    specimen.general_tissue_morphology = sample.get('tissue_type')
    specimen.specific_tissue_morphology = sample.get('tumor_code')
    specimen.tumor_status_at_collection = sample.get('tumor_descriptor')

    return specimen

def transform_diagnosis(diagnosis, case):
    ccdh_diagnosis = ccdh.Diagnosis(
        id=diagnosis.get('diagnosis_id'),
        condition=diagnosis.get('primary_diagnosis'),
        morphology=diagnosis.get('morphology'),
        grade=diagnosis.get('grade'),
        stage=create_stage_from_pdc(diagnosis),
        year_at_diagnosis=diagnosis.get('year_of_diagnosis'),
        related_specimen=[
            transform_sample_to_specimen(
                sample
            ) for sample in case.get('samples')
        ]
    )
    ccdh_diagnosis.identifier = [
        ccdh.Identifier(
            system='PDC-submitter-id',
            value=diagnosis.get('diagnosis_submitter_id')
        )
    ]
    
    if 'primary_site' in case and case['primary_site'] != '':
        body_site = create_body_site(case['primary_site'])
        if body_site is not None:
            ccdh_diagnosis.metastatic_site.append(body_site)

    return ccdh_diagnosis

example_diagnosis = transform_diagnosis(pdc_head_and_mouth[131]['diagnoses'][0], pdc_head_and_mouth[131])
print(yaml_dumper.dumps(example_diagnosis))