In [1]:
import requests
import pandas as pd
import json
import crdch_model as ccdh
from linkml_runtime.dumpers import yaml_dumper

In [2]:
# Load the GDC export that the rest of the notebook indexes by row number.
# The encoding is pinned because open() otherwise falls back to the platform's
# locale encoding, which is not UTF-8 everywhere.
with open('head-and-mouth/gdc-head-and-mouth.json', encoding='utf-8') as file:
    gdc_head_and_mouth = json.load(file)

In [3]:
# Position of the example record in gdc_head_and_mouth, and position of the
# example diagnosis inside that record's "diagnoses" list.
suggested_row_num = 131
suggested_diag_num = 0
suggested_row = gdc_head_and_mouth[suggested_row_num]
# Uncomment to inspect the whole record:
# suggested_row

In [4]:
# Pick the example diagnosis out of the example record's "diagnoses" list.
suggested_diagnosis_dict = suggested_row["diagnoses"][suggested_diag_num]
# Uncomment to inspect it:
# suggested_diagnosis_dict

In [5]:
# Derived from Gaurav's code.
# Pairs each GDC diagnosis field name with a legacy CRDC-H label; None marks
# fields that have no label assigned here.
# NOTE: despite the `_dict` suffix this is a list of {"GDC": ..., "CRDC-H": ...} records.
# Open questions:
#   - How to DISCOVER these pairs instead of hard-coding them (close_mappings?)
#   - How to convert these legacy CRDC-H strings to codable concepts?
gdc_to_crdch_csoot_dict = [
    {"GDC": "ajcc_clinical_m", "CRDC-H": "Clinical Metastasis (M)"},
    {"GDC": "ajcc_clinical_n", "CRDC-H": "Clinical Node (N)"},
    {"GDC": "ajcc_clinical_stage", "CRDC-H": "Clinical Overall"},
    {"GDC": "ajcc_clinical_t", "CRDC-H": "Clinical Tumor (T)"},
    {"GDC": "tumor_stage", "CRDC-H": "Overall"},
    {"GDC": "ajcc_pathologic_m", "CRDC-H": "Pathological Metastasis (M)"},
    {"GDC": "ajcc_pathologic_n", "CRDC-H": "Pathological Node (N)"},
    {"GDC": "ajcc_pathologic_stage", "CRDC-H": "Pathological Overall"},
    {"GDC": "ajcc_pathologic_t", "CRDC-H": "Pathological Tumor (T)"},
    {"GDC": "age_at_diagnosis", "CRDC-H": None},
    {"GDC": "ajcc_staging_system_edition", "CRDC-H": None},
    {"GDC": "classification_of_tumor", "CRDC-H": None},
    {"GDC": "created_datetime", "CRDC-H": None},
    {"GDC": "days_to_diagnosis", "CRDC-H": None},
    {"GDC": "days_to_last_follow_up", "CRDC-H": None},
    {"GDC": "days_to_last_known_disease_status", "CRDC-H": None},
    {"GDC": "days_to_recurrence", "CRDC-H": None},
    {"GDC": "diagnosis_id", "CRDC-H": None},
    {"GDC": "icd_10_code", "CRDC-H": None},
    {"GDC": "last_known_disease_status", "CRDC-H": None},
    {"GDC": "morphology", "CRDC-H": None},
    {"GDC": "primary_diagnosis", "CRDC-H": None},
    {"GDC": "prior_malignancy", "CRDC-H": None},
    {"GDC": "prior_treatment", "CRDC-H": None},
    {"GDC": "progression_or_recurrence", "CRDC-H": None},
    {"GDC": "site_of_resection_or_biopsy", "CRDC-H": None},
    {"GDC": "state", "CRDC-H": None},
    {"GDC": "submitter_id", "CRDC-H": None},
    {"GDC": "synchronous_malignancy", "CRDC-H": None},
    {"GDC": "tissue_or_organ_of_origin", "CRDC-H": None},
    {"GDC": "tumor_grade", "CRDC-H": None},
    {"GDC": "updated_datetime", "CRDC-H": None},
    {"GDC": "year_of_diagnosis", "CRDC-H": None},
]

In [6]:
# Tabulate the GDC -> CRDC-H pairs, then keep only the rows that carry a CRDC-H label.
gdc_to_crdch_csoot_frame = pd.DataFrame(gdc_to_crdch_csoot_dict)
gdc_to_crdch_csoot_frame_complete = gdc_to_crdch_csoot_frame[
    ~gdc_to_crdch_csoot_frame['CRDC-H'].isnull()
]
# Bare last expression so the notebook renders the filtered table.
gdc_to_crdch_csoot_frame_complete

Unnamed: 0,GDC,CRDC-H
0,ajcc_clinical_m,Clinical Metastasis (M)
1,ajcc_clinical_n,Clinical Node (N)
2,ajcc_clinical_stage,Clinical Overall
3,ajcc_clinical_t,Clinical Tumor (T)
4,tumor_stage,Overall
5,ajcc_pathologic_m,Pathological Metastasis (M)
6,ajcc_pathologic_n,Pathological Node (N)
7,ajcc_pathologic_stage,Pathological Overall
8,ajcc_pathologic_t,Pathological Tumor (T)


In [7]:
# Accumulator for every mapping row fetched so far; get_mapping() searches it.
global_mapping_table = pd.DataFrame()


def get_mapping_table(model_name, class_name, slot_name, timeout=30):
    """Fetch the mappings of one model attribute and add them to global_mapping_table.

    Args:
        model_name: model segment of the terminology service URL (e.g. "CRDC-H").
        class_name: entity segment of the URL (e.g. "BodySite").
        slot_name: attribute segment of the URL (e.g. "site").
        timeout: seconds to wait for the service before giving up.

    Returns:
        None. The rows under the response's "mappings" key are appended to the
        module-level global_mapping_table; existing row labels are kept as-is.

    Raises:
        requests.HTTPError: the service answered with an error status.
        requests.Timeout: the service did not answer within `timeout` seconds.
        KeyError: the JSON response has no "mappings" key.
    """
    global global_mapping_table
    current_url = f"https://terminology.ccdh.io/models/{model_name}/entities/{class_name}/attributes/{slot_name}/mappings"
    response = requests.get(current_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as mappings.
    response.raise_for_status()
    resp_struct = response.json()
    current_mappings = pd.DataFrame(resp_struct["mappings"])
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    # Empty frames are left out so they cannot influence the resulting dtypes.
    frames = [
        frame for frame in (global_mapping_table, current_mappings) if not frame.empty
    ]
    if frames:
        global_mapping_table = pd.concat(frames)


In [8]:
# TODO:
#   - suggest pulling new content into global_mapping_table if there are no hits
#   - add required predicate handling
#   - add handling for len(results.index) == 0 or > 1
#   - add option for closest match via difflib.get_close_matches
#     or something more quantitative
def get_mapping(model_name, mf, sl, required_pred=""):
    """Look up the single mapping row for a subject field/label pair.

    Args:
        model_name: prefix of the match field, e.g. "GDC.Case".
        mf: field name; compared as model_name + "." + mf against the
            'subject_match_field' column of global_mapping_table.
        sl: label compared against the 'subject_label' column.
        required_pred: currently unused (predicate filtering is still a TODO).

    Returns:
        A one-element list holding the matching row as a dict when exactly one
        row matches. None otherwise: no hit, several hits, an empty mapping
        table, or a non-scalar `sl`.
    """
    # A list/dict value can never equal a label, and Series.eq would either
    # raise on a length mismatch or compare element-wise instead.
    if not pd.api.types.is_scalar(sl):
        return None
    # Nothing fetched yet: the lookup columns do not exist, so there is no hit.
    if global_mapping_table.empty:
        return None

    prefixed_mf = model_name + "." + mf
    results = global_mapping_table.loc[
        global_mapping_table['subject_match_field'].eq(prefixed_mf) &
        global_mapping_table['subject_label'].eq(sl)
    ]
    if len(results.index) == 1:
        return results.to_dict(orient="records")
    return None

In [9]:
# Reset the mapping accumulator to an empty frame, discarding anything fetched earlier.
global_mapping_table = pd.DataFrame()

In [10]:

# Fetch two mapping sets, printing the accumulator's shape before and after
# each fetch so the number of rows added by each call is visible.
print(global_mapping_table.shape)
get_mapping_table("CRDC-H", "BodySite", "site")
print(global_mapping_table.shape)
get_mapping_table("CRDC-H", "CancerStageObservation", "valueCodeableConcept")
print(global_mapping_table.shape)

(0, 0)
(1402, 9)
(2081, 9)


In [11]:
# global_mapping_table

In [12]:
# get_mapping("GDC.Case", "primary_site", "Brain")

In [13]:
# suggested_row['samples'][0]['sample_id']

In [14]:
# suggested_row['primary_site']

In [15]:
def transform_sample_to_specimen(sample):
    """
    Transform a GDC Sample into a CCDH Specimen.

    Args:
        sample: GDC sample dict. 'sample_id', 'sample_type', 'tissue_type',
            'tumor_code' and 'tumor_descriptor' are read with .get(), so a
            missing key yields None for the corresponding Specimen field.

    Returns:
        The populated ccdh.Specimen.
    """
    specimen = ccdh.Specimen(id=sample.get('sample_id'))
    specimen.source_material_type = sample.get('sample_type')
    # The Specimen repr lists general_tissue_pathology / specific_tissue_pathology
    # and no *_tissue_morphology slots, so the values are assigned to the
    # *_pathology slots; the old names only created ad-hoc attributes.
    specimen.general_tissue_pathology = sample.get('tissue_type')
    specimen.specific_tissue_pathology = sample.get('tumor_code')
    specimen.tumor_status_at_collection = sample.get('tumor_descriptor')
    # TODO: populate creation_activity. We don't consistently have
    # created_datetimes, and their format still needs checking.
    # if 'created_datetime' in sample and sample['created_datetime'] is not None:
    #     specimen.creation_activity = ccdh.SpecimenCreationActivity(
    #         date_ended=ccdh.TimePoint(
    #             dateTime=sample.get('created_datetime')
    #         )
    #     )
    # else:
    #     print("creation_activity not created because created_datetime absent or equals None")
    return specimen




In [16]:
# Let's try creating a test specimen from the first sample of the example record.
test_specimen = transform_sample_to_specimen(suggested_row['samples'][0])
# Bare last expression so the notebook displays the resulting Specimen.
test_specimen

Specimen(id='a118da56-784d-4b67-aade-d9a7a8b49f18', identifier=[], description=None, specimen_type=None, analyte_type=None, associated_project=None, data_provider=None, source_material_type='Primary Tumor', parent_specimen=[], source_subject=None, tumor_status_at_collection=None, creation_activity=None, processing_activity=[], storage_activity=[], transport_activity=[], contained_in=None, dimensional_measures=None, quantity_measure=[], quality_measure=[], cellular_composition_type=None, histological_composition_measures=[], general_tissue_pathology=None, specific_tissue_pathology=None, preinvasive_tissue_morphology=None, morphology_pathologically_confirmed=None, morphology_assessor_role=None, morphology_assessment_method=None, degree_of_dysplasia=None, dysplasia_fraction=None, related_document=[], section_location=None, derived_product=[], distance_from_paired_specimen=None)

In [41]:
# Smoke check: build a Diagnosis whose age_at_diagnosis is a Quantity created
# from a string value, and display the result.
ccdh.Diagnosis(age_at_diagnosis = ccdh.Quantity(value_decimal="123"))

Diagnosis(id=None, identifier=[], subject=None, age_at_diagnosis=Quantity(value_decimal=Decimal('123'), value_codeable_concept=None, unit=None), diagnosis_date=None, condition=None, primary_site=[], metastatic_site=[], stage=[], grade=[], morphology=None, disease_status=None, prior_diagnosis=None, method_of_diagnosis=None, related_specimen=[], primary_tumor_dimensional_measures=None, supporting_observation=[])

In [44]:
def create_body_site(site_input):
    """Build a CCDH BodySite for a GDC Case primary_site value.

    Args:
        site_input: the GDC primary_site string (e.g. "Brain").

    Returns:
        A ccdh.BodySite whose `site` CodeableConcept carries the mapped
        object_id/object_label as its coding and the GDC string as text,
        or None when get_mapping does not find exactly one mapping.
    """
    the_mapping = get_mapping("GDC.Case", "primary_site", site_input)
    # get_mapping returns None unless exactly one row matches; report that to
    # the caller (which checks for None) instead of failing on the_mapping[0].
    if not the_mapping:
        return None

    matched_row = the_mapping[0]
    mapped = ccdh.BodySite(
        site=ccdh.CodeableConcept(
            coding=[
                ccdh.Coding(
                    code=matched_row["object_id"],
                    system="undefined",
                    label=matched_row["object_label"],
                )
            ],
            text=site_input,
        )
    )
    return mapped


def create_stage_from_gdc(current_prefix, diagnosis):
    """Build a CancerStageObservationSet from the stage fields of a GDC diagnosis.

    A CancerStageObservation is added for every diagnosis field that
    (a) has exactly one CRDC-H label in gdc_to_crdch_csoot_frame_complete and
    (b) whose value resolves to a single mapping through get_mapping.

    Args:
        current_prefix: match-field prefix handed to get_mapping,
            e.g. "GDC.Diagnosis".
        diagnosis: GDC diagnosis dict (field name -> value).

    Returns:
        The ccdh.CancerStageObservationSet; it holds no observations when no
        field qualifies.
    """
    cancer_stage_method_type = None

    # if diagnosis.get('ajcc_staging_system_edition') == '7th':
    #    cancer_stage_method_type = 'AJCC staging system 7th edition'

    # Create an observation set
    obs = ccdh.CancerStageObservationSet(method_type=cancer_stage_method_type)
    for k, v in diagnosis.items():
        crdch_diag_info_type = list(
            gdc_to_crdch_csoot_frame_complete["CRDC-H"].loc[
                gdc_to_crdch_csoot_frame_complete["GDC"] == k
            ]
        )
        # Only fields with exactly one CRDC-H label are stage observations.
        # Checking this first avoids a mapping lookup for every other field.
        if len(crdch_diag_info_type) != 1:
            continue

        current_mapping = get_mapping(current_prefix, k, v)
        if current_mapping is None:
            continue

        current_observation = ccdh.CancerStageObservation(
            observation_type=ccdh.CodeableConcept(
                coding=[
                    ccdh.Coding(
                        code=crdch_diag_info_type[0],
                        system="undefined",
                    )
                ],
                text=k,
            ),
            value_codeable_concept=ccdh.CodeableConcept(
                coding=[
                    ccdh.Coding(
                        code=current_mapping[0]["object_id"],
                        system="undefined",
                        label=current_mapping[0]["object_label"],
                    )
                ],
                text=v,
            ),
        )
        obs.observations.append(current_observation)
    return obs

def printifin(current_dict, current_key):
    """Print "<key> = <value>" for current_key, showing None when the key is absent."""
    shown_value = current_dict[current_key] if current_key in current_dict else None
    print(f"{current_key} = {shown_value}")
        
def list_to_pii(current_dict, key_list):
    """Run printifin on current_dict once per entry of key_list, in order."""
    for wanted_key in key_list:
        printifin(current_dict, wanted_key)


def transform_diagnosis(diagnosis, case):
    """Transform a GDC diagnosis, together with its case, into a CCDH Diagnosis.

    Args:
        diagnosis: GDC diagnosis dict.
        case: the GDC case dict the diagnosis belongs to; its 'samples' and
            'primary_site' entries are read.

    Returns:
        A ccdh.Diagnosis with id, identifier, related_specimen and stage set,
        plus primary_site when the case's primary_site could be mapped.
    """
    # Debug output: the raw record, then the fields that are not transformed yet.
    print(diagnosis)
    # GDC names the grade field 'tumor_grade'; there is no 'grade' key.
    list_to_pii(diagnosis, ["primary_diagnosis", "morphology", "tumor_grade", "year_of_diagnosis"])

    ccdh_diagnosis = ccdh.Diagnosis(
        id=diagnosis.get("diagnosis_id"),
        related_specimen=[
            # `or []` covers a case without a 'samples' entry (or with None).
            transform_sample_to_specimen(sample) for sample in case.get("samples") or []
        ],
        stage=create_stage_from_gdc("GDC.Diagnosis", diagnosis),
        # age_at_diagnosis = ccdh.Quantity(value_decimal=diagnosis['year_of_diagnosis'])
    )

    # TODO: not transformed yet
    #     ccdh_diagnosis = ccdh.Diagnosis(
    #         condition=diagnosis.get('primary_diagnosis'),
    #         morphology=diagnosis.get('morphology'),
    #         grade=diagnosis.get('grade'),
    #         year_at_diagnosis=diagnosis.get('year_of_diagnosis'),
    #     )

    ccdh_diagnosis.identifier = [
        ccdh.Identifier(system="GDC-submitter-id", value=diagnosis.get("submitter_id"))
    ]

    # A missing, None or empty primary_site all count as "not populated".
    if case.get("primary_site"):
        body_site = create_body_site(case["primary_site"])
        if body_site is not None:
            # The case's primary site belongs in primary_site, not metastatic_site.
            ccdh_diagnosis.primary_site.append(body_site)
        else:
            print("returned body_site is None")
    else:
        print("case didn't seem to have a populated primary_site")

    return ccdh_diagnosis


# Transform the example record/diagnosis chosen earlier in the notebook
# (suggested_row / suggested_diagnosis_dict) rather than repeating the indices
# here, then dump the result as YAML.
example_diagnosis = transform_diagnosis(suggested_diagnosis_dict, suggested_row)
print("\n")
print(yaml_dumper.dumps(example_diagnosis))


{'age_at_diagnosis': 22107, 'ajcc_clinical_m': 'M1', 'ajcc_clinical_n': 'N1', 'ajcc_clinical_stage': 'Stage IVC', 'ajcc_clinical_t': 'T3', 'ajcc_pathologic_m': 'M1', 'ajcc_pathologic_n': 'N1', 'ajcc_pathologic_stage': 'Stage IVC', 'ajcc_pathologic_t': 'T3', 'ajcc_staging_system_edition': '7th', 'classification_of_tumor': 'not reported', 'created_datetime': None, 'days_to_diagnosis': 0, 'days_to_last_follow_up': None, 'days_to_last_known_disease_status': None, 'days_to_recurrence': None, 'diagnosis_id': '9e30aa6c-91e6-5dd3-9512-75c162a89913', 'icd_10_code': 'C32.9', 'last_known_disease_status': 'not reported', 'morphology': '8070/3', 'primary_diagnosis': 'Squamous cell carcinoma, NOS', 'prior_malignancy': 'no', 'prior_treatment': 'No', 'progression_or_recurrence': 'not reported', 'site_of_resection_or_biopsy': 'Larynx, NOS', 'state': 'released', 'submitter_id': 'TCGA-QK-A8Z8_diagnosis', 'synchronous_malignancy': 'No', 'tissue_or_organ_of_origin': 'Larynx, NOS', 'tumor_grade': 'not repor

ValueError:  Unknown argument: year_at_diagnosis = Quantity(value_decimal=Decimal('2013'), 