## Paso 1
Importar las librerías necesarias (entre ellas phenopackets, pandas, numpy, y otras que ayuden a procesar elementos como fechas o países en formato ISO). Esta sección iréis aumentándola a medida que descubráis que necesitáis más librerías.

In [1]:
import phenopackets.schema.v2 as pps2
#import numpy as np
#import pandas as pd
from google.protobuf.json_format import MessageToDict, ParseDict, MessageToJson
from google.protobuf.timestamp_pb2 import Timestamp
import json
import datetime

## Paso 2
Crear las funciones que adapten los valores del csv a valores que acepte Phenopackets.

In [2]:
def isoage(age):
    """Convert an age in decimal years into an ISO 8601 duration string.

    The whole part of ``age`` becomes the years component. The fractional
    part is rounded to two decimals and then split into months (twelfths of
    a year) and days (using an average month length of 30.437 days).
    Components equal to zero are left out of the result.

    Args:
        age: Non-negative age in years, as an ``int`` or ``float``.

    Returns:
        A duration such as ``"P62Y4M2D"``. When every component is zero the
        result is ``"P0D"`` (a bare ``"P"`` is not a valid duration).

    Raises:
        TypeError: If ``age`` is not a number.
        ValueError: If ``age`` is negative or NaN.
    """
    if age < 0:
        raise ValueError("Age must not be negative")

    years = int(age)
    year_fraction = round(age % 1, 2)
    months_exact = year_fraction * 12
    months = int(months_exact)
    days = int((months_exact % 1) * 30.437)

    # Rounding the fraction can push it up to 1.0 (e.g. 5.996 -> 1.0), which
    # would otherwise be rendered as "12M"; carry it into the years instead.
    if months >= 12:
        years += 1
        months -= 12

    components = ((years, "Y"), (months, "M"), (days, "D"))
    body = "".join(f"{amount}{unit}" for amount, unit in components if amount != 0)
    return "P" + (body or "0D")

def _ontology_items(value):
    """Return ``value`` as a list: lists/tuples are copied, anything else is wrapped."""
    if isinstance(value, (list, tuple)):
        return list(value)
    return [value]


def _age_element(years):
    """Build a ``TimeElement`` holding an age given in decimal years."""
    return pps2.TimeElement(age=pps2.Age(iso8601duration=isoage(years)))


def disease(d_term=None,d_excluded=None,d_onset=None,d_resolution=None,d_disease_stage=None,d_clinical_tnm_finding=None,d_primary_site=None,d_laterality=None):
    """Build a ``pps2.Disease`` message from loosely typed inputs.

    Every argument is optional; ``None`` means "leave the field unset".

    Args:
        d_term: ``OntologyClass`` naming the disease.
        d_excluded: Truth value stating whether the disease was excluded.
        d_onset: Age at onset in decimal years (converted with ``isoage``).
        d_resolution: Age at resolution in decimal years.
        d_disease_stage: One ``OntologyClass`` or a list/tuple of them.
        d_clinical_tnm_finding: One ``OntologyClass`` or a list/tuple of them.
        d_primary_site: ``OntologyClass`` for the primary site.
        d_laterality: ``OntologyClass`` for the laterality.

    Returns:
        The populated ``pps2.Disease`` message.

    Raises:
        TypeError: If any argument has a type or value the corresponding
            field cannot accept. The underlying error is chained as the cause.
    """
    message_errors = (TypeError, ValueError, AttributeError)

    try:
        d = pps2.Disease(term=d_term)
    except message_errors as err:
        raise TypeError("The given term is incorrect") from err

    # None leaves `excluded` at its default; previously the default argument
    # always fell through to the TypeError branch.
    if d_excluded is not None:
        # `in` compares with ==, so 0/1 are still accepted as before.
        if d_excluded in (True, False):
            d.excluded = bool(d_excluded)
        else:
            raise TypeError("Excluded")

    if d_onset is not None:
        try:
            d.onset.CopyFrom(_age_element(d_onset))
        except message_errors + (OverflowError,) as err:
            raise TypeError("Onset is not a number") from err

    if d_resolution is not None:
        try:
            d.resolution.CopyFrom(_age_element(d_resolution))
        except message_errors + (OverflowError,) as err:
            raise TypeError("Resolution is not a number") from err

    if d_disease_stage is not None:
        try:
            d.disease_stage.extend(_ontology_items(d_disease_stage))
        except message_errors as err:
            raise TypeError("The disease stages are not OntologyClass") from err

    if d_clinical_tnm_finding is not None:
        try:
            # Extend with the TNM findings themselves (the disease stage was
            # appended here by mistake before).
            d.clinical_tnm_finding.extend(_ontology_items(d_clinical_tnm_finding))
        except message_errors as err:
            raise TypeError("The TNM findings are not OntologyClass") from err

    if d_primary_site is not None:
        try:
            d.primary_site.CopyFrom(d_primary_site)
        except message_errors as err:
            raise TypeError("Primary site is not a OntologyClass") from err

    if d_laterality is not None:
        try:
            d.laterality.CopyFrom(d_laterality)
        except message_errors as err:
            raise TypeError("Laterality is not a OntologyClass") from err

    return d
    
    
def sexph(n):
    """Translate a numeric sex code into the string used for ``Individual.sex``.

    Args:
        n: Sex code; ``1`` maps to male and ``2`` to female.

    Returns:
        ``"MALE"``, ``"FEMALE"``, or ``"UNKNOWN_SEX"`` for any other value.
    """
    sex_dict = {1: "MALE", 2: "FEMALE"}
    try:
        return sex_dict[n]
    except (KeyError, TypeError):
        # KeyError: unmapped code. TypeError: unhashable value such as a list.
        return "UNKNOWN_SEX"
def vitalph(n):
    """Translate a numeric vital-status code into a status string.

    Args:
        n: Status code; ``0`` maps to alive, ``1`` to deceased and ``-1`` to
            unknown.

    Returns:
        ``"ALIVE"``, ``"DECEASED"``, or ``"UNKNOWN_STATUS"`` for ``-1`` and
        for any other value.
    """
    vital_status = {0: "ALIVE", 1: "DECEASED", -1: "UNKNOWN_STATUS"}
    try:
        return vital_status[n]
    except (KeyError, TypeError):
        # KeyError: unmapped code. TypeError: unhashable value such as a list.
        return "UNKNOWN_STATUS"

## Paso 3
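Crear los objetos `OntologyClass` y el resto de elementos (edades, fechas, individuos, muestras, interpretaciones, tratamientos y metadatos) que se van a usar para montar el phenopacket.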

In [3]:
# Ontology terms reused throughout the notebook.

# Diseases and phenotypes.
t2d = pps2.OntologyClass(id="NCIT:C26747", label="Type 2 Diabetes Mellitus")
# The stray underscore made this an invalid CURIE ("NCIT:_C159658").
# NOTE(review): confirm that C159658 is the intended NCIt code for "Obese".
obese = pps2.OntologyClass(id="NCIT:C159658", label="Obese")
mdelay = pps2.OntologyClass(id="HP:0001270", label="Motor delay")

tdisease = pps2.OntologyClass(id="NCIT:C3879", label="Thyroid Gland Medullary Carcinoma")
thyroid = pps2.OntologyClass(id="NCIT:C12400", label="Thyroid Gland")

# Assay and unit used by the platelet measurement.
fassay = pps2.OntologyClass(id="LOINC:26515-7", label="Platelets [#/volume] in Blood")
# Label typo fixed ("imcroliter" -> "microliter").
funit = pps2.OntologyClass(id="UO:0000316", label="cells per microliter")

# Tumour staging.
f_tnm = pps2.OntologyClass(id="NCIT:C141023", label="Medullary Thyroid Carcinoma pT0 TNM")
f_stage = pps2.OntologyClass(id="NCIT:C27971", label="Stage IV")

# Severity modifier.
mild = pps2.OntologyClass(id="HP:0012825", label="Mild")

# Gender terms.
male = pps2.OntologyClass(id="NCIT:C46109", label="Male Gender")
female = pps2.OntologyClass(id="NCIT:C46110", label="Female Gender")

# Taxonomy shared by every individual and by the biosample.
human = pps2.OntologyClass(id="NCIT:C14225", label="Human")

In [4]:
# Onset ages (in years) for father, mother and proband, wrapped as TimeElements.
fonset, monset, ponset = (
    pps2.TimeElement(age=pps2.Age(iso8601duration=isoage(years)))
    for years in (40, 60, 25)
)

In [5]:
# Dates of birth. The datetimes are made UTC-aware so the resulting epoch
# seconds do not depend on the machine's local timezone, and so that dates
# before 1970 do not rely on platform support for negative local timestamps.
fts = Timestamp(
    seconds=int(datetime.datetime(1962, 12, 5, 13, 7, 33, tzinfo=datetime.timezone.utc).timestamp())
)
mts = Timestamp(
    seconds=int(datetime.datetime(1960, 2, 5, 12, 5, 20, tzinfo=datetime.timezone.utc).timestamp())
)
pts = Timestamp(
    seconds=int(datetime.datetime(1994, 7, 2, 22, 34, 38, tzinfo=datetime.timezone.utc).timestamp())
)

# Time of death is simply "now" (the moment the cell is run).
ftimeofdeath = pps2.TimeElement(timestamp=Timestamp(seconds=int(datetime.datetime.now().timestamp())))

# Ages at last encounter, given in decimal years.
fage = pps2.TimeElement(age=pps2.Age(iso8601duration=isoage(62.34)))
mage = pps2.TimeElement(age=pps2.Age(iso8601duration=isoage(64.56)))
page = pps2.TimeElement(age=pps2.Age(iso8601duration=isoage(30.23)))

In [6]:
# Vital status per family member; only the father is recorded as deceased.
fdead = pps2.VitalStatus(
    status="DECEASED",
    time_of_death=ftimeofdeath,
    cause_of_death=tdisease,
    survival_time_in_days=84,
)
malive = pps2.VitalStatus(status="ALIVE")
palive = pps2.VitalStatus(status="ALIVE")

In [7]:
# One (fictional) VCF file per family member. Each maps the individual's id to
# the sample identifier used inside that file.
pfile = pps2.File(
    uri="file://ThisFileDoesNotExistHaHa.vcf.gz",
    individual_to_file_identifiers={"Proband": "Proband_VCF"},
    file_attributes={
        "genomeAssembly": "GRCh38",
        "fileFormat": "vcf",
        "description": "What did the pirate say when he turned 80? Aye matey.",
    },
)
ffile = pps2.File(
    uri="file://ThisFileDoesNotExistHeHe.vcf.gz",
    individual_to_file_identifiers={"Father": "Father_VCF"},
    file_attributes={
        "genomeAssembly": "GRCh38",
        "fileFormat": "vcf",
        "description": "What do you call a train carrying bubblegum? A chew-chew train",
    },
)
mfile = pps2.File(
    uri="file://ThisFileDoesNotExistHoHo.vcf.gz",
    individual_to_file_identifiers={"Mother": "Mother_VCF"},
    file_attributes={
        "genomeAssembly": "GRCh38",
        "fileFormat": "vcf",
        "description": "Did you hear about the guy who invented the knock-knock joke? He won the 'no-bell' prize.",
    },
)

In [8]:
# The three individuals of the trio. Sex codes go through sexph (1 = male,
# 2 = female).
father = pps2.Individual(
    id="Father",
    sex=sexph(1),
    karyotypic_sex="XY",
    gender=male,
    date_of_birth=fts,
    time_at_last_encounter=fage,
    vital_status=fdead,
    taxonomy=human,
)
mother = pps2.Individual(
    id="Mother",
    sex=sexph(2),
    karyotypic_sex="XX",
    gender=female,
    date_of_birth=mts,
    time_at_last_encounter=mage,
    vital_status=malive,
    taxonomy=human,
)
proband = pps2.Individual(
    id="Proband",
    sex=sexph(1),
    karyotypic_sex="XY",
    gender=male,
    date_of_birth=pts,
    time_at_last_encounter=page,
    vital_status=palive,
    taxonomy=human,
)


In [9]:
# Platelet count with its reference range.
fquant = pps2.Quantity(
    unit=funit,
    value=24943,
    reference_range=pps2.ReferenceRange(unit=funit, low=150000.0, high=450000.0),
)
fval = pps2.Value(quantity=fquant)

# Building blocks for the father's biosample.
msample = pps2.OntologyClass(id="NCIT:C18009", label="Tumor Tissue Sample")
tcoll = pps2.TimeElement(age=pps2.Age(iso8601duration=isoage(58.4)))
stissue = pps2.OntologyClass(id="SNOMED:127478007", label="Tissue specimen from thyroid gland (specimen)")
sdtissue = pps2.OntologyClass(id="SNOMED:258566005", label="Deoxyribonucleic acid sample (specimen)")
eprocedure = pps2.Procedure(
    code=pps2.OntologyClass(id="NCIT:C28743", label="Punch Biopsy"),
    body_site=pps2.OntologyClass(id="NCIT:C12664", label="Abdomen"),
    performed=pps2.TimeElement(age=pps2.Age(iso8601duration=isoage(58.31))),
)
# The NCIt code was missing its leading "C" ("NCIT:84509").
tprogression = pps2.OntologyClass(id="NCIT:C84509", label="Primary Malignant Neoplasm")
pstage = pps2.OntologyClass(id="SNOMED:718465002", label="Clinical stage 0 (finding)")
tgrade = f_stage
p_tnm = [f_tnm]
hdiag = tdisease
dmark = [
    pps2.OntologyClass(id="NCIT:C68748", label="Her2/Neu Positive"),
    pps2.OntologyClass(id="NCIT:C131711", label="Human Papillomavirus-18 Positive"),
]
pfeat = pps2.PhenotypicFeature(type=pps2.OntologyClass(id="NCIT:C27009", label="Myalgia"))

# Measurement taken on the biosample.
mprocedure = pps2.Procedure(
    code=pps2.OntologyClass(id="NCIT:C40592", label="Platelet Activating Factor Signaling Process"),
    performed=pps2.TimeElement(age=pps2.Age(iso8601duration=isoage(58.32))),
)
measample = pps2.Measurement(
    assay=fassay,
    value=fval,
    # UTC-aware so the stored instant does not depend on the local timezone.
    time_observed=pps2.TimeElement(
        timestamp=Timestamp(
            seconds=int(datetime.datetime(2008, 7, 3, 13, 30, 21, tzinfo=datetime.timezone.utc).timestamp())
        )
    ),
    procedure=mprocedure,
)
sproc = pps2.OntologyClass(id="NCIT:C64863", label="Ultrasonic Surgical Aspiration")
# The id had no namespace prefix ("C178971"), so it was not a valid CURIE.
# NOTE(review): NCIT is assumed from the code format - confirm.
sstor = pps2.OntologyClass(id="NCIT:C178971", label="Refrigerator")


In [10]:
# Biosample taken from the father, assembled from the pieces defined above.
biosample = pps2.Biosample(
    # Identity and provenance.
    id="LukeIAmYourFather",
    individual_id="Father",
    description="Damn he's dead. Sucks.",
    taxonomy=human,
    time_of_collection=tcoll,
    procedure=eprocedure,
    # What was sampled and how it was handled.
    sampled_tissue=stissue,
    sample_type=sdtissue,
    material_sample=msample,
    sample_processing=sproc,
    sample_storage=sstor,
    # Clinical and pathological findings.
    phenotypic_features=[pfeat],
    measurements=[measample],
    histological_diagnosis=tdisease,
    tumor_progression=tprogression,
    tumor_grade=tgrade,
    pathological_stage=pstage,
    pathological_tnm_finding=p_tnm,
    diagnostic_markers=dmark,
    # Associated files.
    files=[ffile],
)

In [11]:
# Obesity is recorded as explicitly excluded.
pobese = pps2.PhenotypicFeature(type=obese, excluded=True)

# Literature evidence attached to the motor-delay feature.
evidence = pps2.Evidence(
    evidence_code=pps2.OntologyClass(
        id="ECO:0006017",
        label="author statement from published clinical study used in manual assertion",
    ),
    reference=pps2.ExternalReference(
        id="PMID:30962759",
        description="Recurrent Erythema Nodosum in a Child with a SHOC2 Gene Mutation",
    ),
)

# Mild, recurrent motor delay with onset at 43 and resolution at 45 years.
p = pps2.PhenotypicFeature(
    type=mdelay,
    severity=mild,
    modifiers=[pps2.OntologyClass(id="HP:0031796", label="Recurrent")],
    onset=pps2.TimeElement(age=pps2.Age(iso8601duration=isoage(43))),
    resolution=pps2.TimeElement(age=pps2.Age(iso8601duration=isoage(45))),
    evidence=[evidence],
)

In [12]:
# Variant described through a VCF record.
vardes = pps2.VariationDescriptor(
    id="FakeVariant",
    vcf_record=pps2.VcfRecord(
        genome_assembly="GRCh38",
        chrom="10",
        pos=121496701,
        id="rs121918506",
        ref="T",
        alt="G",
        qual=".",
        filter=".",
        info=".",
    ),
)

# Interpretation chain: variant -> genomic interpretation -> diagnosis.
varint = pps2.VariantInterpretation(
    variation_descriptor=vardes,
    acmg_pathogenicity_classification="LIKELY_PATHOGENIC",
    therapeutic_actionability="ACTIONABLE",
)
genom = pps2.GenomicInterpretation(
    subject_or_biosample_id="LukeIAmYourFather",
    interpretation_status="CANDIDATE",
    variant_interpretation=varint,
)
diagn = pps2.Diagnosis(disease=tdisease, genomic_interpretations=[genom])

interpretation = pps2.Interpretation(
    id="Consortium:0000123456",
    progress_status="SOLVED",
    diagnosis=diagn,
    summary="Por dios que no se acaba esto.",
)

In [13]:
# Diseases for the father, built with the disease() helper.
# Despite its name, `dcovid` uses the thyroid carcinoma term (tdisease).
dcovid = disease(
    d_term=tdisease,
    d_excluded=False,
    d_onset=62.2,
    d_resolution=64.56,
    d_disease_stage=f_stage,
    d_clinical_tnm_finding=f_tnm,
    d_primary_site=thyroid,
    d_laterality=pps2.OntologyClass(id="NCIT:C160199", label="Tumor Laterality Right"),
)
# Type 2 diabetes is recorded as excluded.
ddiabetes = disease(d_term=t2d, d_excluded=True)

In [14]:
# Dosing: 30 mg twice daily from 15 to 25 March 2020.
# The end year was written as 202 (a typo for 2020), which put the end of the
# interval long before its start. Both datetimes are UTC-aware so the stored
# instants do not depend on the local timezone.
dint = pps2.DoseInterval(
    quantity=pps2.Quantity(
        unit=pps2.OntologyClass(id="UO:0000022", label="milligram"),
        value=30.0,
    ),
    schedule_frequency=pps2.OntologyClass(id="NCIT:C64496", label="Twice Daily"),
    interval=pps2.TimeInterval(
        start=Timestamp(
            seconds=int(datetime.datetime(2020, 3, 15, 13, 0, 0, tzinfo=datetime.timezone.utc).timestamp())
        ),
        end=Timestamp(
            seconds=int(datetime.datetime(2020, 3, 25, 13, 7, 33, tzinfo=datetime.timezone.utc).timestamp())
        ),
    ),
)

# Oral losartan prescription using the dose interval above.
treat = pps2.Treatment(
    agent=pps2.OntologyClass(id="DrugCentral:1610", label="losartan"),
    route_of_administration=pps2.OntologyClass(id="NCIT:C38288", label="Oral Route of Administration"),
    dose_intervals=[dint],
    drug_type="PRESCRIPTION",
    cumulative_dose=pps2.Quantity(
        unit=pps2.OntologyClass(id="EFO:0002902", label="milligram per kilogram"),
        value=200.0,
    ),
)

In [15]:
# Radiation therapy on the abdomen: dosage 43, delivered in 13 fractions.
rthe = pps2.RadiationTherapy(
    modality=pps2.OntologyClass(id="NCIT:C104914", label="High-LET Heavy Ion Therapy"),
    body_site=pps2.OntologyClass(id="NCIT:C12664", label="Abdomen"),
    dosage=43,
    fractions=13,
)

In [16]:
# Therapeutic regimen that was discontinued at the father's time of death.
# NOTE(review): the regimen's ontology class has an empty id and label; it
# looks like a placeholder that still needs a real term.
ther = pps2.TherapeuticRegimen(
    ontology_class=pps2.OntologyClass(id="", label=""),
    # UTC-aware so the stored instant does not depend on the local timezone.
    start_time=pps2.TimeElement(
        timestamp=Timestamp(
            seconds=int(datetime.datetime(2020, 2, 3, 14, 20, 44, tzinfo=datetime.timezone.utc).timestamp())
        )
    ),
    end_time=ftimeofdeath,
    regimen_status="DISCONTINUED",
)

In [17]:
# One MedicalAction per kind of action: drug treatment, radiation therapy and
# therapeutic regimen.
med1 = pps2.MedicalAction(treatment=treat)
med2 = pps2.MedicalAction(
    radiation_therapy=rthe,
    treatment_target=thyroid,
    treatment_intent=pps2.OntologyClass(id="NCIT:C185614", label="Curative Therapy"),
    response_to_treatment=pps2.OntologyClass(id="NCIT:C94226", label="Improvement"),
    adverse_events=[pps2.OntologyClass(id="NCIT:C2982", label="Depression")],
    treatment_termination_reason=pps2.OntologyClass(id="NCIT:C28554", label="Dead"),
)
med3 = pps2.MedicalAction(therapeutic_regimen=ther)

In [18]:
# Ontology resources referenced by the phenopacket.
# NOTE(review): terms with HP, LOINC, UO, ECO, EFO and DrugCentral prefixes are
# also used in this notebook but have no Resource entry here - confirm whether
# they should be added.
ncit = pps2.Resource(
    id="ncit",
    name="NCI Thesaurus OBO Edition",
    namespace_prefix="NCIT",
    url="http://purl.obolibrary.org/obo/ncit.owl",
    version="https://github.com/NCI-Thesaurus/thesaurus-obo-edition/releases/tag/v2024-05-07/ncit.owl",
)
snomed = pps2.Resource(
    id="snomed",
    name="SNOMED CT",
    namespace_prefix="SNOMED",
    url="https://browser.ihtsdotools.org/",
    version="https://browser.ihtsdotools.org/?edition=MAIN/2024-12-01",
)

# Metadata stamped with the time at which the cell is run.
metadata = pps2.MetaData(
    created=Timestamp(seconds=int(datetime.datetime.now().timestamp())),
    created_by="Aldar Cabrelles",
    submitted_by="Mickey Mouse",
    resources=[ncit, snomed],
    phenopacket_schema_version="2.0",
)

# Por FIN hacemos el phenopacket

In [19]:
# Assemble the father's phenopacket from everything built above.
ph = pps2.Phenopacket(
    id="Father_info",
    meta_data=metadata,
    subject=father,
    phenotypic_features=[pobese, p],
    measurements=[measample],
    biosamples=[biosample],
    interpretations=[interpretation],
    diseases=[dcovid, ddiabetes],
    medical_actions=[med1, med2, med3],
    files=[ffile],
)

# Plain-dict form of the message, ready to be dumped as JSON.
phjson = MessageToDict(ph)

## Paso 6
Exportar phenopacket en un archivo.

In [20]:
# Write the phenopacket dict as pretty-printed UTF-8 JSON (non-ASCII kept as is).
with open('/Users/aldar/Documents/CRG-EGA/Curation/Phenopackets/Testing/Output/extensivepheno.json', 'w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(phjson, ensure_ascii=False, indent=4))