# Create GKS object for genes in cBioportal transformer

This notebook was written to address issue #547 in the cancervariants/metakb repository.

In [None]:
from os import environ

# These environment variables are set before the metakb imports below.
# NOTE(review): presumably metakb (or one of its dependencies) reads them when
# it is imported/initialised, which is why they come first — confirm.
# The AWS values are placeholders, not real credentials.
# SEQREPO_ROOT_DIR is a machine-specific absolute path; adjust it to your install.
environ["AWS_ACCESS_KEY_ID"]="dummy"
environ["AWS_SECRET_ACCESS_KEY"]="dummy"
environ["AWS_SESSION_TOKEN"]="dummy"
environ["SEQREPO_ROOT_DIR"]="/usr/local/share/seqrepo/2024-12-20"

# cBioPortal harvester / transformer under test, plus the Transformer base class
# that CBPTransformer (defined further down) subclasses.
import metakb.harvesters.cbioportal as cbph
import metakb.transformers.cbp_es_dfarber_broad_2014 as cbpt
from metakb.transformers.base import Transformer

import pandas as pd
import requests
from tqdm import tqdm
from typing import List

import logging
# Module-level logger used by CBPTransformer._add_genes for debug messages.
_logger = logging.getLogger(__name__)

# GA4GH core models used to build the gene / variant mappings in this notebook.
from ga4gh.core.models import (
    Coding,
    ConceptMapping,
    Extension,
    MappableConcept,
    Relation,
)

200
Downloaded es_dfarber_broad_2014.tar.gz
Extracted to: es_dfarber_broad_2014_extracted


## Import the CBP harvester and transformers
Comment out this section after the first run to save time.

In [2]:
#Run while inside the harvesters directory
# Harvest the cBioPortal data, then run it through the cBioPortal transformer.
# The result `df` is what the following cells save to disk and reload.
f = cbph.cBioportalHarvester()
data = f.harvest()
g = cbpt.cBioportalTransformer()
df = g.transform(data)

Number of duplicate rows : 495

Duplicate rows (excluding first instance):
      Hugo_Symbol Chromosome  Start_Position  End_Position  \
720         ABCA1          9       107607765     107607765   
730         ABCA3         16         2338066       2338066   
783         ABCC9         12        21954066      21954066   
813         ACACA         17        35640173      35640173   
829          ACHE          7       100490251     100490251   
...           ...        ...             ...           ...   
15134       STAG2          X       123179197     123179197   
15155     ZDHHC15          X        74742823      74742824   
15191       HUWE1          X        53579734      53579734   
15216     SHROOM4          X        50350700      50350700   
15218        SOX3          X       139586714     139586714   

              Consequence Variant_Classification Variant_Type  \
720      missense_variant      Missense_Mutation          SNP   
730      missense_variant      Missense_Mutation  

In [3]:
# Persist the transformed data so that later sessions can reload it from disk
# instead of repeating the harvest + transform step above.
df.to_excel('test_file.xlsx')

## Reload df from the saved file so that you don't need to rerun the previous cells

In [4]:
# Reload the frame written by the previous cell; index_col=0 uses the first
# spreadsheet column (the index saved by to_excel) as the DataFrame index.
df = pd.read_excel('test_file.xlsx',index_col=0)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,Hugo_Symbol,Chromosome,Start_Position,End_Position,Consequence,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele2,SAMPLE_ID,...,temp_Gnomad_Notation,Chrom_23,Chr23_X,Chr23_Y,x_hgnc_id,y_hgnc_id,ambig_chrom,gene_hgnc_id,hgnc_id_match,Gnomad_Notation
0,ETFDH,4,159603535,159603535,missense_variant,Missense_Mutation,SNP,G,C,SJDES004,...,4-159603535-G-C,False,False,False,no_value,no_value,non-ambiguous,untested,untested,4-159603535-G-C
1,GJB4,1,35227334,35227334,missense_variant,Missense_Mutation,SNP,G,A,SJDES004,...,1-35227334-G-A,False,False,False,no_value,no_value,non-ambiguous,untested,untested,1-35227334-G-A
2,MUC5B,11,1272679,1272679,missense_variant,Missense_Mutation,SNP,G,A,SJDES004,...,11-1272679-G-A,False,False,False,no_value,no_value,non-ambiguous,untested,untested,11-1272679-G-A
3,PCDHAC1,5,140307515,140307515,synonymous_variant,Silent,SNP,G,A,SJDES004,...,5-140307515-G-A,False,False,False,no_value,no_value,non-ambiguous,untested,untested,5-140307515-G-A
4,YAP1,11,102080254,102080254,missense_variant,Missense_Mutation,SNP,C,T,SJDES004,...,11-102080254-C-T,False,False,False,no_value,no_value,non-ambiguous,untested,untested,11-102080254-C-T


## Variant MappableConcept
Now with the data from the cBioportal harvester + transformer, we can start writing a loop to get variant data processed

In [5]:
# Number of leading rows sent to the variation normalizer while prototyping.
N_VARIANTS = 50
# Seconds to wait for the normalizer before a request is abandoned, so that one
# stalled connection cannot hang the whole cell.
REQUEST_TIMEOUT = 30

# One single-element list of ConceptMapping per processed row, in row order.
all_mappings = []

base_url = 'https://normalize.cancervariants.org/variation/to_vrs' #TODO: change to use Python API not REST API
variant_rows = df[0:N_VARIANTS]

# iterrows() is a generator with no length, so pass total= explicitly to get a
# real progress bar instead of a bare iteration counter.
for idx, row in tqdm(variant_rows.iterrows(), total=len(variant_rows)):
    variant_of_interest = row['Gnomad_Notation']

    # Let requests build (and URL-encode) the ?q=... query string.
    # `r` is left as a notebook global on purpose: a later cell inspects it.
    r = requests.get(
        base_url,
        params={'q': variant_of_interest},
        timeout=REQUEST_TIMEOUT,
    )

    if r.status_code == 200:
        try:
            vrs_id = r.json()['variations'][0]['id']
        except (ValueError, KeyError, IndexError, TypeError):
            # ValueError: body is not valid JSON; KeyError / IndexError /
            # TypeError: JSON does not have the expected
            # {'variations': [{'id': ...}]} shape (e.g. nothing was normalized).
            # Anything else (KeyboardInterrupt, programming errors) now
            # propagates instead of being silently swallowed.
            vrs_id = f'Error with {variant_of_interest}'
    else:
        vrs_id = 'Not Found'

    # NOTE(review): when the lookup fails, the sentinel strings above are stored
    # as the Coding id/code with an exactMatch relation. That keeps
    # all_mappings aligned with the input rows, but these entries are not real
    # identifiers — filter them out before this leaves the prototype stage.
    mappings = [
        ConceptMapping(
            coding=Coding(
                id=vrs_id,
                code=str(vrs_id),
                system="https://www.cbioportal.org/",
            ),
            relation=Relation.EXACT_MATCH,
        )
    ]

    all_mappings.append(mappings)

# Code to convert all mappings to JSON object goes here

print(all_mappings)

50it [00:13,  3.64it/s]

[[ConceptMapping(id=None, extensions=None, coding=Coding(id='ga4gh:VA.A_KHwjPFBBTw2J2AwTbiEDYA1AZM0-T0', extensions=None, name=None, system='https://www.cbioportal.org/', systemVersion=None, code=code(root='ga4gh:VA.A_KHwjPFBBTw2J2AwTbiEDYA1AZM0-T0'), iris=None), relation='exactMatch')], [ConceptMapping(id=None, extensions=None, coding=Coding(id='ga4gh:VA.lHm5O2TRaxF1tYvxutOofpC40WTjGHea', extensions=None, name=None, system='https://www.cbioportal.org/', systemVersion=None, code=code(root='ga4gh:VA.lHm5O2TRaxF1tYvxutOofpC40WTjGHea'), iris=None), relation='exactMatch')], [ConceptMapping(id=None, extensions=None, coding=Coding(id='ga4gh:VA.F5cXKsHFc7TKvrcysBAzSUIWzve89eoT', extensions=None, name=None, system='https://www.cbioportal.org/', systemVersion=None, code=code(root='ga4gh:VA.F5cXKsHFc7TKvrcysBAzSUIWzve89eoT'), iris=None), relation='exactMatch')], [ConceptMapping(id=None, extensions=None, coding=Coding(id='ga4gh:VA.8mB3IFMdAVHOn0Pv5sZbijYClEXZhJPi', extensions=None, name=None, sys




In [6]:
#The mappings should be a ConceptMapping object (ga4gh.core.models.ConceptMapping)
# `mappings` is the list left over from the last iteration of the loop above.
type(mappings[0])

ga4gh.core.models.ConceptMapping

In [7]:
# inspect full response if code == 200
# `r` is the response object left over from the last request made in the loop
# above, so this shows the normalizer payload for the last processed variant.
r.json()

{'search_term': '16-3340401-C-A',
 'variations': [{'id': 'ga4gh:VA.ijmmVdDcPm-AvDVr94F9tNJnC61pvkEk',
   'type': 'Allele',
   'digest': 'ijmmVdDcPm-AvDVr94F9tNJnC61pvkEk',
   'location': {'id': 'ga4gh:SL.Wzcz9PAfx1b4LJP_a9B98YB1Wp-K9sxA',
    'type': 'SequenceLocation',
    'digest': 'Wzcz9PAfx1b4LJP_a9B98YB1Wp-K9sxA',
    'sequenceReference': {'type': 'SequenceReference',
     'refgetAccession': 'SQ.W6wLoIFOn4G7cjopxPxYNk2lcEqhLQFb'},
    'start': 3340400,
    'end': 3340401,
    'sequence': 'C'},
   'state': {'type': 'LiteralSequenceExpression', 'sequence': 'A'}}],
 'service_meta_': {'name': 'variation-normalizer',
  'version': '0.13.0',
  'response_datetime': '2025-09-30T16:19:30.296216Z',
  'url': 'https://github.com/cancervariants/variation-normalization'}}

## Gene Mappable Concept
Now with the data from the cBioportal harvester + transformer, we can start writing a loop to get gene data processed

In [8]:
def _get_exact_gene_mappings(hgnc_id: str, gene_symbol: str) -> list[ConceptMapping]:
        """ Get HGNC gene mapping 

        param hgnc_id: the unique numeric identifier provided by HGNC to each gene (with no "hgnc:" prefix) that is present in the CBP data in the column gene_hgnc_id
        param gene_symbol: the gene symbol provided in the CBP data in the column Hugo_Symbol

        return: Concept Mapping for HGNC Gene
        """

        #if there is a value in the gene_hgnc_id column that is the string "untested"
        if not hgnc_id and hgnc_id == "untested":
            return None
        
        #add the "hgnc:" prefix to the unique numeric identifier
        gene_id = f"hgnc:{hgnc_id}"

        return[
            ConceptMapping(
                coding=Coding(
                    id= gene_id,
                    name=gene_symbol,
                    code=gene_id.upper(),
                    system="https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/",
                ),
                relation=Relation.EXACT_MATCH,
                extensions= [Extension(name="cbioportal_annotation", value=True)],
            )]

In [9]:
#Check the output behaves as expected
# Spot check with one HGNC id / symbol pair. Note that this rebinds the
# notebook-level name `mappings` used by the variant section above.
mappings = _get_exact_gene_mappings("13056","ZNF263")
mappings

[ConceptMapping(id=None, extensions=[Extension(id=None, extensions=None, name='cbioportal_annotation', value=True, description=None)], coding=Coding(id='hgnc:13056', extensions=None, name='ZNF263', system='https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/', systemVersion=None, code=code(root='HGNC:13056'), iris=None), relation='exactMatch')]

In [10]:
class CBPTransformer(Transformer):
    """A class for transforming CBP to the common data model.

    Prototype: only ``_add_genes`` is implemented. The other methods are empty
    placeholders. NOTE(review): presumably they exist so that the class can be
    instantiated against the ``Transformer`` base — confirm against the base
    class.
    """

    def _create_cache(self):
        """Placeholder; not implemented in this prototype."""
        pass

    def _get_therapeutic_substitute_group(self):
        """Placeholder; not implemented in this prototype."""
        pass

    def _get_therapy(self):
        """Placeholder; not implemented in this prototype."""
        pass

    def transform(self):
        """Placeholder; not implemented in this prototype."""
        pass

    def _add_genes(self, genes: list[dict]) -> list:
        """Create a gene ``MappableConcept`` for every CBP record.

        For each record the gene normalizer is queried with the HGNC id (when
        one is present and is not the "untested" placeholder) and then with
        the gene symbol; the first query that yields a normalized id wins.
        When normalization succeeds the mappings come from the normalizer
        response. When it fails, the concept carries the normalizer-failure
        extension and falls back to the CBP-provided HGNC mapping, or to no
        mappings at all when the record has no usable HGNC id.

        The results are only returned: writing to ``_cache.genes`` /
        ``processed_data.genes`` is not implemented yet (see the TODOs below).

        :param genes: CBP records as dicts; the keys ``Hugo_Symbol`` and
            ``gene_hgnc_id`` are read from each one
        :return: one ``MappableConcept`` per input record, in input order
        """
        transform_genes = []

        # for gene in genes:  # use this line instead to drop tqdm for backend use
        for gene in tqdm(genes, desc="Processing genes"):
            gene_symbol = gene.get("Hugo_Symbol")
            hgnc_id = gene.get("gene_hgnc_id")

            # "untested" is a placeholder in the CBP data, not a real HGNC id.
            has_hgnc_id = bool(hgnc_id) and hgnc_id != "untested"

            # NOTE(review): the HGNC id is queried without an "hgnc:" prefix —
            # confirm the gene normalizer resolves the bare number.
            queries = [hgnc_id, gene_symbol] if has_hgnc_id else [gene_symbol]
            extensions = []

            normalized_gene_id = None
            gene_norm_resp = None

            # Stop at the first query the normalizer can resolve.
            for query in queries:
                gene_norm_resp, normalized_gene_id = self.vicc_normalizers.normalize_gene(query)
                if normalized_gene_id:
                    break

            if not normalized_gene_id:
                _logger.debug(
                    "Gene Normalizer unable to normalize: using queries %s",
                    queries,
                )
                # Build the CBP fallback mapping only here, where it is used,
                # and only from a real HGNC id so that the placeholder can
                # never end up in a Coding.
                mappings = (
                    _get_exact_gene_mappings(hgnc_id, gene_symbol)
                    if has_hgnc_id
                    else None
                )
                extensions.append(self._get_vicc_normalizer_failure_ext())
            else:
                mappings = self._get_vicc_normalizer_mappings(normalized_gene_id, gene_norm_resp)
                #self._update_normalizer_mappings(mappings, _get_exact_gene_mappings)
                #TODO: add this back, figure out how

            cbp_gene = MappableConcept(
                conceptType="Gene",
                name=gene_symbol,
                mappings=mappings,
                # An empty list is passed as None.
                extensions=extensions or None,
            )
            transform_genes.append(cbp_gene)

            #TODO: cache the concept once the cache is implemented
            #self._cache.genes[gene_symbol] = cbp_gene
        return transform_genes

In [11]:
#Convert CBP data df to a list of dictionaries (each row its own dictionary)
# Every row becomes a record, so a gene that occurs in several rows occurs
# several times in `genes` as well.
genes = df.to_dict(orient='records')
#Take just a subset to test
sub_genes = genes[0:5]

In [12]:
# Instantiate the prototype transformer. Per the output below, construction
# reports the gene database endpoint in use and downloads liftover chain files.
transformer = CBPTransformer()

***Using Gene Database Endpoint: http://localhost:8000***
Downloading hg19ToHg38.over.chain.gz...


100%|██████████| 227698/227698 [00:00<00:00, 763015.03it/s]


Downloading hg38ToHg19.over.chain.gz...


100%|██████████| 1246411/1246411 [00:00<00:00, 2041091.26it/s]


In [13]:
#Run fxn on subset
# Quick check on the five-record subset before the full run.
transformer._add_genes(sub_genes)

Processing genes: 100%|██████████| 5/5 [00:00<00:00, 21.65it/s]


[MappableConcept(id=None, extensions=None, conceptType='Gene', name='ETFDH', primaryCoding=None, mappings=[ConceptMapping(id=None, extensions=[Extension(id=None, extensions=None, name='vicc_normalizer_priority', value=True, description=None)], coding=Coding(id='hgnc:3483', extensions=None, name='ETFDH', system='https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/', systemVersion=None, code=code(root='HGNC:3483'), iris=None), relation='exactMatch'), ConceptMapping(id=None, extensions=[Extension(id=None, extensions=None, name='vicc_normalizer_priority', value=False, description=None)], coding=Coding(id='ncbigene:2110', extensions=None, name=None, system='https://www.ncbi.nlm.nih.gov/gene/', systemVersion=None, code=code(root='2110'), iris=None), relation='exactMatch')]),
 MappableConcept(id=None, extensions=None, conceptType='Gene', name='GJB4', primaryCoding=None, mappings=[ConceptMapping(id=None, extensions=[Extension(id=None, extensions=None, name='vicc_normalizer_priority', 

In [14]:
#Run fxn on whole dataset(~6 min)
# The returned list is not assigned, so it is only shown as the cell output
# (one MappableConcept per input record) and is not kept for later cells.
transformer._add_genes(genes)

Processing genes: 100%|██████████| 11480/11480 [04:47<00:00, 39.96it/s]


[MappableConcept(id=None, extensions=None, conceptType='Gene', name='ETFDH', primaryCoding=None, mappings=[ConceptMapping(id=None, extensions=[Extension(id=None, extensions=None, name='vicc_normalizer_priority', value=True, description=None)], coding=Coding(id='hgnc:3483', extensions=None, name='ETFDH', system='https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/', systemVersion=None, code=code(root='HGNC:3483'), iris=None), relation='exactMatch'), ConceptMapping(id=None, extensions=[Extension(id=None, extensions=None, name='vicc_normalizer_priority', value=False, description=None)], coding=Coding(id='ncbigene:2110', extensions=None, name=None, system='https://www.ncbi.nlm.nih.gov/gene/', systemVersion=None, code=code(root='2110'), iris=None), relation='exactMatch')]),
 MappableConcept(id=None, extensions=None, conceptType='Gene', name='GJB4', primaryCoding=None, mappings=[ConceptMapping(id=None, extensions=[Extension(id=None, extensions=None, name='vicc_normalizer_priority', 