In [23]:
import json
import pandas as pd
import requests
from tqdm import tqdm

In [24]:
files = ['brca.vr.json',
         'cgi.vr.json',
         'civic.vr.json',
         'jax.vr.json',
         'molecularmatch.vr.json',
         'oncokb.vr.json',
         'pmkb.vr.json']

In [25]:
def load_v1_data(data,source_name):
    extracted_data = []
    for item in data:
        try: # TODO: Handle KeyError [0] in phenotype_term for molecularmatch
            extracted_data.append({
                'genes': item.get('genes', None),
                'feature_names': item.get('feature_names', None),
                'drug_labels': item.get('association', {}).get('drug_labels', None),
                'variant_name': item.get('association', {}).get('variant_name', None),
                'phenotype_term': item.get('association', {}).get('phenotypes', {})[0].get('term',None), # TODO: Check for >1 entries
                'source': f'{source_name.split(".")[0]}'
            })
        except:
            pass
    df = pd.DataFrame(extracted_data)

    return(df)




In [26]:
v1_data = pd.DataFrame()

for file in tqdm(files):
    with open(f'v1_data/{file}','r') as f:
        data = []
        for line in f:
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError:
                pass
        tdf = load_v1_data(data,file)
        v1_data = pd.concat([v1_data,tdf]).reset_index(drop=True)

v1_data

100%|██████████| 7/7 [00:03<00:00,  1.89it/s]


Unnamed: 0,genes,feature_names,drug_labels,variant_name,phenotype_term,source
0,[BRCA1],?,,,breast cancer,brca
1,[BRCA1],E1038G,,,breast cancer,brca
2,[BRCA1],?,,,breast cancer,brca
3,[BRCA1],E1250K,,,breast cancer,brca
4,[BRCA1],?,,,breast cancer,brca
...,...,...,...,...,...,...
23472,[PATZ1],PATZ1 any mutation,,,acinar cell carcinoma,pmkb
23473,[ZNF331],ZNF331 any mutation,,,acinar cell carcinoma,pmkb
23474,[ZNF384],ZNF384 any mutation,,,acinar cell carcinoma,pmkb
23475,[ZNF521],ZNF521 any mutation,,,acinar cell carcinoma,pmkb


In [36]:
# Expand the rows with multiple genes into separate rows
expanded_rows = []
for index, row in v1_data.iterrows():
    for gene in row['genes']:
        new_row = row.copy()
        new_row['genes'] = gene
        expanded_rows.append(new_row)

# Create a new DataFrame from the expanded rows list
v1_data = pd.DataFrame(expanded_rows).reset_index(drop=True)
v1_data

Unnamed: 0,genes,feature_names,drug_labels,variant_name,phenotype_term,source,gene,therapy_concept_id,gene_concept_id
0,BRCA1,?,,,breast cancer,brca,BRCA1,No Match,normalize.gene.hgnc:1100
1,BRCA1,E1038G,,,breast cancer,brca,BRCA1,No Match,normalize.gene.hgnc:1100
2,BRCA1,?,,,breast cancer,brca,BRCA1,No Match,normalize.gene.hgnc:1100
3,BRCA1,E1250K,,,breast cancer,brca,BRCA1,No Match,normalize.gene.hgnc:1100
4,BRCA1,?,,,breast cancer,brca,BRCA1,No Match,normalize.gene.hgnc:1100
...,...,...,...,...,...,...,...,...,...
32275,PATZ1,PATZ1 any mutation,,,acinar cell carcinoma,pmkb,PATZ1,,
32276,ZNF331,ZNF331 any mutation,,,acinar cell carcinoma,pmkb,ZNF331,,
32277,ZNF384,ZNF384 any mutation,,,acinar cell carcinoma,pmkb,ZNF384,,
32278,ZNF521,ZNF521 any mutation,,,acinar cell carcinoma,pmkb,ZNF521,,


### Inspect

In [28]:
v1_data['genes'].value_counts()

genes
[BRCA2]       2803
[BRCA1]       2402
[KRAS]        1332
[VHL]         1109
[ERBB2]        983
              ... 
[SLFN11]         1
[ABCC10]         1
[PDGFA]          1
[PREX2]          1
[HIST1H1C]       1
Name: count, Length: 1172, dtype: int64

In [29]:
v1_data['phenotype_term'].value_counts()

phenotype_term
cancer                                                            6226
breast cancer                                                     5843
colorectal cancer                                                 1355
non-small cell lung carcinoma (disease)                           1346
lung cancer                                                       1219
                                                                  ... 
Lung adenocarcinoma;Cutaneous melanoma;Prostate adenocarcinoma       1
acoustic neuroma                                                     1
peritoneal mesothelioma (disease)                                    1
Cutaneous melanoma;Lung adenocarcinoma;Prostate adenocarcinoma       1
chronic eosinophilic leukemia                                        1
Name: count, Length: 428, dtype: int64

In [30]:
v1_data['variant_name'].value_counts()

variant_name
[]                      2238
[]                       760
[, ]                     574
MUTATION                 368
Truncating Mutations     240
                        ... 
S425C                      1
V277D                      1
V356R                      1
W406A                      1
S100F                      1
Name: count, Length: 5051, dtype: int64

In [8]:
v1_data['drug_labels'].value_counts()

drug_labels
NA                                 1527
CRIZOTINIB                          475
CETUXIMAB                           330
AFATINIB                            309
Imatinib                            293
                                   ... 
Futuximab,PANITUMUMAB,CETUXIMAB       1
497839-62-0,Erlotinib,GEFITINIB       1
ROCILETINIB,OSIMERTINIB               1
CWP232291                             1
BI 2536,PLX-4720                      1
Name: count, Length: 1548, dtype: int64

In [31]:
def normalize_gene(gene):
    url = f'https://normalize.cancervariants.org/gene/normalize?q={gene}'
    r = requests.get(url)
    if r.json()['match_type'] == 0:
        concept_id = 'No Match'
    else:
        concept_id = r.json()['gene']['id']
    return concept_id

def normalize_disease(disease):
    url = f'https://normalize.cancervariants.org/disease/normalize?q={disease}'
    r = requests.get(url)
    if r.json()['match_type'] == 0:
        concept_id = 'No Match'
    else:
        concept_id = r.json()['disease']['id']
    return concept_id

def normalize_therapy(therapy):
    url = f'https://normalize.cancervariants.org/therapy/normalize?q={therapy}'
    r = requests.get(url)
    if r.json()['match_type'] == 0:
        concept_id = 'No Match'
    else:
        concept_id = r.json()['therapy']['id']
    return concept_id

In [37]:
# Normalize Genes
v1_data['gene_concept_id'] = None
for idx, row in tqdm(v1_data.iterrows()):
    gene = v1_data.at[idx,'genes']
    v1_data.at[idx, 'gene_concept_id'] = normalize_gene(gene)

32280it [50:30, 10.65it/s]


In [38]:
# Normalize Disease
v1_data['disease_concept_id'] = None
for idx, row in tqdm(v1_data.iterrows()):
    disease = v1_data.at[idx,'phenotype_term']
    v1_data.at[idx, 'disease_concept_id'] = normalize_disease(disease)

32280it [47:56, 11.22it/s]


In [39]:
# Normalize Therapy
v1_data['therapy_concept_id'] = None
for idx, row in tqdm(v1_data.iterrows()):
    therapy = v1_data.at[idx,'drug_labels']
    v1_data.at[idx, 'therapy_concept_id'] = normalize_therapy(therapy)

32280it [52:56, 10.16it/s]


In [42]:
# v1_data.to_excel('v1_data_normalized.xlsx')
v1_data

Unnamed: 0,genes,feature_names,drug_labels,variant_name,phenotype_term,source,gene,therapy_concept_id,gene_concept_id,disease_concept_id
0,BRCA1,?,,,breast cancer,brca,BRCA1,No Match,normalize.gene.hgnc:1100,normalize.disease.ncit:C9335
1,BRCA1,E1038G,,,breast cancer,brca,BRCA1,No Match,normalize.gene.hgnc:1100,normalize.disease.ncit:C9335
2,BRCA1,?,,,breast cancer,brca,BRCA1,No Match,normalize.gene.hgnc:1100,normalize.disease.ncit:C9335
3,BRCA1,E1250K,,,breast cancer,brca,BRCA1,No Match,normalize.gene.hgnc:1100,normalize.disease.ncit:C9335
4,BRCA1,?,,,breast cancer,brca,BRCA1,No Match,normalize.gene.hgnc:1100,normalize.disease.ncit:C9335
...,...,...,...,...,...,...,...,...,...,...
32275,PATZ1,PATZ1 any mutation,,,acinar cell carcinoma,pmkb,PATZ1,normalize.therapy.rxcui:476250,normalize.gene.hgnc:13071,normalize.disease.ncit:C3768
32276,ZNF331,ZNF331 any mutation,,,acinar cell carcinoma,pmkb,ZNF331,normalize.therapy.rxcui:476250,normalize.gene.hgnc:15489,normalize.disease.ncit:C3768
32277,ZNF384,ZNF384 any mutation,,,acinar cell carcinoma,pmkb,ZNF384,normalize.therapy.rxcui:476250,normalize.gene.hgnc:11955,normalize.disease.ncit:C3768
32278,ZNF521,ZNF521 any mutation,,,acinar cell carcinoma,pmkb,ZNF521,normalize.therapy.rxcui:476250,normalize.gene.hgnc:24605,normalize.disease.ncit:C3768
