In [1]:
import getpass
import os
import re

import pandas as pd

from bystro.api import auth
from bystro.proteomics.annotation_interface import (
    get_annotation_result_from_query,
    async_get_annotation_result_from_query,
    join_annotation_result_to_fragpipe_dataset,
    explode_rows_with_list
)

from bystro.proteomics.fragpipe_tandem_mass_tag import (
    load_tandem_mass_tag_dataset,
)

In [None]:
# Log in to the Bystro dev server.
# Credentials are never written into the notebook: they are read from the
# BYSTRO_EMAIL / BYSTRO_PASSWORD environment variables when set, otherwise the
# user is prompted (getpass does not echo the password as it is typed).
user = auth.login(
    os.environ.get("BYSTRO_EMAIL") or input("Bystro email: "),
    os.environ.get("BYSTRO_PASSWORD") or getpass.getpass("Bystro password: "),
    'https://bystro-dev.emory.edu',
)

# Identifier of the annotation job, and the name of the search index that the
# queries below are run against.
job_id = '663ac4bfa0e17a1660ba8130'
index = '663ac4bfa0e17a1660ba8130_657a50d4b2d0278938ba791d'

In [3]:
# Run a match-all query ("*") against the index with samples melted into
# `sample`/`dosage` columns, and explode on refSeq.name2 so that every row carries
# a single gene symbol that can be joined to FragPipe's gene_name column.
query_result_df = get_annotation_result_from_query(
    bystro_api_auth=user,
    index_name=index,
    query_string="*",
    fields=[
        "refSeq.name2",
        "refSeq.spID",
        "refSeq.name",
        "refSeq.ensemblID",
        "gnomad.genomes.AF",
        "gnomad.genomes.AF_nfe",
    ],
    explode_field="refSeq.name2",
    melt_samples=True,
)
query_result_df

Unnamed: 0,chrom,pos,vcfPos,inputRef,alt,type,id,locus,sample,dosage,refSeq.name2,refSeq.spID,refSeq.name,refSeq.ensemblID,gnomad.genomes.AF,gnomad.genomes.AF_nfe
0,chr17,2923767,2923767,A,G,SNP,.,chr17:2923767:A:G,1805,2,RAP1GAP2,,NM_001330058,,0.922388,0.951887
1,chr17,2923767,2923767,A,G,SNP,.,chr17:2923767:A:G,1805,2,RAP1GAP2,Q684P5,NM_015085,ENST00000254695,0.922388,0.951887
2,chr17,2923767,2923767,A,G,SNP,.,chr17:2923767:A:G,1805,2,RAP1GAP2,Q684P5-2,NM_001100398,ENST00000366401,0.922388,0.951887
3,chr17,2923767,2923767,A,G,SNP,.,chr17:2923767:A:G,1847,2,RAP1GAP2,,NM_001330058,,0.922388,0.951887
4,chr17,2923767,2923767,A,G,SNP,.,chr17:2923767:A:G,1847,2,RAP1GAP2,Q684P5,NM_015085,ENST00000254695,0.922388,0.951887
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49820,chr2,242686529,242686529,A,T,SNP,.,chr2:242686529:A:T,1805,1,D2HGDH,"[B5MCV2, Q8N465]",NM_152783,"[ENST00000321264, ENST00000473126]",0.531943,0.444423
49821,chr2,242688366,242688366,C,T,SNP,.,chr2:242688366:C:T,1805,1,D2HGDH,,NM_001352824,,0.523598,0.445675
49822,chr2,242688366,242688366,C,T,SNP,.,chr2:242688366:C:T,1805,1,D2HGDH,,NM_001287249,,0.523598,0.445675
49823,chr2,242688366,242688366,C,T,SNP,.,chr2:242688366:C:T,1805,1,D2HGDH,,NR_109778,,0.523598,0.445675


In [4]:
# Same query and fields as above, but with no `explode_field`: the refSeq columns
# are left as list values (one entry per transcript) instead of one row each.
query_result_df_no_explode_by = get_annotation_result_from_query(
    bystro_api_auth=user,
    index_name=index,
    query_string="*",
    fields=[
        "refSeq.name2",
        "refSeq.spID",
        "refSeq.name",
        "refSeq.ensemblID",
        "gnomad.genomes.AF",
        "gnomad.genomes.AF_nfe",
    ],
    melt_samples=True,
)
query_result_df_no_explode_by

Unnamed: 0,chrom,pos,vcfPos,inputRef,alt,type,id,locus,sample,dosage,refSeq.name2,refSeq.spID,refSeq.name,refSeq.ensemblID,gnomad.genomes.AF,gnomad.genomes.AF_nfe
0,chr17,2923767,2923767,A,G,SNP,.,chr17:2923767:A:G,1805,2,"[RAP1GAP2, RAP1GAP2, RAP1GAP2]","[None, Q684P5, Q684P5-2]","[NM_001330058, NM_015085, NM_001100398]","[None, ENST00000254695, ENST00000366401]",0.922388,0.951887
1,chr17,2923767,2923767,A,G,SNP,.,chr17:2923767:A:G,1847,2,"[RAP1GAP2, RAP1GAP2, RAP1GAP2]","[None, Q684P5, Q684P5-2]","[NM_001330058, NM_015085, NM_001100398]","[None, ENST00000254695, ENST00000366401]",0.922388,0.951887
2,chr17,2923767,2923767,A,G,SNP,.,chr17:2923767:A:G,4805,2,"[RAP1GAP2, RAP1GAP2, RAP1GAP2]","[None, Q684P5, Q684P5-2]","[NM_001330058, NM_015085, NM_001100398]","[None, ENST00000254695, ENST00000366401]",0.922388,0.951887
3,chr17,2929286,2929286,A,C,SNP,.,chr17:2929286:A:C,1805,2,"[RAP1GAP2, RAP1GAP2, RAP1GAP2]","[None, Q684P5, Q684P5-2]","[NM_001330058, NM_015085, NM_001100398]","[None, ENST00000254695, ENST00000366401]",0.695134,0.690156
4,chr17,2929286,2929286,A,C,SNP,.,chr17:2929286:A:C,1847,2,"[RAP1GAP2, RAP1GAP2, RAP1GAP2]","[None, Q684P5, Q684P5-2]","[NM_001330058, NM_015085, NM_001100398]","[None, ENST00000254695, ENST00000366401]",0.695134,0.690156
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15784,chr2,242682029,242682029,T,C,SNP,.,chr2:242682029:T:C,1805,1,"[D2HGDH, D2HGDH, D2HGDH, D2HGDH]","[None, None, None, [B5MCV2, Q8N465]]","[NM_001352824, NM_001287249, NR_109778, NM_152...","[None, None, None, [ENST00000321264, ENST00000...",0.275657,0.264889
15785,chr2,242684115,242684115,T,C,SNP,.,chr2:242684115:T:C,1805,1,"[D2HGDH, D2HGDH, D2HGDH, D2HGDH]","[None, None, None, [B5MCV2, Q8N465]]","[NM_001352824, NM_001287249, NR_109778, NM_152...","[None, None, None, [ENST00000321264, ENST00000...",0.542776,0.445781
15786,chr2,242684159,242684159,C,A,SNP,.,chr2:242684159:C:A,1805,1,"[D2HGDH, D2HGDH, D2HGDH, D2HGDH]","[None, None, None, [B5MCV2, Q8N465]]","[NM_001352824, NM_001287249, NR_109778, NM_152...","[None, None, None, [ENST00000321264, ENST00000...",0.003312,0.005186
15787,chr2,242686529,242686529,A,T,SNP,.,chr2:242686529:A:T,1805,1,"[D2HGDH, D2HGDH, D2HGDH, D2HGDH]","[None, None, None, [B5MCV2, Q8N465]]","[NM_001352824, NM_001287249, NR_109778, NM_152...","[None, None, None, [ENST00000321264, ENST00000...",0.531943,0.444423


In [5]:
# Explode on refSeq.spID so the result can be joined to FragPipe's ProteinID column.
# `force_flatten_exploded_field` defaults to True, so every row ends up with a single
# refSeq.spID value. This holds even when one refSeq transcript (refSeq.name, the
# primary key of the refSeq track) maps to two refSeq.spID values, which happens when
# multiple kgXref entries exist for the same refSeq.name.
query_result_df_by_spid = get_annotation_result_from_query(
    bystro_api_auth=user,
    index_name=index,
    query_string="*",
    fields=[
        "refSeq.name2",
        "refSeq.spID",
        "refSeq.name",
        "refSeq.ensemblID",
        "gnomad.genomes.AF",
        "gnomad.genomes.AF_nfe",
    ],
    explode_field="refSeq.spID",
    melt_samples=True,
)
query_result_df_by_spid

Unnamed: 0,chrom,pos,vcfPos,inputRef,alt,type,id,locus,sample,dosage,refSeq.name2,refSeq.spID,refSeq.name,refSeq.ensemblID,gnomad.genomes.AF,gnomad.genomes.AF_nfe
0,chr17,2923767,2923767,A,G,SNP,.,chr17:2923767:A:G,1805,2,RAP1GAP2,,NM_001330058,,0.922388,0.951887
1,chr17,2923767,2923767,A,G,SNP,.,chr17:2923767:A:G,1805,2,RAP1GAP2,Q684P5,NM_015085,ENST00000254695,0.922388,0.951887
2,chr17,2923767,2923767,A,G,SNP,.,chr17:2923767:A:G,1805,2,RAP1GAP2,Q684P5-2,NM_001100398,ENST00000366401,0.922388,0.951887
3,chr17,2923767,2923767,A,G,SNP,.,chr17:2923767:A:G,1847,2,RAP1GAP2,,NM_001330058,,0.922388,0.951887
4,chr17,2923767,2923767,A,G,SNP,.,chr17:2923767:A:G,1847,2,RAP1GAP2,Q684P5,NM_015085,ENST00000254695,0.922388,0.951887
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57807,chr2,242688366,242688366,C,T,SNP,.,chr2:242688366:C:T,1805,1,D2HGDH,,NM_001352824,,0.523598,0.445675
57808,chr2,242688366,242688366,C,T,SNP,.,chr2:242688366:C:T,1805,1,D2HGDH,,NM_001287249,,0.523598,0.445675
57809,chr2,242688366,242688366,C,T,SNP,.,chr2:242688366:C:T,1805,1,D2HGDH,,NR_109778,,0.523598,0.445675
57810,chr2,242688366,242688366,C,T,SNP,.,chr2:242688366:C:T,1805,1,D2HGDH,B5MCV2,NM_152783,"[ENST00000321264, ENST00000473126]",0.523598,0.445675


In [6]:
# Explode on refSeq.spID again, this time with `force_flatten_exploded_field=False`.
# When one refSeq transcript (identified by refSeq.name) corresponds to multiple
# refSeq.spID values, that transcript stays on a single row whose refSeq.spID value
# is a list. Such rows need to be flattened further before they can be joined to
# FragPipe ProteinID values.
query_result_df_by_spid_no_force_flatten = get_annotation_result_from_query(
    bystro_api_auth=user,
    index_name=index,
    query_string="*",
    fields=[
        "refSeq.name2",
        "refSeq.spID",
        "refSeq.name",
        "refSeq.ensemblID",
        "gnomad.genomes.AF",
        "gnomad.genomes.AF_nfe",
    ],
    explode_field="refSeq.spID",
    force_flatten_exploded_field=False,
    melt_samples=True,
)
query_result_df_by_spid_no_force_flatten

Unnamed: 0,chrom,pos,vcfPos,inputRef,alt,type,id,locus,sample,dosage,refSeq.name2,refSeq.spID,refSeq.name,refSeq.ensemblID,gnomad.genomes.AF,gnomad.genomes.AF_nfe
0,chr17,2923767,2923767,A,G,SNP,.,chr17:2923767:A:G,1805,2,RAP1GAP2,,NM_001330058,,0.922388,0.951887
1,chr17,2923767,2923767,A,G,SNP,.,chr17:2923767:A:G,1805,2,RAP1GAP2,Q684P5,NM_015085,ENST00000254695,0.922388,0.951887
2,chr17,2923767,2923767,A,G,SNP,.,chr17:2923767:A:G,1805,2,RAP1GAP2,Q684P5-2,NM_001100398,ENST00000366401,0.922388,0.951887
3,chr17,2923767,2923767,A,G,SNP,.,chr17:2923767:A:G,1847,2,RAP1GAP2,,NM_001330058,,0.922388,0.951887
4,chr17,2923767,2923767,A,G,SNP,.,chr17:2923767:A:G,1847,2,RAP1GAP2,Q684P5,NM_015085,ENST00000254695,0.922388,0.951887
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49820,chr2,242686529,242686529,A,T,SNP,.,chr2:242686529:A:T,1805,1,D2HGDH,"[B5MCV2, Q8N465]",NM_152783,"[ENST00000321264, ENST00000473126]",0.531943,0.444423
49821,chr2,242688366,242688366,C,T,SNP,.,chr2:242688366:C:T,1805,1,D2HGDH,,NM_001352824,,0.523598,0.445675
49822,chr2,242688366,242688366,C,T,SNP,.,chr2:242688366:C:T,1805,1,D2HGDH,,NM_001287249,,0.523598,0.445675
49823,chr2,242688366,242688366,C,T,SNP,.,chr2:242688366:C:T,1805,1,D2HGDH,,NR_109778,,0.523598,0.445675


In [7]:
# The list-valued refSeq.spID cells left over from the previous query can also be
# exploded by hand with `explode_rows_with_list`.
df_flattened = explode_rows_with_list(
    query_result_df_by_spid_no_force_flatten,
    "refSeq.spID",
)
df_flattened

Unnamed: 0,chrom,pos,vcfPos,inputRef,alt,type,id,locus,sample,dosage,refSeq.name2,refSeq.spID,refSeq.name,refSeq.ensemblID,gnomad.genomes.AF,gnomad.genomes.AF_nfe
0,chr17,2923767,2923767,A,G,SNP,.,chr17:2923767:A:G,1805,2,RAP1GAP2,,NM_001330058,,0.922388,0.951887
1,chr17,2923767,2923767,A,G,SNP,.,chr17:2923767:A:G,1805,2,RAP1GAP2,Q684P5,NM_015085,ENST00000254695,0.922388,0.951887
2,chr17,2923767,2923767,A,G,SNP,.,chr17:2923767:A:G,1805,2,RAP1GAP2,Q684P5-2,NM_001100398,ENST00000366401,0.922388,0.951887
3,chr17,2923767,2923767,A,G,SNP,.,chr17:2923767:A:G,1847,2,RAP1GAP2,,NM_001330058,,0.922388,0.951887
4,chr17,2923767,2923767,A,G,SNP,.,chr17:2923767:A:G,1847,2,RAP1GAP2,Q684P5,NM_015085,ENST00000254695,0.922388,0.951887
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49821,chr2,242688366,242688366,C,T,SNP,.,chr2:242688366:C:T,1805,1,D2HGDH,,NM_001352824,,0.523598,0.445675
49822,chr2,242688366,242688366,C,T,SNP,.,chr2:242688366:C:T,1805,1,D2HGDH,,NM_001287249,,0.523598,0.445675
49823,chr2,242688366,242688366,C,T,SNP,.,chr2:242688366:C:T,1805,1,D2HGDH,,NR_109778,,0.523598,0.445675
49824,chr2,242688366,242688366,C,T,SNP,.,chr2:242688366:C:T,1805,1,D2HGDH,B5MCV2,NM_152783,"[ENST00000321264, ENST00000473126]",0.523598,0.445675


In [8]:
# Example FragPipe tandem-mass-tag inputs (relative paths into the proteomics tests).
abundance_file = "../proteomics/tests/example_abundance_gene_MD.tsv"
experiment_file = "../proteomics/tests/example_experiment_annotation_file.tsv"

tmt_dataset = load_tandem_mass_tag_dataset(abundance_file, experiment_file)

# Render the abundance table explicitly; the annotation table is shown as the
# cell's last expression.
display(tmt_dataset.abundance_df)
tmt_dataset.annotation_df

Unnamed: 0,gene_name,NumberPSM,ProteinID,MaxPepProb,ReferenceIntensity,CPT0088900003,CPT0079270003,CPT0088920001,CPT0079300001,CPT0088550004,...,CPT0007860003,CPT0079380003,CPT0015810003,CPT0086030003,CPT0085670003,CPT0025230003,CPT0065750003,CPT0015730003,CPT0078800003,CPT0079230003
0,A1BG,324,P04217,1.0,30.044336,0.665648,0.156983,-0.040208,0.647754,-0.948779,...,0.800228,-0.034147,-0.056787,0.721216,0.282223,0.107007,0.947105,-0.203403,-0.612417,0.528959
1,A1CF,94,Q9NQ94,1.0,26.115335,0.091328,0.228538,0.641523,-0.936936,-1.625079,...,0.443220,-0.679189,-0.447859,-0.529476,0.173284,-1.745636,-0.209171,-0.472092,-0.114567,0.435046
2,A2M,1418,P01023,1.0,30.802090,0.956687,-0.231766,0.095571,0.859875,-0.668738,...,0.559026,-0.546317,0.272841,0.459240,0.148868,0.224759,0.530004,0.657324,-1.356194,1.095773
3,AAAS,50,Q9NRG9,1.0,24.422673,-0.005041,0.081379,-0.132121,0.102784,-0.097407,...,0.528265,-0.056627,-0.020744,-0.148636,0.241928,0.105649,-0.047757,-0.022935,0.072918,-0.017612
4,AACS,65,Q86V21,1.0,27.520496,-0.885569,-0.604937,0.115300,0.116263,-0.682289,...,-0.414655,0.244193,0.196228,-0.017570,-1.114966,-0.112935,0.371002,0.215173,0.740686,-0.320558
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,BIN1,183,O00499,1.0,28.519829,-0.208624,-0.740367,0.667512,-0.372923,-0.404923,...,-0.449444,0.134355,-0.859130,-0.449058,-0.722286,-0.758876,-0.732240,-0.698885,0.578534,-0.692164
995,BIN2,81,Q9UBW5,1.0,25.936171,0.013176,-0.065219,-0.637294,-0.568995,2.294478,...,0.563133,1.187669,0.001840,0.416629,0.010067,0.173171,0.076997,0.470754,0.147366,0.127343
996,BIN3,51,Q9NQY0,1.0,25.290069,0.638213,1.206009,-0.099914,0.187577,-0.733904,...,0.428650,0.285849,0.062304,0.249939,-0.107869,0.961484,0.217960,0.434468,-0.494660,0.044322
997,BIRC2,36,Q13490,1.0,24.084678,0.161076,0.427054,0.038171,-0.089428,0.196889,...,0.374340,0.030547,0.210947,0.005615,0.296264,0.049694,0.167639,0.319608,0.178903,0.154747


Unnamed: 0_level_0,plex,channel,sample_name,condition,replicate
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CPT0088900003,16,126,C3N-01179-T,Tumor,1
CPT0079270003,16,127N,C3L-00606-T,Tumor,1
CPT0088920001,16,127C,C3N-01179-N,NAT,1
CPT0079300001,16,128N,C3L-00606-N,NAT,1
CPT0088550004,16,128C,C3N-01648-T,Tumor,1
QC6,16,129N,QC6,QC,1
CPT0014450004,16,129C,C3N-00242-T,Tumor,1
CPT0088570001,16,130N,C3N-01648-N,NAT,1
CPT0014470001,16,130C,C3N-00242-N,NAT,1
pool16,16,131N,pool16,pool16,1


In [9]:
# The genetic example data and the TMT example data use different sample names.
# To simulate a genetic dataset whose samples are shared with the TMT dataset, map
# each distinct genetic sample ID (3 in this example) onto the first TMT sample
# names, pairing them in order.
sample_ids = query_result_df_by_spid["sample"].unique()
sample_names = list(tmt_dataset.annotation_df.index)[: len(sample_ids)]
replacements = dict(zip(sample_ids, sample_names))

# NOTE: this overwrites the "sample" column of query_result_df_by_spid in place.
query_result_df_by_spid["sample"] = query_result_df_by_spid["sample"].replace(replacements)

# The join always uses the sample columns of the genetic and TMT datasets, plus one
# additional column from each: by default "refSeq.name2" in the genetic dataset and
# "gene_name" in the TMT dataset. Here we join on refSeq.spID / ProteinID instead.
joined_df = join_annotation_result_to_fragpipe_dataset(
    query_result_df_by_spid,
    tmt_dataset,
    genetic_join_column="refSeq.spID",
    fragpipe_join_column="ProteinID",
)
joined_df

Unnamed: 0,chrom,pos,vcfPos,inputRef,alt,type,id,locus,sample,dosage,...,refSeq.spID,refSeq.name,refSeq.ensemblID,gnomad.genomes.AF,gnomad.genomes.AF_nfe,gene_name,NumberPSM,MaxPepProb,ReferenceIntensity,normalized_sample_intensity
0,chr17,4620497,4620497,C,T,SNP,.,chr17:4620497:C:T,CPT0088900003,2,...,P32121,NM_001257329,ENST00000269260,0.914080,0.942333,ARRB2,57,1.0,25.443213,0.169927
1,chr17,4620497,4620497,C,T,SNP,.,chr17:4620497:C:T,CPT0088900003,2,...,P32121,NM_004313,ENST00000269260,0.914080,0.942333,ARRB2,57,1.0,25.443213,0.169927
2,chr17,4620928,4620928,T,C,SNP,.,chr17:4620928:T:C,CPT0088900003,2,...,P32121,NM_001257329,ENST00000269260,0.159387,0.236445,ARRB2,57,1.0,25.443213,0.169927
3,chr17,4620928,4620928,T,C,SNP,.,chr17:4620928:T:C,CPT0088900003,2,...,P32121,NM_004313,ENST00000269260,0.159387,0.236445,ARRB2,57,1.0,25.443213,0.169927
4,chr17,4622551,4622551,A,G,SNP,.,chr17:4622551:A:G,CPT0088900003,2,...,P32121,NM_001257329,ENST00000269260,0.165816,0.226039,ARRB2,57,1.0,25.443213,0.169927
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2591,chr17,10614869,10614869,G,T,SNP,.,chr17:10614869:G:T,CPT0088900003,1,...,Q3LIE5,NM_020233,"[ENST00000379774, ENST00000609540]",0.490546,0.442932,ADPRM,1,1.0,18.474061,
2592,chr2,70906115,70906115,C,A,SNP,.,chr2:70906115:C:A,CPT0079270003,2,...,P35612,NM_001617,"[ENST00000264436, ENST00000403045]",0.320501,0.274110,ADD2,72,1.0,25.452790,-1.297841
2593,chr2,70906115,70906115,C,A,SNP,.,chr2:70906115:C:A,CPT0079270003,2,...,P35612,NM_001185054,ENST00000407644,0.320501,0.274110,ADD2,72,1.0,25.452790,-1.297841
2594,chr2,70915134,70915134,C,G,SNP,.,chr2:70915134:C:G,CPT0079270003,2,...,P35612,NM_001617,"[ENST00000264436, ENST00000403045]",0.290935,0.236910,ADD2,72,1.0,25.452790,-1.297841
