In [11]:
import os
import re

import pandas as pd

from bystro.api import auth
from bystro.proteomics.annotation_interface import (
    get_annotation_result_from_query,
    async_get_annotation_result_from_query,
    join_annotation_result_to_proteomic_dataset
)

from bystro.proteomics.fragpipe_tandem_mass_tag import (
    load_tandem_mass_tag_dataset,
)

from bystro.proteomics.somascan import SomascanDataset

In [2]:
# Read the credentials from the environment so that real secrets are never
# saved inside the notebook. The literal placeholders are kept as fallbacks,
# so the cell behaves exactly as before when the variables are not set.
bystro_email = os.environ.get('BYSTRO_EMAIL', 'email')
bystro_password = os.environ.get('BYSTRO_PASSWORD', 'password')

user = auth.login(bystro_email, bystro_password, 'https://bystro-dev.emory.edu')

# `index` is what the annotation query below receives as `index_name`;
# note that it starts with `job_id`.
job_id = '663ac4bfa0e17a1660ba8130'
index = '663ac4bfa0e17a1660ba8130_657a50d4b2d0278938ba791d'

Existing session found, logging out


In [4]:
# Example tandem-mass-tag inputs from the proteomics test directory; both
# paths are relative to the notebook's working directory.
experiment_file = '../proteomics/tests/example_experiment_annotation_file.tsv'
abundance_file = '../proteomics/tests/example_abundance_gene_MD.tsv'

tmt_dataset = load_tandem_mass_tag_dataset(
    abundance_file,
    experiment_file,
)

In [5]:
# Match every record ("*"), melt the samples and explode on refSeq.name2 so
# that the result can later be joined on the FragPipe gene_id.
annotation_fields = ["refSeq.name2", "refSeq.spID", "refSeq.name"]

query_result_df = get_annotation_result_from_query(
    bystro_api_auth=user,
    index_name=index,
    query_string="*",
    fields=annotation_fields,
    explode_field='refSeq.name2',
    melt_samples=True,
)

In [6]:
# Preview the first five rows of the melted annotation result.
query_result_df.head(n=5)

Unnamed: 0,chrom,pos,vcfPos,inputRef,alt,type,id,locus,sample,dosage,refSeq.name2,refSeq.spID,refSeq.name
0,chr17,2923767,2923767,A,G,SNP,.,chr17:2923767:A:G,1805,2,RAP1GAP2,,NM_001330058
1,chr17,2923767,2923767,A,G,SNP,.,chr17:2923767:A:G,1805,2,RAP1GAP2,Q684P5,NM_015085
2,chr17,2923767,2923767,A,G,SNP,.,chr17:2923767:A:G,1805,2,RAP1GAP2,Q684P5-2,NM_001100398
3,chr17,2923767,2923767,A,G,SNP,.,chr17:2923767:A:G,1847,2,RAP1GAP2,,NM_001330058
4,chr17,2923767,2923767,A,G,SNP,.,chr17:2923767:A:G,1847,2,RAP1GAP2,Q684P5,NM_015085


In [9]:
# Demo-only relabelling: the i-th distinct genomic sample id (in order of first
# appearance) is paired with the i-th TMT sample name so the join below has
# matching sample ids.
# NOTE(review): this pairing is purely positional, not a real sample
# correspondence -- confirm before reusing it on real data.
sample_ids = query_result_df["sample"].unique()
sample_names = list(tmt_dataset.annotation_df.index)[: len(sample_ids)]
replacements = dict(zip(sample_ids, sample_names))

# Build a relabelled copy instead of overwriting query_result_df["sample"]:
# the frame displayed earlier stays accurate, and re-running this cell (or
# running it after other cells) always starts from the same input.
tmt_query_df = query_result_df.assign(
    sample=query_result_df["sample"].replace(replacements)
)

joined_df = join_annotation_result_to_proteomic_dataset(
    tmt_query_df,
    tmt_dataset,
    proteomic_sample_id_column='sample',
    proteomic_join_column='gene_name',
)
joined_df

Unnamed: 0,chrom,pos,vcfPos,inputRef,alt,type,id,locus,dosage,refSeq.name2,refSeq.spID,refSeq.name,NumberPSM,ProteinID,MaxPepProb,ReferenceIntensity,normalized_sample_intensity
0,chr2,235402279,235402279,T,C,SNP,.,chr2:235402279:T:C,1,ARL4C,,NM_001282431,1,P56559,1.0,15.885759,
1,chr2,235402279,235402279,T,C,SNP,.,chr2:235402279:T:C,1,ARL4C,"[P56559, Q4A519]",NM_005737,1,P56559,1.0,15.885759,
2,chr2,235402279,235402279,T,C,SNP,.,chr2:235402279:T:C,1,ARL4C,,NM_001282431,1,P56559,1.0,15.885759,
3,chr2,235402279,235402279,T,C,SNP,.,chr2:235402279:T:C,1,ARL4C,"[P56559, Q4A519]",NM_005737,1,P56559,1.0,15.885759,
4,chr2,235402279,235402279,T,C,SNP,.,chr2:235402279:T:C,2,ARL4C,,NM_001282431,1,P56559,1.0,15.885759,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1161,chr17,58786625,58786625,A,G,SNP,.,chr17:58786625:A:G,2,BCAS3,,NM_001320470,65,Q9H6U6,1.0,25.062575,0.069469
1162,chr17,58786625,58786625,A,G,SNP,.,chr17:58786625:A:G,2,BCAS3,"[Q9H6U6, Q9H6U6-3, Q9H6U6-8]",NM_001099432,65,Q9H6U6,1.0,25.062575,0.069469
1163,chr17,58786625,58786625,A,G,SNP,.,chr17:58786625:A:G,2,BCAS3,"[Q70WD9, Q9H6U6-2, Q9H6U6-6]",NM_017679,65,Q9H6U6,1.0,25.062575,0.069469
1164,chr17,58786625,58786625,A,G,SNP,.,chr17:58786625:A:G,2,BCAS3,,NM_001330413,65,Q9H6U6,1.0,25.062575,0.069469


In [10]:
# Show the working directory that the relative '../proteomics/tests/...' paths
# are resolved against. An explicit call is used instead of the bare `pwd`
# automagic, which only works when IPython automagic is enabled and no
# variable named `pwd` exists.
os.getcwd()

'/home/ubuntu/bystro/python/python/bystro/examples'

In [None]:
# Example .adat file from the proteomics test directory (path is relative to
# the notebook's working directory).
adat_file = '../proteomics/tests/example_data_v4.1_plasma.adat'

somascan_dataset = SomascanDataset.from_paths(
    adat_file,
)


In [25]:
# Demo-only relabelling: the i-th distinct sample id of the query result (in
# order of first appearance) is paired with the i-th SomaScan 'SampleId'.
# NOTE(review): this pairing is purely positional, not a real sample
# correspondence -- confirm before reusing it on real data.
sample_ids = query_result_df["sample"].unique()
somascan_sample_ids = somascan_dataset.adat.index.to_frame()['SampleId'].values
sample_names = list(somascan_sample_ids)[: len(sample_ids)]
replacements = dict(zip(sample_ids, sample_names))

# Relabel a copy rather than overwriting query_result_df["sample"], so this
# cell does not change what other cells see.
somascan_query_df = query_result_df.assign(
    sample=query_result_df["sample"].replace(replacements)
)

# NOTE(review): the original cell displayed a variable `x` that no cell
# defines, which fails on a fresh kernel. The recorded output looks like the
# annotation result joined to the SomaScan data, so the join is reconstructed
# here using the function's default join columns -- confirm these arguments
# against join_annotation_result_to_proteomic_dataset.
somascan_joined_df = join_annotation_result_to_proteomic_dataset(
    somascan_query_df,
    somascan_dataset,
)
somascan_joined_df

Unnamed: 0,chrom,pos,vcfPos,inputRef,alt,type,id,locus,sample,dosage,...,ColCheck,CalQcRatio_SS-000005_Set001_200170,QcReference_200170,Cal_SS-000005_Set002,CalQcRatio_SS-000005_Set002_200170,Cal_SS-000005_Set003,CalQcRatio_SS-000005_Set003_200170,Cal_SS-000005_Set004,CalQcRatio_SS-000005_Set004_200170,RFU
0,chr2,197122493,197122493,T,G,SNP,.,chr2:197122493:T:G,119,1,...,PASS,1.049,773.8,0.99890683,1.004,0.98794079,1.002,0.97440735,1.000,1951.8
1,chr2,197122493,197122493,T,G,SNP,.,chr2:197122493:T:G,119,1,...,PASS,1.049,773.8,0.99890683,1.004,0.98794079,1.002,0.97440735,1.000,1951.8
2,chr2,197122493,197122493,T,G,SNP,.,chr2:197122493:T:G,119,1,...,PASS,1.049,773.8,0.99890683,1.004,0.98794079,1.002,0.97440735,1.000,1951.8
3,chr2,197122493,197122493,T,G,SNP,.,chr2:197122493:T:G,66,1,...,PASS,1.049,773.8,0.99890683,1.004,0.98794079,1.002,0.97440735,1.000,985.5
4,chr2,197122493,197122493,T,G,SNP,.,chr2:197122493:T:G,66,1,...,PASS,1.049,773.8,0.99890683,1.004,0.98794079,1.002,0.97440735,1.000,985.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11031,chr2,242613089,242613089,A,G,SNP,.,chr2:242613089:A:G,119,1,...,PASS,0.979,2530.0,0.98466899,0.968,0.99843537,0.994,0.96636623,1.016,2135.8
11032,chr2,242613089,242613089,A,G,SNP,.,chr2:242613089:A:G,66,1,...,PASS,0.979,2530.0,0.98466899,0.968,0.99843537,0.994,0.96636623,1.016,4218.8
11033,chr2,242613089,242613089,A,G,SNP,.,chr2:242613089:A:G,66,1,...,PASS,0.979,2530.0,0.98466899,0.968,0.99843537,0.994,0.96636623,1.016,4218.8
11034,chr2,242613089,242613089,A,G,SNP,.,chr2:242613089:A:G,23,2,...,PASS,0.979,2530.0,0.98466899,0.968,0.99843537,0.994,0.96636623,1.016,1949.5


In [20]:
# Turn the ADAT column index into a frame and show its 'Target' values.
adat_column_metadata = somascan_dataset.adat.columns.to_frame()
adat_column_metadata['Target'].values

<ArrowStringArrayNumpySemantics>
[         'CRBB2',          'c-Raf',          'ZNF41',           'ELK1',
          'GUC1A',          'BECN1',           'OCRL',          'SPDEF',
       'Fc_MOUSE',           'SLUG',
 ...
          'YIPF6', 'Neuropeptide W',       'LRC25:CD',          'LRC24',
 'EMIL3:region 2',          'ZN264',          'ATP4B',            'DUT',
     'UBXN4:CD 1',           'IRF6']
Length: 7596, dtype: string

In [24]:
# Turn the ADAT row index into a frame and show its 'SampleId' values.
adat_row_metadata = somascan_dataset.adat.index.to_frame()
adat_row_metadata['SampleId'].values

<ArrowStringArrayNumpySemantics>
[    '23',    '119',     '66',     '46', '200169',     '51',    '174',
    '177',    '147',     '86',
 ...
    '109', '200170', '200169',     '94',    '157',     '71', '200169',
     '20',     '88', '190063']
Length: 207, dtype: string