# whisper Tutorial

This Jupyter notebook demonstrates how to use the `whisper` Python package to process a BioID/AP-MS dataset. We will:

1. Load an example intensity file
2. Run feature engineering
3. Train a PU-learning model and estimate FDR
4. Save the output

In [1]:
from whisper.protein_features import feature_engineering_protein
from whisper.protein_train import train_and_score_protein
from whisper.peptide_features import feature_engineering_peptide
from whisper.peptide_train import train_and_score_peptide
import pandas as pd

## Load Example Data

In [2]:
intensity_protein_df = pd.read_csv("input_intensity_dataset.tsv", sep="\t")
intensity_protein_df

Unnamed: 0,Protein,KRT8_1,KRT8_2,KRT8_3,LMNA_1,LMNA_2,LMNA_3,ACTB_1,ACTB_2,ACTB_3,...,MAPRE3_3,EGFP_1,EGFP_2,EGFP_3,Empty_1,Empty_2,Empty_3,NminiTurbo_1,NminiTurbo_2,NminiTurbo_3
0,IGLV3-21;IGLV3-9,0.0,0.0,0.0,0.00,0.00,0.00,0.0,0.0,10349.4,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,IGKV3D-15,0.0,0.0,0.0,0.00,0.00,0.00,0.0,0.0,26850.7,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,IGKV2-28;IGKV2-29;IGKV2-30;IGKV2-40;IGKV2D-26;...,0.0,0.0,0.0,0.00,0.00,0.00,0.0,0.0,18848.6,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3,IGKV3-11;IGKV3D-11,0.0,0.0,0.0,0.00,0.00,0.00,0.0,0.0,37883.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,IGHV3-49,0.0,0.0,0.0,0.00,0.00,0.00,0.0,0.0,40300.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5315,FAM169A,0.0,0.0,0.0,18955.60,25511.70,7100.19,0.0,0.0,0.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
5316,ZHX2,0.0,0.0,0.0,6059.18,7605.23,2710.10,0.0,0.0,0.0,...,0.00,16249.50,3799.18,2444.10,0.00,0.00,0.00,0.00,0.00,0.00
5317,MORC2,0.0,0.0,0.0,4914.16,6995.20,8531.25,0.0,0.0,0.0,...,0.00,7233.22,0.00,9045.29,0.00,0.00,0.00,0.00,0.00,0.00
5318,IVNS1ABP,39238.4,34255.2,54790.0,15299.60,23808.70,10559.40,32396.1,27509.0,42473.5,...,9761.35,50919.90,48695.70,61052.20,4351.15,9788.32,9907.35,4901.16,7005.24,7117.24


In [3]:
intensity_peptide_df = pd.read_csv("input_peptide_intensity_dataset.tsv", sep="\t")
intensity_peptide_df

Unnamed: 0,Protein,Peptide,ACTB_1,ACTB_2,ACTB_3,CTNNA1_1,CTNNA1_2,CTNNA1_3,EGFP_1,EGFP_2,...,KRT8_3,LMNA_1,LMNA_2,LMNA_3,MAPRE3_1,MAPRE3_2,MAPRE3_3,NminiTurbo_1,NminiTurbo_2,NminiTurbo_3
0,SRP14,AAAAAAAAAPAAAATAPTTAATTAATAAQ2,33718.90,29606.80,38578.10,20077.10,17282.900,12282.70,28736.50,55521.90,...,50512.90,29880.70,58082.10,17924.00,11460.60,11043.60,11547.60,11086.60,0.00,15590.90
1,ZFP91,AAAAAAAAAVSR2,6278.16,4767.13,8670.24,0.00,983.026,0.00,49270.40,38619.10,...,4393.12,32877.80,46476.40,18112.50,2801.08,4297.12,2766.07,0.00,0.00,0.00
2,INCENP,AAAAAAAATMALAAPSSPTPESPTMLTK3,0.00,0.00,0.00,0.00,0.000,0.00,0.00,0.00,...,0.00,0.00,3297.13,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3,IRS2,AAAAAAAAVPSAGPAGPAPTSAAGR2,0.00,0.00,4032.20,0.00,0.000,0.00,3391.18,2389.12,...,4049.21,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,RPL4,AAAAAAALQAK2,14878.40,10907.30,16942.50,8955.24,9614.210,6739.16,12260.30,19150.50,...,18657.50,16595.40,16640.40,8790.21,8648.21,8801.23,7869.19,6058.15,9204.23,5082.12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57925,SAV1,YYYSDNFFDGQR2,0.00,0.00,0.00,0.00,0.000,0.00,0.00,0.00,...,6229.24,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
57926,PHKB,YYYVPADFVEYEK2,6105.28,4836.20,10839.40,0.00,1413.060,0.00,7602.31,9035.37,...,13629.60,6450.27,9957.41,5343.21,1959.07,1663.08,1704.07,0.00,0.00,0.00
57927,PEG10,YYYVQNVYTPVDEHVYPDHR3,6198.22,5906.19,8773.29,0.00,0.000,0.00,5269.20,5539.17,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
57928,PEG10,YYYVQNVYTPVDEHVYPDHR4,5226.13,5279.14,9285.23,0.00,0.000,0.00,3339.09,5081.14,...,2644.07,0.00,0.00,0.00,1282.04,0.00,0.00,0.00,0.00,0.00


## Run Feature Engineering

You must specify control (substrings identifying control samples).

In [4]:
controls = ['EGFP', 'Empty', 'NminiTurbo']

features_protein_df = feature_engineering_protein(intensity_protein_df, controls)
features_protein_df.head()

Unnamed: 0,Bait,Prey,log_fold_change,snr,mean_diff,median_diff,replicate_fold_change_sd,bait_cv,bait_control_sd_ratio,zero_or_neg_fc,nonzero_reps,reps_above_ctrl_med,single_rep_flag,composite_score,global_cv
0,ACTB,CCT8,2.18849,3.416967,3607493.0,4587256.9,0.186509,0.040917,0.139811,1,3,3,0,16.742191,1.217581
1,ACTB,FLNA,1.721757,2.55311,3226178.0,4348742.0,0.365365,0.110771,0.282812,1,3,3,0,15.403182,1.03863
2,ACTB,AHNAK,1.39588,2.040781,2567820.0,3867983.0,0.066947,0.025441,0.051919,1,3,3,0,13.012335,0.789551
3,ACTB,ACTA1,3.491085,46.438511,3079957.0,2908418.0,2.740437,0.243724,11.318196,1,3,3,0,12.909304,1.610158
4,ACTB,ACTC1,11.813446,1272.496414,100352.0,71529.6,3439.685503,0.95569,1216.112472,1,2,2,0,11.557553,3.511042


In [5]:
controls = ['EGFP', 'Empty', 'NminiTurbo']

features_peptide_df = feature_engineering_peptide(intensity_peptide_df, controls)
features_peptide_df.head()

Unnamed: 0,Bait,Protein,Peptide,log_fold_change,snr,mean_diff,median_diff,replicate_fold_change_sd,bait_cv,bait_control_sd_ratio,zero_or_neg_fc,nonzero_reps,reps_above_ctrl_med,single_rep_flag,composite_score,global_cv
0,ACTB,ACTA1,AGFAGDDAPR2,15.116768,12561.908648,300035.5,331530.0,24697.619371,0.695111,8731.927068,1,3,3,0,35.034082,3.286348
1,ACTB,CCT8,DIDEVSSLLR2,2.189997,3.419919,3610119.0,4587836.9,0.188185,0.041241,0.141041,1,3,3,0,26.40747,1.218533
2,ACTB,IGHA1,WLQGSQELPR2,7.856083,9489.830134,453324.9,0.0,75918.641074,1.414214,26841.292961,1,1,1,1,26.352388,4.795832
3,ACTB,FLNA,VEPGLGADNSVVR2,1.930744,2.947713,3356560.0,4351282.0,0.345548,0.090635,0.267167,1,3,3,0,24.78516,1.035805
4,ACTB,FLNA,LPQLPITNFSR2,1.790183,2.514213,2997475.0,3715828.4,0.633864,0.183273,0.460788,1,3,3,0,21.66364,1.16126


## Training Model and Estimate FDR

In [6]:
protein_scores_df = train_and_score_protein(features_protein_df, initial_positives=15, initial_negatives=200)

protein_scores_df

Unnamed: 0,Bait,Prey,log_fold_change,snr,mean_diff,median_diff,replicate_fold_change_sd,bait_cv,bait_control_sd_ratio,zero_or_neg_fc,nonzero_reps,reps_above_ctrl_med,single_rep_flag,composite_score,global_cv,predicted_probability,FDR,global_cv_flag
0,ACTB,A2M,3.489390,22.297028,3489.876889,0.0,178.376227,1.414214,63.065520,1,1,1,1,0.464994,4.795832,0.8729,0.137917,
1,ACTB,A2ML1,3.887981,38.745611,6084.943556,0.0,309.964887,1.414214,109.589137,1,1,1,1,0.636857,4.795832,0.8946,0.116604,
2,ACTB,AAAS,0.179147,2.476831,1655.612222,3542.1,0.122431,0.108135,0.267832,1,3,3,0,0.025320,0.609950,0.6936,0.383857,likely background
3,ACTB,AAGAB,1.095165,1.417723,5745.414444,12210.5,0.491274,0.229958,0.326017,1,3,3,0,0.120357,1.312424,0.8770,0.137917,
4,ACTB,AAK1,1.200044,1.572472,23371.033333,36988.5,0.758363,0.330088,0.519054,1,3,3,0,0.215805,1.082057,0.9695,0.015860,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26591,MAPRE3,ZWINT,-0.005494,0.703972,-40.136667,10633.3,0.067684,0.067942,0.047830,0,3,3,0,0.247697,0.951283,0.2157,1.000000,likely background
26592,MAPRE3,ZYX,-0.701176,0.439522,-26557.394444,42129.4,0.017680,0.028746,0.012634,0,3,3,0,0.183385,1.208568,0.0101,1.000000,
26593,MAPRE3,ZZEF1,-5.536547,0.000000,-4485.592222,0.0,0.000000,0.000000,0.000000,0,0,0,0,-0.343160,1.559795,0.0000,1.000000,
26594,MAPRE3,ZZZ3,-5.536547,0.000000,-5034.066667,0.0,0.000000,0.000000,0.000000,0,0,0,0,-0.345372,1.862704,0.0000,1.000000,


In [7]:
peptide_scores_df = train_and_score_peptide(features_peptide_df, initial_positives=15, initial_negatives=200)

peptide_scores_df

(          Bait Protein                       Peptide  log_fold_change  \
 0         ACTB       0               ASLTGTSSTASLTR2        -5.536547   
 1         ACTB       0                   GLPLYPDPSR2         1.229599   
 2         ACTB       0          HYSPEDEPSPEAQPIAAYK3         1.269933   
 3         ACTB       0  KNQFQAFTQPATDGLSEPDVFAIAPFR3         0.580749   
 4         ACTB       0           LGGAVPFAPPEVSPEQAK2         1.053603   
 ...        ...     ...                           ...              ...   
 289453  MAPRE3    ZZZ3            LQQMQAESGFVQHVGFK3        -5.536547   
 289454  MAPRE3    ZZZ3                 SQAVQDLESLGR2        -0.710070   
 289455  MAPRE3    ZZZ3               STVVDNDADFQGTK2        -5.536547   
 289456  MAPRE3    ZZZ3                    TPNLYIYSK2        -5.536547   
 289457  MAPRE3    ZZZ3        VVQLPEIVWDQYTHSLGNFER3        -5.536547   
 
              snr     mean_diff  median_diff  replicate_fold_change_sd  \
 0       0.000000   -871.251111     