# PUPPI Tutorial

This Jupyter notebook demonstrates how to use the `puppi` Python package to process a BioID/AP-MS dataset. We will:

1. Load an example intensity file
2. Run feature engineering
3. Train a PU-learning model and estimate FDR
4. Save the output

In [4]:
import pandas as pd
from puppi.features import feature_engineering
from puppi.train import train_and_score

## Load Example Data

In [2]:
# Read the tab-separated example intensity table; read_table defaults to sep="\t".
input_df = pd.read_table("input_intensity_dataset.tsv")
# Preview the first rows to confirm the file parsed as expected.
input_df.head()

Unnamed: 0,Protein,ACTB_1,ACTB_2,ACTB_3,CTNNA1_1,CTNNA1_2,CTNNA1_3,EGFP_1,EGFP_2,EGFP_3,...,KRT8_3,LMNA_1,LMNA_2,LMNA_3,MAPRE3_1,MAPRE3_2,MAPRE3_3,NminiTurbo_1,NminiTurbo_2,NminiTurbo_3
0,A2M,7867.27,7647.22,10553.3,11271.3,4931.14,3093.09,1952.07,18694.6,23878.8,...,10888.4,12060.4,15633.5,5512.17,3201.09,0.0,13319.4,999.026,4449.13,3227.09
1,A2ML1,0.0,0.0,7483.24,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5964.16,0.0,0.0,0.0,0.0,0.0,0.0
2,AAAS,14566.4,12137.4,74442.2,10259.3,13194.3,9284.29,16794.5,19413.5,23325.8,...,26056.7,32204.9,49027.6,29099.0,10037.3,8569.23,7981.22,6454.19,7324.22,8710.24
3,AADAC,0.0,0.0,0.0,18876.6,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,AAGAB,12210.5,7979.22,12883.4,0.0,0.0,0.0,14340.5,11251.4,21102.8,...,15733.6,0.0,2908.11,0.0,2140.07,2398.07,0.0,0.0,0.0,0.0


## Run Feature Engineering

You must specify the controls: a list of substrings that identify the control sample columns (for example, `"EGFP"` matches `EGFP_1`, `EGFP_2`, and `EGFP_3`).

In [3]:
# `controls` lists the substrings that identify control samples;
# e.g. "EGFP" and "NminiTurbo" appear in the EGFP_* and NminiTurbo_* columns of input_df.
features_df = feature_engineering(
    input_df,
    controls=["EGFP", "Empty", "NminiTurbo"],
)
# Preview the engineered per-(Bait, Prey) feature table.
features_df.head()

Unnamed: 0,Bait,Prey,log_fold_change,snr,mean_diff,median_diff,replicate_fold_change_sd,bait_cv,bait_control_sd_ratio,zero_or_neg_fc,composite_score,global_cv
0,ACTB,BSTAR,1.793066,2.728274,4120327.0,6174746.0,0.540772,0.156045,0.425733,1,22.687752,0.646998
1,ACTB,CCT8,2.188429,3.417044,3607451.0,4587256.9,0.186501,0.040917,0.139814,1,18.328789,1.217068
2,ACTB,FLNA,1.721993,2.552866,3226408.0,4348742.0,0.365425,0.110771,0.282785,1,16.853468,1.037275
3,ACTB,AHNAK,1.387616,2.046653,2558778.0,3848550.0,0.066565,0.025441,0.052069,1,14.152581,0.787742
4,ACTB,IGLC2;IGLC3;IGLC6;IGLC7,6.550428,1553.015899,157147.1,0.0,12424.127191,1.414214,4392.592294,1,13.684224,4.795832


## Train PU-learning Model and Estimate FDR

In [9]:
# Train the PU-learning model on the engineered features and score every row.
# NOTE(review): initial_positives / initial_negatives presumably set how many
# examples seed the positive and negative sets — confirm against the puppi docs.
final_df = train_and_score(
    features_df,
    initial_positives=10,
    initial_negatives=200,
)
# Preview the result, which adds predicted_probability, FDR and global_cv_flag columns.
final_df.head()

Unnamed: 0,Bait,Prey,log_fold_change,snr,mean_diff,median_diff,replicate_fold_change_sd,bait_cv,bait_control_sd_ratio,zero_or_neg_fc,composite_score,global_cv,predicted_probability,FDR,global_cv_flag
0,ACTB,BSTAR,1.793066,2.728274,4120327.0,6174746.0,0.540772,0.156045,0.425733,1,22.687752,0.646998,0.9823,0.0,likely background
1,ACTB,CCT8,2.188429,3.417044,3607451.0,4587256.9,0.186501,0.040917,0.139814,1,18.328789,1.217068,0.9452,0.0,
2,ACTB,FLNA,1.721993,2.552866,3226408.0,4348742.0,0.365425,0.110771,0.282785,1,16.853468,1.037275,0.971,0.0,likely background
3,ACTB,AHNAK,1.387616,2.046653,2558778.0,3848550.0,0.066565,0.025441,0.052069,1,14.152581,0.787742,0.8523,0.029542,likely background
4,ACTB,IGLC2;IGLC3;IGLC6;IGLC7,6.550428,1553.015899,157147.1,0.0,12424.127191,1.414214,4392.592294,1,13.684224,4.795832,0.974,0.0,


## Save Output

In [11]:
# Write the scored table to disk as CSV, omitting the DataFrame row index.
final_df.to_csv(
    path_or_buf="puppi_output.csv",
    index=False,
)