# PUPPI Tutorial

This Jupyter notebook demonstrates how to use the `puppi` Python package to process a BioID/AP-MS dataset. We will:

1. Load an example intensity file
2. Run feature engineering
3. Train a PU-learning model and estimate FDR
4. Save the output

In [1]:
import pandas as pd
from puppi.features import feature_engineering
from puppi.train import train_and_score

## Load Example Data

In [2]:
# Read the tab-separated intensity table; the preview below shows a
# "Protein" column followed by one intensity column per sample replicate.
input_df = pd.read_csv(
    "input_intensity_dataset.tsv",
    sep="\t",
)

# Show the first ten rows as a quick sanity check of the layout.
input_df.head(n=10)

Unnamed: 0,Protein,KRT8_1,KRT8_2,KRT8_3,LMNA_1,LMNA_2,LMNA_3,ACTB_1,ACTB_2,ACTB_3,...,MAPRE3_3,EGFP_1,EGFP_2,EGFP_3,Empty_1,Empty_2,Empty_3,NminiTurbo_1,NminiTurbo_2,NminiTurbo_3
0,IGLV3-21;IGLV3-9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10349.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,IGKV3D-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26850.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,IGKV2-28;IGKV2-29;IGKV2-30;IGKV2-40;IGKV2D-26;...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18848.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,IGKV3-11;IGKV3D-11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37883.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,IGHV3-49,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40300.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,IGKV3D-20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99823.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,IFT56,2801.1,0.0,3743.14,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,E2F8,0.0,0.0,0.0,2235.07,3705.11,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,UBA6,16811.5,20947.6,32968.9,12222.3,17614.5,7483.19,15850.4,15761.4,21677.5,...,10002.3,21427.6,22669.6,28582.7,9445.24,15648.4,7383.2,7051.19,4315.11,7106.19
9,ESYT2,23014.7,23328.7,27303.7,8006.24,11602.4,4955.15,9810.29,9916.31,15753.4,...,3747.11,19154.5,18357.5,19583.7,0.0,1474.04,1478.04,0.0,0.0,0.0


## Run Feature Engineering

You must specify the `controls` argument: a list of substrings that identify the control-sample columns (here `EGFP`, `Empty`, and `NminiTurbo`).

In [3]:
# Derive the feature table from the raw intensities. `controls` lists
# the substrings that mark control-sample columns (EGFP_*, Empty_* and
# NminiTurbo_* in this dataset).
features_df = feature_engineering(
    input_df,
    controls=["EGFP", "Empty", "NminiTurbo"],
)

# Preview the first rows of the engineered features.
features_df.head()

Unnamed: 0,Bait,Prey,log_fold_change,snr,mean_diff,median_diff,replicate_fold_change_sd,bait_cv,bait_control_sd_ratio,zero_or_neg_fc,nonzero_reps,reps_above_ctrl_med,single_rep_flag,composite_score,global_cv
0,ACTB,CCT8,2.18849,3.416967,3607493.0,4587256.9,0.186509,0.040917,0.139811,1,3,3,0,16.742191,1.217581
1,ACTB,FLNA,1.721757,2.55311,3226178.0,4348742.0,0.365365,0.110771,0.282812,1,3,3,0,15.403182,1.03863
2,ACTB,AHNAK,1.39588,2.040781,2567820.0,3867983.0,0.066947,0.025441,0.051919,1,3,3,0,13.012335,0.789551
3,ACTB,ACTA1,3.491085,46.438511,3079957.0,2908418.0,2.740437,0.243724,11.318196,1,3,3,0,12.909304,1.610158
4,ACTB,ACTC1,11.813446,1272.496414,100352.0,71529.6,3439.685503,0.95569,1216.112472,1,2,2,0,11.557553,3.511042


## Train PU-learning Model and Estimate FDR

In [4]:
# Fit the PU-learning model on the engineered features and score each row.
# NOTE(review): initial_positives / initial_negatives presumably set how many
# seed positives and negatives are used for training (the log below reports
# 15 positives per bait in the chosen cluster) — confirm against the puppi docs.
final_df = train_and_score(
    features_df,
    initial_positives=15,
    initial_negatives=200,
)

# Preview the scored table; it adds predicted_probability, FDR and
# global_cv_flag columns to the feature columns.
final_df.head()


Cluster assignments:
Bait: ACTB, Cluster: 1
Bait: CTNNA1, Cluster: 1
Bait: KRT8, Cluster: 1
Bait: LMNA, Cluster: 1
Bait: MAPRE3, Cluster: 2

Strong cluster chosen: 1 (size=4, mean=3.7446)

Assigned positives:
Bait: ACTB, Positives: 15
Bait: CTNNA1, Positives: 15
Bait: KRT8, Positives: 15
Bait: LMNA, Positives: 15
Bait: MAPRE3, Positives: 0


Unnamed: 0,Bait,Prey,log_fold_change,snr,mean_diff,median_diff,replicate_fold_change_sd,bait_cv,bait_control_sd_ratio,zero_or_neg_fc,nonzero_reps,reps_above_ctrl_med,single_rep_flag,composite_score,global_cv,predicted_probability,FDR,global_cv_flag
0,ACTB,CCT8,2.18849,3.416967,3607493.0,4587256.9,0.186509,0.040917,0.139811,1,3,3,0,16.742191,1.217581,0.9979,0.0,
1,ACTB,FLNA,1.721757,2.55311,3226178.0,4348742.0,0.365365,0.110771,0.282812,1,3,3,0,15.403182,1.03863,0.9989,0.0,
2,ACTB,AHNAK,1.39588,2.040781,2567820.0,3867983.0,0.066947,0.025441,0.051919,1,3,3,0,13.012335,0.789551,0.9736,0.013777,likely background
3,ACTB,ACTA1,3.491085,46.438511,3079957.0,2908418.0,2.740437,0.243724,11.318196,1,3,3,0,12.909304,1.610158,0.9998,0.0,
4,ACTB,ACTC1,11.813446,1272.496414,100352.0,71529.6,3439.685503,0.95569,1216.112472,1,2,2,0,11.557553,3.511042,0.9992,0.0,


## Save Output

In [5]:
# Write the scored table to disk as CSV, leaving out the DataFrame index.
final_df.to_csv(
    "puppi_output.csv",
    index=False,
)