In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('..')

In [3]:
from talus_data_analysis.load import *
from talus_data_analysis.elib import Elib
from talus_data_analysis.reshape import uniprot_protein_name
from dotenv import load_dotenv
import tempfile
import sqlite3
import pandas as pd

In [4]:
load_dotenv()

True

In [108]:
k562_proteome = pd.read_csv("../data/K562_proteome_4fractions.txt", sep="\t")

In [109]:
k562_proteome = k562_proteome[["xPeptide", "Protein"]]
k562_proteome.columns = ["PeptideSeq", "ProteinAccession"]
k562_proteome["ProteinAccession"] = k562_proteome["ProteinAccession"].apply(lambda x: x.split(";"))
k562_proteome = k562_proteome.explode("ProteinAccession")
k562_proteome["Cell Line"] = "K562"
k562_proteome["Protein"] = k562_proteome["ProteinAccession"].apply(uniprot_protein_name)

In [110]:
MM1S_ELIB = "../data/MM1S/RESULTS-quant.elib"
MLLtx_ELIB = "../data/MLLtx/RESULTS-quant.elib"

In [111]:
sql = "SELECT PeptideSeq, ProteinAccession FROM peptidetoprotein"

In [112]:
mm1s_elib_conn = Elib(key=MM1S_ELIB)
mm1s_df = pd.DataFrame(mm1s_elib_conn.execute_sql(sql), columns=["PeptideSeq", "ProteinAccession"])
mm1s_df["Cell Line"] = "MM1S"
mm1s_df["Protein"] = mm1s_df["ProteinAccession"].apply(uniprot_protein_name)

In [113]:
mlltx_elib_conn = Elib(key=MLLtx_ELIB)
mlltx_df = pd.DataFrame(mlltx_elib_conn.execute_sql(sql), columns=["PeptideSeq", "ProteinAccession"])
mlltx_df["Cell Line"] = "MLLtx"
mlltx_df["Protein"] = mlltx_df["ProteinAccession"].apply(uniprot_protein_name)

In [114]:
df = k562_proteome.append(mm1s_df).append(mlltx_df)

In [115]:
df

Unnamed: 0,PeptideSeq,ProteinAccession,Cell Line,Protein
0,AAAAAAAAAAAAAAAGAGAGAK,tr|G3XAL9|G3XAL9_HUMAN,K562,G3XAL9
0,AAAAAAAAAAAAAAAGAGAGAK,sp|P55011|S12A2_HUMAN,K562,S12A2
1,AAAAAAAAAVSR,sp|Q96JP5|ZFP91_HUMAN,K562,ZFP91
1,AAAAAAAAAVSR,tr|A0A0A6YYC7|A0A0A6YYC7_HUMAN,K562,A0A0A6YYC7
2,AAAAAAAAGAFAGR,sp|Q8N697|S15A4_HUMAN,K562,S15A4
...,...,...,...,...
108433,QGPTFLYNDSIPGK,DECOY_sp|Q9Y2E5|MA2B2_HUMAN,MLLtx,MA2B2
108434,CVEEASLQR,DECOY_sp|O14523|C2C2L_HUMAN,MLLtx,C2C2L
108435,SVAQEFSVR,DECOY_sp|Q13601|KRR1_HUMAN,MLLtx,KRR1
108436,IFLGGEKR,DECOY_sp|Q86WI1|PKHL1_HUMAN,MLLtx,PKHL1


In [116]:
target_protein = ["MCL-1", "BIM1", "BIM2", "BIM3", "BID", "BCL-XL", "BFL-1/A1", "BCL-2", "BAX", "BOK", "BIK"]

In [118]:
target_protein_mappings = {
    "MCL-1": "MCL1", 
    "BIM1": "BIM1",
    "BIM2": "BIM2",
    "BIM3": "BIM3",
    "BID": "BID",
    "BCL-XL": "B2CL1",
    "BFL-1/A1": "B2LA1",
    "BCL-2": "BCL2", 
    "BAX": "BAX",
    "BOK": "BOK",
    "BIK": "BIK",
}

In [119]:
filtered_df = df[df["Protein"].str.lower().isin(set([target_protein_mappings[protein].lower() for protein in target_protein]))]

In [120]:
filtered_df = filtered_df.sort_values(by=["PeptideSeq", "Protein", "Cell Line"]).reset_index(drop=True)

In [121]:
filtered_df.columns = ["Peptide", "UniProt ID", "Cell Line", "Protein"]
filtered_df = filtered_df[["Peptide", "Protein", "UniProt ID", "Cell Line"]]

In [122]:
filtered_df

Unnamed: 0,Peptide,Protein,UniProt ID,Cell Line
0,DLATALEQLLQAYPR,BID,sp|P55957|BID_HUMAN,K562
1,DLATALEQLLQAYPR,BID,sp|P55957|BID_HUMAN,MLLtx
2,DVFHTTVNFINQNLR,BID,sp|P55957|BID_HUMAN,K562
3,EAGDEFELR,B2CL1,sp|Q07817|B2CL1_HUMAN,MLLtx
4,EAGDEFELR,B2CL1,sp|Q07817|B2CL1_HUMAN,MM1S
...,...,...,...,...
63,VARPPPIGAEVPDVTATPAR,MCL1,sp|Q07820|MCL1_HUMAN,MLLtx
64,VARPPPIGAEVPDVTATPAR,MCL1,sp|Q07820|MCL1_HUMAN,MM1S
65,VMIHVFSDGVTNWGR,MCL1,sp|Q07820|MCL1_HUMAN,MM1S
66,VVALFYFASK,BAX,sp|Q07812|BAX_HUMAN,MLLtx


In [123]:
filtered_df["Protein"].value_counts()

BAX      25
MCL1     15
B2CL1     9
BID       9
BCL2      8
BOK       2
Name: Protein, dtype: int64

In [124]:
filtered_df.to_csv("../data/mitochondria_proteins.csv")