In [1]:
import os
import requests
import zipfile
import io

import numpy as np
import pandas as pd
import re
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt

In [2]:
data_path_tsv = "data/mq_variants_intensity.tsv"

# Download and unpack the result archive once; later runs reuse the cached
# file on disk. May take a while to download.
if not os.path.exists(data_path_tsv):
    os.makedirs("data", exist_ok=True)
    # stream=True defers fetching the body until `.content`, so
    # raise_for_status() can reject an error response before the archive is
    # pulled down. The timeout is (connect, read) in seconds: it bounds how
    # long we wait for the server to connect / send bytes, not the total
    # download time, so a stalled server no longer hangs the cell forever.
    # The `with` block releases the connection even if an error is raised.
    with requests.post(
        "https://proteomics2.ucsd.edu/ProteoSAFe/DownloadResult?view=mq_variants_intensity&task=a19fe3be4bd84d4a80e58d64e14ba1dd",
        stream=True,
        timeout=(30, 600),
    ) as r:
        r.raise_for_status()
        archive_bytes = r.content

    with zipfile.ZipFile(io.BytesIO(archive_bytes)) as z:
        for zipinfo in z.infolist():
            # Any .tsv member is written under the fixed name that
            # data_path_tsv points at; every other member keeps its own name.
            if zipinfo.filename.endswith(".tsv"):
                zipinfo.filename = "mq_variants_intensity.tsv"
            z.extract(zipinfo, "data")

    # Fail here, with a clear message, rather than in the cell that reads
    # the file if the archive turned out to contain no .tsv member.
    if not os.path.exists(data_path_tsv):
        raise FileNotFoundError(
            f"Downloaded archive did not contain a .tsv file; expected {data_path_tsv}"
        )

In [3]:
# Load the tab-separated variant intensity table into a DataFrame.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. NOTE(review): the saved output of this cell
# echoes this call the way a pandas warning does - presumably the mixed-types
# DtypeWarning, which this option addresses; confirm on a fresh run.
mq_variants_df = pd.read_csv(data_path_tsv, sep="\t", low_memory=False)

  mq_variants_df = pd.read_csv(data_path_tsv, sep="\t")


In [8]:
# Preview the first five rows of the loaded table (rich display of the cell's
# last expression).
mq_variants_df.head(n=5)

Unnamed: 0,rowid,ccms_row_id,Variant,Variant ID,Unmod variant,Total,Total- Unmodified sequence,Variants- Unmodified sequence,Proteins,Mass,...,_dyn_#Baricitinib 300nM.Tech replicate 1 of 1,_dyn_#Baricitinib 300nM.Tech replicate 1 of 1_unmod,_dyn_#Baricitinib 30nM.Tech replicate 1 of 1,_dyn_#Baricitinib 30nM.Tech replicate 1 of 1_unmod,_dyn_#Baricitinib 3nM.Tech replicate 1 of 1,_dyn_#Baricitinib 3nM.Tech replicate 1 of 1_unmod,_dyn_#Baricitinib DMSO.Tech replicate 1 of 1,_dyn_#Baricitinib DMSO.Tech replicate 1 of 1_unmod,_dyn_#Baricitinib PDPD.Tech replicate 1 of 1,_dyn_#Baricitinib PDPD.Tech replicate 1 of 1_unmod
0,1,1,.IITHPNFNGNTLDNDIM+15.995LIK.,37658,.IITHPNFNGNTLDNDIMLIK.,11683,20735,81,TRYP_PIG,2299.2,...,,,,,,,,,,
1,2,2,.VADPDHDHTGFLTEYVATR.,93378,.VADPDHDHTGFLTEYVATR.,11372,15019,62,sp|P28482-2|MK01_HUMAN;sp|P28482|MK01_HUMAN,2144.0,...,182810000.0,182810000.0,296340000.0,296340000.0,272890000.0,272890000.0,254860000.0,254860000.0,70792000.0,70792000.0
2,3,3,.LGEHNIDVLEGNEQFINAAK.,50733,.LGEHNIDVLEGNEQFINAAK.,8878,23098,134,TRYP_PIG,2211.1,...,152910000.0,152910000.0,313690000.0,313690000.0,187600000.0,187600000.0,313290000.0,313290000.0,204790000.0,204790000.0
3,4,4,.FRHENIIGINDIIR.,25741,.FRHENIIGINDIIR.,8720,12619,33,sp|P28482-2|MK01_HUMAN;sp|P28482|MK01_HUMAN,1709.9,...,115160000.0,115160000.0,223460000.0,223460000.0,182890000.0,182890000.0,236530000.0,236530000.0,97725000.0,97725000.0
4,5,5,.ESESTAGSFSLSVR.,21292,.ESESTAGSFSLSVR.,7995,8328,7,sp|P06239-2|LCK_HUMAN;sp|P06239-3|LCK_HUMAN;sp...,1456.7,...,19220000.0,19220000.0,11216000.0,11216000.0,12721000.0,12721000.0,12835000.0,12835000.0,8137600.0,8137600.0


In [6]:
# Keep only the rows whose 'Proteins' value has no ';' in it. In the preview
# of this table, ';' joins several protein identifiers in one cell, so the
# remaining rows list a single identifier. na=False treats a missing
# 'Proteins' value as "no semicolon", so such rows are kept as well.
single_protein_variants = mq_variants_df.loc[
    lambda frame: ~frame['Proteins'].str.contains(';', na=False)
]

# Report (rows, columns) before and after the filter
print(f"Original dataset shape: {mq_variants_df.shape}")
print(f"Filtered dataset shape: {single_protein_variants.shape}")

# Show the first rows of the filtered table as the cell's output
single_protein_variants.head()

Original dataset shape: (83706, 1033)
Filtered dataset shape: (16044, 1033)


Unnamed: 0,rowid,ccms_row_id,Variant,Variant ID,Unmod variant,Total,Total- Unmodified sequence,Variants- Unmodified sequence,Proteins,Mass,...,_dyn_#Baricitinib 300nM.Tech replicate 1 of 1,_dyn_#Baricitinib 300nM.Tech replicate 1 of 1_unmod,_dyn_#Baricitinib 30nM.Tech replicate 1 of 1,_dyn_#Baricitinib 30nM.Tech replicate 1 of 1_unmod,_dyn_#Baricitinib 3nM.Tech replicate 1 of 1,_dyn_#Baricitinib 3nM.Tech replicate 1 of 1_unmod,_dyn_#Baricitinib DMSO.Tech replicate 1 of 1,_dyn_#Baricitinib DMSO.Tech replicate 1 of 1_unmod,_dyn_#Baricitinib PDPD.Tech replicate 1 of 1,_dyn_#Baricitinib PDPD.Tech replicate 1 of 1_unmod
0,1,1,.IITHPNFNGNTLDNDIM+15.995LIK.,37658,.IITHPNFNGNTLDNDIMLIK.,11683,20735,81,TRYP_PIG,2299.2,...,,,,,,,,,,
2,3,3,.LGEHNIDVLEGNEQFINAAK.,50733,.LGEHNIDVLEGNEQFINAAK.,8878,23098,134,TRYP_PIG,2211.1,...,152910000.0,152910000.0,313690000.0,313690000.0,187600000.0,187600000.0,313290000.0,313290000.0,204790000.0,204790000.0
5,6,6,.NYLLSLPHK.,68115,.NYLLSLPHK.,7445,11842,41,sp|P28482|MK01_HUMAN,1084.6,...,290970000.0,290970000.0,477300000.0,477300000.0,363140000.0,363140000.0,43697000.0,43697000.0,182850000.0,182850000.0
11,12,12,.PM+15.995FIVNTNVPR.,69185,.PMFIVNTNVPR.,5232,10053,12,sp|P14174|MIF_HUMAN,1303.7,...,,,,,,,,,,
12,13,13,.KLEAAEDIAYQLSR.,44634,.KLEAAEDIAYQLSR.,4958,6905,36,sp|P35232|PHB_HUMAN,1606.8,...,32624000.0,32624000.0,55130000.0,55130000.0,52332000.0,52332000.0,52156000.0,52156000.0,55709000.0,55709000.0
