In [9]:
import os
import pandas as pd
import re
import numpy as np

In [10]:
# Read the dataset
def read_dataset(file_path):
    """
    Load a CSV file into a pandas DataFrame.

    The path is checked with ``os.path.exists`` before parsing so that a
    missing file is reported with an explicit message that names the path.

    :param file_path: Path to the CSV dataset file.
    :return: DataFrame produced by ``pd.read_csv`` with its default settings.
    :raises FileNotFoundError: If nothing exists at ``file_path``.
    """
    # Happy path first: the file is present, so parse it and hand it back.
    if os.path.exists(file_path):
        return pd.read_csv(file_path)

    # Nothing at that location: fail loudly instead of returning an empty frame.
    raise FileNotFoundError(f"File {file_path} does not exist.")


# Load the source table; the relative path is resolved against the current working directory.
df = read_dataset(file_path="data/mq_variants_intensity_cleaned.csv")

In [11]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,Variant,Variant ID,Unmod variant,Total,Total- Unmodified sequence,Variants- Unmodified sequence,Proteins,Mass,Charge,Num Mods,...,_dyn_#Baricitinib 300nM.Tech replicate 1 of 1,_dyn_#Baricitinib 300nM.Tech replicate 1 of 1_unmod,_dyn_#Baricitinib 30nM.Tech replicate 1 of 1,_dyn_#Baricitinib 30nM.Tech replicate 1 of 1_unmod,_dyn_#Baricitinib 3nM.Tech replicate 1 of 1,_dyn_#Baricitinib 3nM.Tech replicate 1 of 1_unmod,_dyn_#Baricitinib DMSO.Tech replicate 1 of 1,_dyn_#Baricitinib DMSO.Tech replicate 1 of 1_unmod,_dyn_#Baricitinib PDPD.Tech replicate 1 of 1,_dyn_#Baricitinib PDPD.Tech replicate 1 of 1_unmod
0,.VADPDHDHTGFLTEYVATR.,93378,.VADPDHDHTGFLTEYVATR.,11372,15019,62,sp|P28482-2|MK01_HUMAN;sp|P28482|MK01_HUMAN,2144.0,2,0,...,182810000.0,182810000.0,296340000.0,296340000.0,272890000.0,272890000.0,254860000.0,254860000.0,70792000.0,70792000.0
1,.LGEHNIDVLEGNEQFINAAK.,50733,.LGEHNIDVLEGNEQFINAAK.,8878,23098,134,TRYP_PIG,2211.1,2,0,...,152910000.0,152910000.0,313690000.0,313690000.0,187600000.0,187600000.0,313290000.0,313290000.0,204790000.0,204790000.0
2,.FRHENIIGINDIIR.,25741,.FRHENIIGINDIIR.,8720,12619,33,sp|P28482-2|MK01_HUMAN;sp|P28482|MK01_HUMAN,1709.9,2,0,...,115160000.0,115160000.0,223460000.0,223460000.0,182890000.0,182890000.0,236530000.0,236530000.0,97725000.0,97725000.0
3,.ESESTAGSFSLSVR.,21292,.ESESTAGSFSLSVR.,7995,8328,7,sp|P06239-2|LCK_HUMAN;sp|P06239-3|LCK_HUMAN;sp...,1456.7,2,0,...,19220000.0,19220000.0,11216000.0,11216000.0,12721000.0,12721000.0,12835000.0,12835000.0,8137600.0,8137600.0
4,.NYLLSLPHK.,68115,.NYLLSLPHK.,7445,11842,41,sp|P28482|MK01_HUMAN,1084.6,2,0,...,290970000.0,290970000.0,477300000.0,477300000.0,363140000.0,363140000.0,43697000.0,43697000.0,182850000.0,182850000.0


In [12]:
# Keep only the rows whose 'Proteins' value has no ';'. In the preview of the
# loaded table, ';' separates several protein identifiers within one cell, so
# this presumably selects variants attributed to a single protein.
# NOTE(review): na=False treats a missing 'Proteins' value as "no semicolon",
# so such rows are kept by this filter — confirm that is intended.
single_protein_variants = df.loc[
    lambda frame: ~frame['Proteins'].str.contains(';', na=False)
]

# Report the (rows, columns) shape before and after filtering
print(f"Original dataset shape: {df.shape}")
print(f"Filtered dataset shape: {single_protein_variants.shape}")

# Preview the start of the filtered table
single_protein_variants.head()

Original dataset shape: (50659, 1031)
Filtered dataset shape: (9594, 1031)


Unnamed: 0,Variant,Variant ID,Unmod variant,Total,Total- Unmodified sequence,Variants- Unmodified sequence,Proteins,Mass,Charge,Num Mods,...,_dyn_#Baricitinib 300nM.Tech replicate 1 of 1,_dyn_#Baricitinib 300nM.Tech replicate 1 of 1_unmod,_dyn_#Baricitinib 30nM.Tech replicate 1 of 1,_dyn_#Baricitinib 30nM.Tech replicate 1 of 1_unmod,_dyn_#Baricitinib 3nM.Tech replicate 1 of 1,_dyn_#Baricitinib 3nM.Tech replicate 1 of 1_unmod,_dyn_#Baricitinib DMSO.Tech replicate 1 of 1,_dyn_#Baricitinib DMSO.Tech replicate 1 of 1_unmod,_dyn_#Baricitinib PDPD.Tech replicate 1 of 1,_dyn_#Baricitinib PDPD.Tech replicate 1 of 1_unmod
1,.LGEHNIDVLEGNEQFINAAK.,50733,.LGEHNIDVLEGNEQFINAAK.,8878,23098,134,TRYP_PIG,2211.1,2,0,...,152910000.0,152910000.0,313690000.0,313690000.0,187600000.0,187600000.0,313290000.0,313290000.0,204790000.0,204790000.0
4,.NYLLSLPHK.,68115,.NYLLSLPHK.,7445,11842,41,sp|P28482|MK01_HUMAN,1084.6,2,0,...,290970000.0,290970000.0,477300000.0,477300000.0,363140000.0,363140000.0,43697000.0,43697000.0,182850000.0,182850000.0
9,.KLEAAEDIAYQLSR.,44634,.KLEAAEDIAYQLSR.,4958,6905,36,sp|P35232|PHB_HUMAN,1606.8,2,0,...,32624000.0,32624000.0,55130000.0,55130000.0,52332000.0,52332000.0,52156000.0,52156000.0,55709000.0,55709000.0
11,.IITHPNFNGNTLDNDIMLIK.,37659,.IITHPNFNGNTLDNDIMLIK.,4830,20735,81,TRYP_PIG,2283.2,2,0,...,,,,,,,,,,
25,.L+28.003GEHNIDVLEGNEQFINAAK.,50735,.LGEHNIDVLEGNEQFINAAK.,3782,23098,134,TRYP_PIG,2239.1,2,1,...,,152910000.0,,313690000.0,,187600000.0,,313290000.0,,204790000.0


In [20]:
# Persist the filtered table; index=False keeps the DataFrame index out of the CSV.
single_protein_variants.to_csv(
    "data/mq_variants_intensity_single_protein.csv",
    index=False,
)