In [3]:
import numpy as np
import pandas as pd

In [2]:
# Location of the input CSV, relative to the notebook's working directory.
data_path_csv = "data/mq_variants_intensity.csv"
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids columns ending up with mixed inferred dtypes.
mq_variants_df = pd.read_csv(data_path_csv, low_memory=False)

In [3]:
# Preview the first five rows to sanity-check the load.
mq_variants_df.head()

Unnamed: 0,rowid,ccms_row_id,Variant,Variant ID,Unmod variant,Total,Total- Unmodified sequence,Variants- Unmodified sequence,Proteins,Mass,...,_dyn_#Baricitinib 300nM.Tech replicate 1 of 1,_dyn_#Baricitinib 300nM.Tech replicate 1 of 1_unmod,_dyn_#Baricitinib 30nM.Tech replicate 1 of 1,_dyn_#Baricitinib 30nM.Tech replicate 1 of 1_unmod,_dyn_#Baricitinib 3nM.Tech replicate 1 of 1,_dyn_#Baricitinib 3nM.Tech replicate 1 of 1_unmod,_dyn_#Baricitinib DMSO.Tech replicate 1 of 1,_dyn_#Baricitinib DMSO.Tech replicate 1 of 1_unmod,_dyn_#Baricitinib PDPD.Tech replicate 1 of 1,_dyn_#Baricitinib PDPD.Tech replicate 1 of 1_unmod
0,1,1,.IITHPNFNGNTLDNDIM+15.995LIK.,37658,.IITHPNFNGNTLDNDIMLIK.,11683,20735,81,TRYP_PIG,2299.2,...,,,,,,,,,,
1,2,2,.VADPDHDHTGFLTEYVATR.,93378,.VADPDHDHTGFLTEYVATR.,11372,15019,62,sp|P28482-2|MK01_HUMAN;sp|P28482|MK01_HUMAN,2144.0,...,182810000.0,182810000.0,296340000.0,296340000.0,272890000.0,272890000.0,254860000.0,254860000.0,70792000.0,70792000.0
2,3,3,.LGEHNIDVLEGNEQFINAAK.,50733,.LGEHNIDVLEGNEQFINAAK.,8878,23098,134,TRYP_PIG,2211.1,...,152910000.0,152910000.0,313690000.0,313690000.0,187600000.0,187600000.0,313290000.0,313290000.0,204790000.0,204790000.0
3,4,4,.FRHENIIGINDIIR.,25741,.FRHENIIGINDIIR.,8720,12619,33,sp|P28482-2|MK01_HUMAN;sp|P28482|MK01_HUMAN,1709.9,...,115160000.0,115160000.0,223460000.0,223460000.0,182890000.0,182890000.0,236530000.0,236530000.0,97725000.0,97725000.0
4,5,5,.ESESTAGSFSLSVR.,21292,.ESESTAGSFSLSVR.,7995,8328,7,sp|P06239-2|LCK_HUMAN;sp|P06239-3|LCK_HUMAN;sp...,1456.7,...,19220000.0,19220000.0,11216000.0,11216000.0,12721000.0,12721000.0,12835000.0,12835000.0,8137600.0,8137600.0


In [4]:
# Define a function to convert columns with comma-separated numbers to numeric
def convert_comma_separated_to_numeric(df: pd.DataFrame) -> pd.DataFrame:
    """
    Convert "_dyn_" columns holding comma-formatted numbers to a numeric dtype.

    Only object-dtype columns whose label is a string starting with "_dyn_"
    are considered. Commas are stripped from string cells; cells that are
    already numeric or missing are passed through unchanged, so a column that
    mixes strings and real numbers keeps its numeric entries.

    A column is replaced only when every cell parses and at least one value
    is non-null. Columns that are entirely null keep their original dtype and
    are counted neither as converted nor as failed. Columns that fail to
    parse are left untouched, reported on stdout and counted as failed.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to convert. It is modified in place.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object that was passed in.
    """

    def _strip_commas(value):
        # Only strings can carry thousands separators. Leaving other cells
        # alone matters: the ``.str`` accessor would turn them into NaN.
        return value.replace(",", "") if isinstance(value, str) else value

    # Candidate columns: anything pandas stored with object dtype
    string_cols = df.select_dtypes(include=["object"]).columns

    # Track conversion statistics
    converted = 0
    failed = 0

    for col in string_cols:
        # Skip columns that are not "_dyn_" columns (non-string labels
        # cannot carry the prefix either)
        if not (isinstance(col, str) and col.startswith("_dyn_")):
            continue

        try:
            cleaned = df[col].map(_strip_commas)
            # errors="raise": one unparseable cell rejects the whole column
            # instead of being silently coerced to NaN
            numeric = pd.to_numeric(cleaned, errors="raise")

            # Replace the column only if it holds at least one real value
            if not numeric.isna().all():
                df[col] = numeric
                converted += 1

        except Exception as e:
            # Deliberately best-effort: report the column and carry on
            print(f"Could not convert column: {col}, Error: {str(e)}")
            failed += 1

    print(
        f"Converted {converted} columns to numeric. Failed to convert {failed} columns."
    )
    return df


# Run the conversion; the helper edits the frame in place and also returns it
mq_variants_df = convert_comma_separated_to_numeric(mq_variants_df)

# Count columns per dtype as a quick check that the conversion took effect
print(
    "\nDataset summary after conversion:",
    mq_variants_df.dtypes.value_counts(),
    sep="\n",
)

Converted 1000 columns to numeric. Failed to convert 0 columns.

Dataset summary after conversion:
float64    1008
object       14
int64        10
bool          1
dtype: int64


In [5]:
# Treatment columns are recognised by their "_dyn_" header prefix
treatment_cols = list(
    filter(lambda col: col.startswith("_dyn_"), mq_variants_df.columns)
)

# A row is kept when at least one of its treatment cells is populated
rows_to_keep = mq_variants_df[treatment_cols].notna().any(axis=1)

# Apply the mask and renumber the surviving rows from zero
filtered_mq_variants_df = mq_variants_df.loc[rows_to_keep].reset_index(drop=True)

# Report how much the filter removed
removed_rows = mq_variants_df.shape[0] - filtered_mq_variants_df.shape[0]
print(f"Original number of rows: {mq_variants_df.shape[0]}")
print(f"Rows after filtering: {filtered_mq_variants_df.shape[0]}")
print(
    f"Removed {removed_rows} rows ({removed_rows / mq_variants_df.shape[0] * 100:.2f}% of total)"
)

# From here on the filtered frame goes by the original name
mq_variants_df = filtered_mq_variants_df

Original number of rows: 83706
Rows after filtering: 50659
Removed 33047 rows (39.48% of total)


In [6]:
# Remove the 'rowid' and 'ccms_row_id' columns in place. errors='ignore' skips
# labels that are not present instead of raising, so re-running this cell
# after the columns are gone is harmless.
mq_variants_df.drop(columns=['rowid', 'ccms_row_id'], inplace=True, errors='ignore')
# Preview the frame after the drop.
mq_variants_df.head()

Unnamed: 0,Variant,Variant ID,Unmod variant,Total,Total- Unmodified sequence,Variants- Unmodified sequence,Proteins,Mass,Charge,Num Mods,...,_dyn_#Baricitinib 300nM.Tech replicate 1 of 1,_dyn_#Baricitinib 300nM.Tech replicate 1 of 1_unmod,_dyn_#Baricitinib 30nM.Tech replicate 1 of 1,_dyn_#Baricitinib 30nM.Tech replicate 1 of 1_unmod,_dyn_#Baricitinib 3nM.Tech replicate 1 of 1,_dyn_#Baricitinib 3nM.Tech replicate 1 of 1_unmod,_dyn_#Baricitinib DMSO.Tech replicate 1 of 1,_dyn_#Baricitinib DMSO.Tech replicate 1 of 1_unmod,_dyn_#Baricitinib PDPD.Tech replicate 1 of 1,_dyn_#Baricitinib PDPD.Tech replicate 1 of 1_unmod
0,.VADPDHDHTGFLTEYVATR.,93378,.VADPDHDHTGFLTEYVATR.,11372,15019,62,sp|P28482-2|MK01_HUMAN;sp|P28482|MK01_HUMAN,2144.0,2,0,...,182810000.0,182810000.0,296340000.0,296340000.0,272890000.0,272890000.0,254860000.0,254860000.0,70792000.0,70792000.0
1,.LGEHNIDVLEGNEQFINAAK.,50733,.LGEHNIDVLEGNEQFINAAK.,8878,23098,134,TRYP_PIG,2211.1,2,0,...,152910000.0,152910000.0,313690000.0,313690000.0,187600000.0,187600000.0,313290000.0,313290000.0,204790000.0,204790000.0
2,.FRHENIIGINDIIR.,25741,.FRHENIIGINDIIR.,8720,12619,33,sp|P28482-2|MK01_HUMAN;sp|P28482|MK01_HUMAN,1709.9,2,0,...,115160000.0,115160000.0,223460000.0,223460000.0,182890000.0,182890000.0,236530000.0,236530000.0,97725000.0,97725000.0
3,.ESESTAGSFSLSVR.,21292,.ESESTAGSFSLSVR.,7995,8328,7,sp|P06239-2|LCK_HUMAN;sp|P06239-3|LCK_HUMAN;sp...,1456.7,2,0,...,19220000.0,19220000.0,11216000.0,11216000.0,12721000.0,12721000.0,12835000.0,12835000.0,8137600.0,8137600.0
4,.NYLLSLPHK.,68115,.NYLLSLPHK.,7445,11842,41,sp|P28482|MK01_HUMAN,1084.6,2,0,...,290970000.0,290970000.0,477300000.0,477300000.0,363140000.0,363140000.0,43697000.0,43697000.0,182850000.0,182850000.0


In [7]:
# Pick out the "_dyn_" columns that carry the "_unmod" suffix or mention
# "PDPD", then remove them from the frame in place.
unwanted_dyn_cols = [
    col
    for col in mq_variants_df.columns
    if col.startswith("_dyn_") and ("PDPD" in col or col.endswith("_unmod"))
]
mq_variants_df.drop(columns=unwanted_dyn_cols, inplace=True)

In [8]:
# Preview the frame now that the "_unmod" and "PDPD" columns are gone.
mq_variants_df.head()

Unnamed: 0,Variant,Variant ID,Unmod variant,Total,Total- Unmodified sequence,Variants- Unmodified sequence,Proteins,Mass,Charge,Num Mods,...,_dyn_#Barasertib_HQPA DMSO.Tech replicate 1 of 1,_dyn_#Baricitinib 1000nM.Tech replicate 1 of 1,_dyn_#Baricitinib 100nM.Tech replicate 1 of 1,_dyn_#Baricitinib 10nM.Tech replicate 1 of 1,_dyn_#Baricitinib 30000nM.Tech replicate 1 of 1,_dyn_#Baricitinib 3000nM.Tech replicate 1 of 1,_dyn_#Baricitinib 300nM.Tech replicate 1 of 1,_dyn_#Baricitinib 30nM.Tech replicate 1 of 1,_dyn_#Baricitinib 3nM.Tech replicate 1 of 1,_dyn_#Baricitinib DMSO.Tech replicate 1 of 1
0,.VADPDHDHTGFLTEYVATR.,93378,.VADPDHDHTGFLTEYVATR.,11372,15019,62,sp|P28482-2|MK01_HUMAN;sp|P28482|MK01_HUMAN,2144.0,2,0,...,402680000.0,69163000.0,430600000.0,341340000.0,96443000.0,84155000.0,182810000.0,296340000.0,272890000.0,254860000.0
1,.LGEHNIDVLEGNEQFINAAK.,50733,.LGEHNIDVLEGNEQFINAAK.,8878,23098,134,TRYP_PIG,2211.1,2,0,...,593480000.0,108270000.0,330300000.0,337470000.0,123930000.0,91085000.0,152910000.0,313690000.0,187600000.0,313290000.0
2,.FRHENIIGINDIIR.,25741,.FRHENIIGINDIIR.,8720,12619,33,sp|P28482-2|MK01_HUMAN;sp|P28482|MK01_HUMAN,1709.9,2,0,...,280180000.0,73203000.0,209690000.0,337270000.0,90528000.0,112760000.0,115160000.0,223460000.0,182890000.0,236530000.0
3,.ESESTAGSFSLSVR.,21292,.ESESTAGSFSLSVR.,7995,8328,7,sp|P06239-2|LCK_HUMAN;sp|P06239-3|LCK_HUMAN;sp...,1456.7,2,0,...,5269500.0,11120000.0,15061000.0,18264000.0,13380000.0,14280000.0,19220000.0,11216000.0,12721000.0,12835000.0
4,.NYLLSLPHK.,68115,.NYLLSLPHK.,7445,11842,41,sp|P28482|MK01_HUMAN,1084.6,2,0,...,247540000.0,231550000.0,481230000.0,591120000.0,250100000.0,300980000.0,290970000.0,477300000.0,363140000.0,43697000.0


In [9]:
# Save the cleaned and converted numeric dataframe to a new file.
# The path is defined once so the file written and the message printed
# cannot drift apart.
numeric_csv_path = 'data/mq_variants_intensity_numeric.csv'
mq_variants_df.to_csv(numeric_csv_path, index=False)
print(f"DataFrame saved to {numeric_csv_path}")
print(f"Shape of saved dataframe: {mq_variants_df.shape}")

DataFrame saved to data/mq_variants_intensity_numeric.csv
Shape of saved dataframe: (50659, 481)


In [None]:
# Work on an independent copy for the subtraction-based normalization, so that
# mq_variants_df keeps its unmodified intensities for the log2-ratio
# normalization performed in a later cell.
mq_variants_df2 = mq_variants_df.copy()


In [7]:
def get_drug_names(df: pd.DataFrame):
    """
    Return the distinct drug names found in the column headers, sorted.

    A header contributes a name when it contains "_dyn_#<drug> <digits>nM",
    where <drug> is a run of non-space characters. Headers without such a
    match (for example ones with no numeric nM dose) are ignored.
    """
    header_regex = r"_dyn_#(?P<drug>[^ ]+) \d+nM"
    headers = df.columns.to_series()
    matched = headers.str.extract(header_regex)
    unique_names = matched["drug"].dropna().unique()
    return sorted(unique_names)


drugs = get_drug_names(mq_variants_df2)
# Get all dynamic columns
dyn_cols = [col for col in mq_variants_df2.columns if col.startswith("_dyn_")]

# Create dictionaries to organize columns by drug
control_columns: dict[str, str] = {}  # drug -> its DMSO control column
treatment_columns: dict[str, list] = {}  # drug -> its non-DMSO columns

# Organize columns by drug
for drug in drugs:
    # The control header is derived from the drug name; record it only if
    # such a column actually exists
    control_col = f"_dyn_#{drug} DMSO.Tech replicate 1 of 1"
    if control_col in dyn_cols:
        control_columns[drug] = control_col

    # The trailing space in the prefix stops a drug whose name is a prefix of
    # another drug's name from picking up the other drug's columns
    drug_treatments = [
        col
        for col in dyn_cols
        if col.startswith(f"_dyn_#{drug} ") and "DMSO" not in col
    ]

    if drug_treatments:
        treatment_columns[drug] = drug_treatments

# Perform the subtraction (instead of log2 ratio)
normalized_drugs = []  # drugs that were actually normalized (had a control)
for drug in treatment_columns:
    if drug not in control_columns:
        print(f"Warning: No control column found for drug {drug}")
        continue

    control_col = control_columns[drug]
    # Snapshot the control before it is overwritten below
    control_values = mq_variants_df2[control_col].copy()

    # Subtract control values from every treatment column of this drug
    for treatment_col in treatment_columns[drug]:
        mq_variants_df2[treatment_col] = mq_variants_df2[treatment_col] - control_values

    # Set control column to zero (reference point)
    # NOTE(review): rows whose control was NaN also become 0 here, while their
    # treatment columns become NaN - confirm this is intended.
    mq_variants_df2[control_col] = 0
    normalized_drugs.append(drug)

# Save the file with intensity deduction normalization
deduction_csv_path = 'data/mq_variants_intensity_deduction.csv'
mq_variants_df2.to_csv(deduction_csv_path, index=False)
print(f"DataFrame saved to {deduction_csv_path}")
print(f"Shape of saved dataframe: {mq_variants_df2.shape}")
# Report only the drugs that were normalized, not the ones skipped above
print(f"Completed subtraction-based normalization for {len(normalized_drugs)} drugs")

DataFrame saved to data/mq_variants_intensity_deduction.csv
Shape of saved dataframe: (50659, 481)
Completed subtraction-based normalization for 50 drugs


In [9]:
def get_drug_names(df: pd.DataFrame):
    """
    Return the distinct drug names found in the column headers, sorted.

    A header contributes a name when it contains "_dyn_#<drug> <digits>nM",
    where <drug> is a run of non-space characters. Headers without such a
    match (for example ones with no numeric nM dose) are ignored.
    """
    header_regex = r"_dyn_#(?P<drug>[^ ]+) \d+nM"
    headers = df.columns.to_series()
    matched = headers.str.extract(header_regex)
    unique_names = matched["drug"].dropna().unique()
    return sorted(unique_names)


drugs = get_drug_names(mq_variants_df)

# Get all dynamic columns
dyn_cols = [col for col in mq_variants_df.columns if col.startswith("_dyn_")]

# Create dictionaries to organize columns by drug
control_columns: dict[str, str] = {}  # drug -> its DMSO control column
treatment_columns: dict[str, list[str]] = {}  # drug -> its non-DMSO columns

# Organize columns by drug
for drug in drugs:
    # The control header is derived from the drug name; record it only if
    # such a column actually exists
    control_col = f"_dyn_#{drug} DMSO.Tech replicate 1 of 1"
    if control_col in dyn_cols:
        control_columns[drug] = control_col

    # The trailing space in the prefix stops a drug whose name is a prefix of
    # another drug's name from picking up the other drug's columns
    drug_treatments = [
        col
        for col in dyn_cols
        if col.startswith(f"_dyn_#{drug} ") and "DMSO" not in col
    ]

    if drug_treatments:
        treatment_columns[drug] = drug_treatments

# Perform the normalization and log2 transformation.
# NOTE: this rewrites mq_variants_df in place, so running the cell a second
# time would take log2 ratios of values that are already log2 ratios.
for drug in treatment_columns:
    # Without a DMSO column there is nothing to divide by; skip the drug with
    # a warning rather than failing with a KeyError. Its columns keep their
    # original values.
    if drug not in control_columns:
        print(f"Warning: No control column found for drug {drug}")
        continue

    control_col = control_columns[drug]
    # Snapshot the control before it is overwritten below
    control_values = mq_variants_df[control_col].copy()

    # First handle all treatment columns: log2(treatment / control).
    # NOTE(review): a control of 0 gives inf or NaN ratios and a treatment of
    # 0 gives -inf - confirm whether zeros should be treated as missing.
    for treatment_col in treatment_columns[drug]:
        mq_variants_df[treatment_col] = np.log2(
            mq_variants_df[treatment_col].div(control_values)
        )

    # Then handle the control column last: log2(control / control), which is
    # 0 where the control is a non-zero number and NaN where it is missing
    mq_variants_df[control_col] = np.log2(control_values.div(control_values))

In [10]:
# Preview the frame after the log2-ratio normalization of the "_dyn_" columns.
mq_variants_df.head()

Unnamed: 0,Variant,Variant ID,Unmod variant,Total,Total- Unmodified sequence,Variants- Unmodified sequence,Proteins,Mass,Charge,Num Mods,...,_dyn_#Barasertib_HQPA DMSO.Tech replicate 1 of 1,_dyn_#Baricitinib 1000nM.Tech replicate 1 of 1,_dyn_#Baricitinib 100nM.Tech replicate 1 of 1,_dyn_#Baricitinib 10nM.Tech replicate 1 of 1,_dyn_#Baricitinib 30000nM.Tech replicate 1 of 1,_dyn_#Baricitinib 3000nM.Tech replicate 1 of 1,_dyn_#Baricitinib 300nM.Tech replicate 1 of 1,_dyn_#Baricitinib 30nM.Tech replicate 1 of 1,_dyn_#Baricitinib 3nM.Tech replicate 1 of 1,_dyn_#Baricitinib DMSO.Tech replicate 1 of 1
0,.VADPDHDHTGFLTEYVATR.,93378,.VADPDHDHTGFLTEYVATR.,11372,15019,62,sp|P28482-2|MK01_HUMAN;sp|P28482|MK01_HUMAN,2144.0,2,0,...,0.0,-1.881633,0.756643,0.421505,-1.401957,-1.598584,-0.47936,0.217548,0.098615,0.0
1,.LGEHNIDVLEGNEQFINAAK.,50733,.LGEHNIDVLEGNEQFINAAK.,8878,23098,134,TRYP_PIG,2211.1,2,0,...,0.0,-1.532865,0.076278,0.107261,-1.337973,-1.782213,-1.034816,0.001841,-0.739839,0.0
2,.FRHENIIGINDIIR.,25741,.FRHENIIGINDIIR.,8720,12619,33,sp|P28482-2|MK01_HUMAN;sp|P28482|MK01_HUMAN,1709.9,2,0,...,0.0,-1.692048,-0.173765,0.511881,-1.385587,-1.068768,-1.038383,-0.082007,-0.371047,0.0
3,.ESESTAGSFSLSVR.,21292,.ESESTAGSFSLSVR.,7995,8328,7,sp|P06239-2|LCK_HUMAN;sp|P06239-3|LCK_HUMAN;sp...,1456.7,2,0,...,0.0,-0.206927,0.230734,0.508919,0.059995,0.153913,0.582525,-0.194525,-0.012871,0.0
4,.NYLLSLPHK.,68115,.NYLLSLPHK.,7445,11842,41,sp|P28482|MK01_HUMAN,1084.6,2,0,...,0.0,2.405718,3.46112,3.757845,2.516899,2.784061,2.735264,3.44929,3.05492,0.0


In [11]:
# Write the frame to disk without the index column. At this point the "_dyn_"
# columns hold the log2(treatment / DMSO control) ratios computed in the
# preceding normalization cell, not raw intensities.
mq_variants_df.to_csv('data/mq_variants_intensity_cleaned.csv', index=False)