In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import argparse
import re


In [4]:
# Dataset loading helper
def read_dataset(file_path):
    """
    Load a CSV dataset into a pandas DataFrame.

    :param file_path: Location of the CSV file on disk.
    :return: DataFrame parsed from the file with pandas' default CSV settings.
    :raises FileNotFoundError: If nothing exists at ``file_path``.
    """
    # Happy path first: the path is present, so hand it straight to pandas.
    if os.path.exists(file_path):
        return pd.read_csv(file_path)

    # Otherwise fail with a message that names the offending path.
    raise FileNotFoundError(f"File {file_path} does not exist.")

# Path of the cleaned variant-intensity CSV, relative to the notebook's working directory.
DATASET_PATH = 'data/mq_variants_intensity_cleaned.csv'

# `df` is the notebook-wide frame that the later cells read from.
df = read_dataset(DATASET_PATH)

  df = pd.read_csv(file_path)


In [11]:
# Replace every missing value in the frame with 0 for the cells that run after this one.
# NOTE(review): the fold-change cell further down divides treatment columns by DMSO
# columns and relies on `notna()` to skip missing intensities. Once this fill has run,
# those entries are 0 instead of NaN. The execution counts (In [11] here, In [6] there)
# suggest this cell was originally run *after* that analysis - confirm the intended order.
df = df.fillna(0)

# Preview the first rows of the filled frame.
df.head()

Unnamed: 0,ccms_row_id,Variant,Variant ID,Unmod variant,Total,Total- Unmodified sequence,Variants- Unmodified sequence,Proteins,Mass,Charge,...,_dyn_#Baricitinib 300nM.Tech replicate 1 of 1,_dyn_#Baricitinib 300nM.Tech replicate 1 of 1_unmod,_dyn_#Baricitinib 30nM.Tech replicate 1 of 1,_dyn_#Baricitinib 30nM.Tech replicate 1 of 1_unmod,_dyn_#Baricitinib 3nM.Tech replicate 1 of 1,_dyn_#Baricitinib 3nM.Tech replicate 1 of 1_unmod,_dyn_#Baricitinib DMSO.Tech replicate 1 of 1,_dyn_#Baricitinib DMSO.Tech replicate 1 of 1_unmod,_dyn_#Baricitinib PDPD.Tech replicate 1 of 1,_dyn_#Baricitinib PDPD.Tech replicate 1 of 1_unmod
0,1,.IITHPNFNGNTLDNDIM+15.995LIK.,37658,.IITHPNFNGNTLDNDIMLIK.,11683,20735,81,TRYP_PIG,2299.2,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,.VADPDHDHTGFLTEYVATR.,93378,.VADPDHDHTGFLTEYVATR.,11372,15019,62,sp|P28482-2|MK01_HUMAN;sp|P28482|MK01_HUMAN,2144.0,2,...,182810000.0,182810000.0,296340000.0,296340000.0,272890000.0,272890000.0,254860000.0,254860000.0,70792000.0,70792000.0
2,3,.LGEHNIDVLEGNEQFINAAK.,50733,.LGEHNIDVLEGNEQFINAAK.,8878,23098,134,TRYP_PIG,2211.1,2,...,152910000.0,152910000.0,313690000.0,313690000.0,187600000.0,187600000.0,313290000.0,313290000.0,204790000.0,204790000.0
3,4,.FRHENIIGINDIIR.,25741,.FRHENIIGINDIIR.,8720,12619,33,sp|P28482-2|MK01_HUMAN;sp|P28482|MK01_HUMAN,1709.9,2,...,115160000.0,115160000.0,223460000.0,223460000.0,182890000.0,182890000.0,236530000.0,236530000.0,97725000.0,97725000.0
4,5,.ESESTAGSFSLSVR.,21292,.ESESTAGSFSLSVR.,7995,8328,7,sp|P06239-2|LCK_HUMAN;sp|P06239-3|LCK_HUMAN;sp...,1456.7,2,...,19220000.0,19220000.0,11216000.0,11216000.0,12721000.0,12721000.0,12835000.0,12835000.0,8137600.0,8137600.0


In [6]:
# For every drug, find the peptide with the largest treatment / DMSO intensity ratio.
#
# Inputs : `df` - the variant-intensity frame loaded above. Columns named
#          "_dyn_#<drug> <concentration>.<replicate>" hold the per-condition values.
# Outputs: `drug_results` - drug -> concentration -> {'peptide', 'fold_change', 'row_idx'}
#          `results_df`   - one row per drug (its best concentration), sorted by fold change.

# Treatment-condition columns: the "_dyn_#" columns, without their "_unmod" twins.
drug_columns = [col for col in df.columns if col.startswith('_dyn_#') and not col.endswith('_unmod')]

# drug name -> concentration -> record of the top peptide at that concentration
drug_results = {}

for col in drug_columns:
    # Skip DMSO and PDPD columns (DMSO is used as the denominator below).
    if ' DMSO.' in col or ' PDPD.' in col:
        continue

    # Extract drug name and concentration, e.g. "_dyn_#Baricitinib 300nM.Tech ..."
    match = re.search(r'_dyn_#(.*?) (\d+n?M)\.', col)
    if not match:
        continue
    drug_name, concentration = match.groups()

    # The matching DMSO control column; drugs without one cannot be normalised.
    dmso_col = f"_dyn_#{drug_name} DMSO.Tech replicate 1 of 1"
    if dmso_col not in df.columns:
        continue

    # A ratio is only defined where the treatment value is present and the control
    # is strictly positive. Testing `> 0` (rather than only `notna()`) keeps this cell
    # correct whether or not missing values were zero-filled beforehand: a zero
    # denominator would otherwise yield inf (x/0) or NaN (0/0) and hijack `idxmax`.
    mask = df[col].notna() & (df[dmso_col] > 0)
    if not mask.any():
        continue

    fold_changes = df.loc[mask, col] / df.loc[mask, dmso_col]

    # Row label (in `df`) of the peptide with the highest fold change.
    max_idx = fold_changes.idxmax()

    drug_results.setdefault(drug_name, {})[concentration] = {
        'peptide': df.loc[max_idx, 'Variant'],
        'fold_change': fold_changes.loc[max_idx],
        'row_idx': max_idx,
    }

# Collapse to one record per drug: the concentration with the largest fold change.
results = []
for drug, concentrations in drug_results.items():
    best_concentration, best_hit = max(concentrations.items(), key=lambda item: item[1]['fold_change'])
    results.append({
        'Drug': drug,
        'Max_Effect_Concentration': best_concentration,
        'Peptide': best_hit['peptide'],
        'Fold_Change': best_hit['fold_change'],
        'Row_Index': best_hit['row_idx']
    })

# Passing `columns=` guarantees the expected schema even when nothing matched,
# so cells that read `results_df['Peptide']` etc. do not fail on an empty result.
results_df = pd.DataFrame(
    results,
    columns=['Drug', 'Max_Effect_Concentration', 'Peptide', 'Fold_Change', 'Row_Index'],
)

if results_df.empty:
    print("No results found")
else:
    # Strongest effects first; show the top 20.
    results_df = results_df.sort_values('Fold_Change', ascending=False)
    display(results_df.head(20))

Unnamed: 0,Drug,Max_Effect_Concentration,Peptide,Fold_Change,Row_Index
6,ARRY-380_inBT474,3000nM,.GLVTPPMK.,3116.242709,35937
5,ARRY-380,30nM,.LSSPATLNSR.,1010.434412,70
48,Barasertib_HQPA,1000nM,.LSSPATLNSR.,733.719704,70
40,BMS-690514_inBT474,30000nM,.TLNIFLTK.,584.908716,156
45,BYL-719,30000nM,.LSSPATLNSR.,529.291188,70
10,AT-9283,300nM,.LISQIVSSITASLR.,487.217258,635
32,Apatinib,1000nM,.VLPSIVNEVLK.,405.91268,223
26,Afatinib,3nM,.GQVFDVGPR.,386.496438,143
8,AT-13148,1000nM,.-17.027QHSNAAQTQTGEANR.,375.491816,5578
3,AMG-208_withCAKI,100nM,.LSSPATLNSR.,351.412273,70


In [9]:
# Summarise, per peptide, the drugs for which it is the top-responding peptide.
#
# Input  : `results_df` - one row per drug with columns 'Drug',
#          'Max_Effect_Concentration', 'Peptide' and 'Fold_Change'.
# Outputs: `peptide_effects`   - peptide -> strongest / weakest drug record and a count
#          `effect_summary_df` - the same information as a sorted table.
# Note: 'Affected_by_n_drugs' counts the rows of `results_df` that name the peptide,
# i.e. the number of drugs whose single best hit is this peptide.

has_results = not results_df.empty

# Unique peptides in order of first appearance (empty when there are no results).
peptides = results_df['Peptide'].unique() if has_results else []

# peptide -> {'max_effect': {...}, 'min_effect': {...}, 'count': int}
peptide_effects = {}

if has_results:
    # One pass over the frame instead of re-filtering it for every peptide.
    # sort=False keeps groups in first-appearance order, so the summary rows get
    # the same positions (and index labels) as iterating over `peptides` would.
    # Rows with a missing peptide are dropped by groupby's default dropna=True.
    for peptide, peptide_data in results_df.groupby('Peptide', sort=False):
        max_effect = peptide_data.loc[peptide_data['Fold_Change'].idxmax()]
        min_effect = peptide_data.loc[peptide_data['Fold_Change'].idxmin()]

        peptide_effects[peptide] = {
            'max_effect': {
                'drug': max_effect['Drug'],
                'concentration': max_effect['Max_Effect_Concentration'],
                'fold_change': max_effect['Fold_Change']
            },
            'min_effect': {
                'drug': min_effect['Drug'],
                'concentration': min_effect['Max_Effect_Concentration'],
                'fold_change': min_effect['Fold_Change']
            },
            'count': len(peptide_data)
        }

# Flatten the nested records into table rows.
effect_summary = [
    {
        'Peptide': peptide,
        'Affected_by_n_drugs': data['count'],
        'Max_effect_drug': data['max_effect']['drug'],
        'Max_effect_conc': data['max_effect']['concentration'],
        'Max_fold_change': data['max_effect']['fold_change'],
        'Min_effect_drug': data['min_effect']['drug'],
        'Min_effect_conc': data['min_effect']['concentration'],
        'Min_fold_change': data['min_effect']['fold_change']
    }
    for peptide, data in peptide_effects.items()
]

# Explicit columns keep the schema (and make sort_values safe) when the list is empty.
summary_columns = [
    'Peptide', 'Affected_by_n_drugs',
    'Max_effect_drug', 'Max_effect_conc', 'Max_fold_change',
    'Min_effect_drug', 'Min_effect_conc', 'Min_fold_change',
]

# Most widely shared peptides first, ties broken by the largest fold change.
effect_summary_df = pd.DataFrame(effect_summary, columns=summary_columns).sort_values(
    ['Affected_by_n_drugs', 'Max_fold_change'],
    ascending=[False, False],
)

# Display the results
display(effect_summary_df)

Unnamed: 0,Peptide,Affected_by_n_drugs,Max_effect_drug,Max_effect_conc,Max_fold_change,Min_effect_drug,Min_effect_conc,Min_fold_change
1,.LSSPATLNSR.,5,ARRY-380,30nM,1010.434412,AZD-8186,10nM,126.724213
3,.LISQIVSSITASLR.,2,AT-9283,300nM,487.217258,AEW-541,30nM,318.550597
4,.VLPSIVNEVLK.,2,Apatinib,1000nM,405.91268,BMS-911543,300nM,37.741754
15,.NKLNDLEDALQQAKEDLAR.,2,AEE-788_inBT474,30000nM,136.736614,Alvocidib,3nM,62.755512
24,.LGAQLADLHLDNKK.,2,AZD-7762,3000nM,85.002538,BMS-387032,3000nM,65.238538
0,.GLVTPPMK.,1,ARRY-380_inBT474,3000nM,3116.242709,ARRY-380_inBT474,3000nM,3116.242709
2,.TLNIFLTK.,1,BMS-690514_inBT474,30000nM,584.908716,BMS-690514_inBT474,30000nM,584.908716
5,.GQVFDVGPR.,1,Afatinib,3nM,386.496438,Afatinib,3nM,386.496438
6,.-17.027QHSNAAQTQTGEANR.,1,AT-13148,1000nM,375.491816,AT-13148,1000nM,375.491816
7,.HVGDLGNVTADK.,1,Apitolisib,3nM,299.063008,Apitolisib,3nM,299.063008
