In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_excel("/home/anushka.agrawal/hla_peptide_selection/braun_et_al_2025_supp1.xlsx")

In [3]:
# rename columns to concatenate cell line / replicate info
df.rename(columns={
    'VMM1': 'VMM1/Nil 1',
    'Unnamed: 6': 'VMM1/Nil 2',
    'Unnamed: 7': 'VMM1/Nil 3',
    'Unnamed: 8': 'VMM1/IFN 1',
    'Unnamed: 9': 'VMM1/IFN 2',
    'Unnamed: 10': 'VMM1/IFN 3',
    
    1106: '1106/Nil 1',
    'Unnamed: 12': '1106/Nil 2',
    'Unnamed: 13': '1106/Nil 3',
    'Unnamed: 14': '1106/IFN 1',
    'Unnamed: 15': '1106/IFN 2',
    'Unnamed: 16': '1106/IFN 3',
    
    'A431_sC6': 'A431_sC6/Rep 1',
    'Unnamed: 18': 'A431_sC6/Rep 2',
    'Unnamed: 19': 'A431_sC6/Rep 3',
}, inplace=True)

df.dropna(subset=['Peptide'], inplace=True)

In [4]:
pd.set_option('display.max_columns', None)

# drop empty column
df.drop(['Unnamed: 23'], axis=1, inplace=True)

# drop peptides w post-translational modifications
df.drop(df[df['Modification'] != 'Not found'].index, inplace=True)

# drop peptides that aren't high confidence from C0602 source based clustering 
df.drop(df[df['C0602 Source Based\non Gibbs Clustering'] != 'high confidence'].index, inplace=True)

In [5]:
# add column indicating how many samples a peptide was detected in
df['detection_count'] = 0

# subset columns w mass spec data
sample_cols = ['VMM1/Nil 1', 'VMM1/Nil 2', 'VMM1/Nil 3', 'VMM1/IFN 1', 'VMM1/IFN 2', 'VMM1/IFN 3', '1106/Nil 1', '1106/Nil 2', '1106/Nil 3', '1106/IFN 1', '1106/IFN 2', '1106/IFN 3', 'A431_sC6/Rep 1', 'A431_sC6/Rep 2', 'A431_sC6/Rep 3']

def set_detection_count(row, cols):
    """
    for a row, count how many of the mass spec cols are nonzero
    """
    count = 0
    for col in cols:
        if pd.notna(row[col]) and row[col] != '-' and row[col] != 0:
            count += 1
    return count

# set detection count to number of samples peptide was detected in
df['detection_count'] = df.apply(set_detection_count, axis=1, args=[sample_cols])
df.sort_values(by='detection_count', ascending=False, inplace=True)

In [6]:
# read in gtex
gtex_df = pd.read_table("GTEx_Analysis_v10_RNASeQCv2.4.2_gene_median_tpm.gct", sep='\t', header=2)

In [7]:
# df w just gtex skin_sun_exposed_lower_leg
gtex_leg_df = gtex_df[['Name', 'Description', 'Skin_Sun_Exposed_Lower_leg']]

In [8]:
# set uniprot id column
df['uniprot_id'] = df['Accession'].apply(lambda x: x.split('|')[0])

# map uniprot ids in df to ensembl
ensembl_map = pd.read_table("/home/anushka.agrawal/hla_peptide_selection/ensembl_uniprot_mapping.txt", sep='\t')
ensembl_map.rename(columns={"UniProtKB Gene Name ID":'uniprot_id'}, inplace=True)

df = df.merge(ensembl_map, on='uniprot_id')

In [9]:
# merge on ensembl id
df.rename(columns={'Gene stable ID':'ensembl ID'}, inplace=True)
gtex_leg_df['ensembl ID'] = gtex_leg_df['Name'].apply(lambda x: x.split('.')[0])

df = df.merge(gtex_leg_df, on='ensembl ID')
df.drop(columns='Name',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gtex_leg_df['ensembl ID'] = gtex_leg_df['Name'].apply(lambda x: x.split('.')[0])


In [10]:
# find median intensity across all samples
df[sample_cols] = df[sample_cols].apply(pd.to_numeric, errors='coerce')  
df['median_intensity'] = df[sample_cols].median(axis=1, skipna=True)

In [11]:
df.sort_values(by=['detection_count','Skin_Sun_Exposed_Lower_leg'], ascending=[False, False], inplace=True)

In [13]:
sample_cols = ['VMM1/Nil 1', 'VMM1/Nil 2', 'VMM1/Nil 3', 'VMM1/IFN 1', 'VMM1/IFN 2', 'VMM1/IFN 3', '1106/Nil 1', '1106/Nil 2', '1106/Nil 3', '1106/IFN 1', '1106/IFN 2', '1106/IFN 3', 'A431_sC6/Rep 1', 'A431_sC6/Rep 2', 'A431_sC6/Rep 3']

# get detection counts and median intensity for melanocytes and keratinocytes
melanocyte_cols = ['VMM1/Nil 1', 'VMM1/Nil 2', 'VMM1/Nil 3', 'VMM1/IFN 1', 'VMM1/IFN 2', 'VMM1/IFN 3']
keratinocyte_cols = ['1106/Nil 1', '1106/Nil 2', '1106/Nil 3', '1106/IFN 1', '1106/IFN 2', '1106/IFN 3','A431_sC6/Rep 1', 'A431_sC6/Rep 2', 'A431_sC6/Rep 3']

df['melanocyte_median_intensity'] = df[melanocyte_cols].median(axis=1, skipna=True)
df['keratinocyte_median_intensity'] = df[keratinocyte_cols].median(axis=1, skipna=True)


df['melanocyte_detection_count'] = df.apply(set_detection_count, axis=1, args=[melanocyte_cols])
df['keratinocyte_detection_count'] = df.apply(set_detection_count, axis=1, args=[keratinocyte_cols])

In [14]:
# add rankings to df
melanocyte_peptides_ranked = df.sort_values(by=['melanocyte_detection_count','NetMHCpan.4 \nScore'], ascending=[False, True]).reset_index(drop=True).reset_index()
melanocyte_peptides_ranked.rename(columns={'index':'melanocyte_rank'}, inplace=True)
melanocyte_peptides_ranked['melanocyte_rank'] += 1

keratinocyte_peptides_ranked = df.sort_values(by=['keratinocyte_detection_count','NetMHCpan.4 \nScore'], ascending=[False, True]).reset_index(drop=True).reset_index()
keratinocyte_peptides_ranked.rename(columns={'index':'keratinocyte_rank'}, inplace=True)
keratinocyte_peptides_ranked['keratinocyte_rank'] += 1

gtex_skinleg_peptides_ranked = df.sort_values(by=['Skin_Sun_Exposed_Lower_leg', 'detection_count'], ascending=[False, False]).reset_index(drop=True).reset_index()
gtex_skinleg_peptides_ranked.rename(columns={'index':'gtex_rank'}, inplace=True)
gtex_skinleg_peptides_ranked.rename(columns={'Skin_Sun_Exposed_Lower_leg':'GTEx_Skin_Sun_Exposed_Lower_leg'}, inplace=True)

In [15]:
melanocyte_ranked = melanocyte_peptides_ranked[['Accession', 'melanocyte_rank', 'Peptide', 'ensembl ID']]
keratinocyte_ranked = keratinocyte_peptides_ranked[['Accession', 'keratinocyte_rank', 'Peptide', 'ensembl ID']]
gtex_ranked = gtex_skinleg_peptides_ranked[['Accession', 'gtex_rank', 'Peptide', 'ensembl ID']]

In [16]:
merged_df = df.merge(melanocyte_ranked, on=['Accession', 'Peptide', 'ensembl ID']).drop_duplicates('melanocyte_rank')
merged_df = merged_df.merge(keratinocyte_ranked, on=['Accession', 'Peptide', 'ensembl ID']).drop_duplicates('keratinocyte_rank')
merged_df = merged_df.merge(gtex_ranked, on=['Accession', 'Peptide', 'ensembl ID']).drop_duplicates('gtex_rank')

In [17]:
# also add rankings specifically for 1106 and A431 keratinocyte cell lines
cols_1106 = ['1106/Nil 1', '1106/Nil 2', '1106/Nil 3', '1106/IFN 1', '1106/IFN 2', '1106/IFN 3']
cols_A431 = ['A431_sC6/Rep 1', 'A431_sC6/Rep 2', 'A431_sC6/Rep 3']

df['1106_median_intensity'] = df[cols_1106].median(axis=1, skipna=True)
df['A431_median_intensity'] = df[cols_A431].median(axis=1, skipna=True)

df['1106_detection_count'] = df.apply(set_detection_count, axis=1, args=[cols_1106])
df['A431_detection_count'] = df.apply(set_detection_count, axis=1, args=[cols_A431])

ranked_1106 = df.sort_values(by=['1106_detection_count','NetMHCpan.4 \nScore'], ascending=[False, True]).reset_index(drop=True).reset_index()
ranked_1106.rename(columns={'index':'1106_rank'}, inplace=True)
ranked_1106['1106_rank'] += 1

ranked_A431 = df.sort_values(by=['A431_detection_count','NetMHCpan.4 \nScore'], ascending=[False, True]).reset_index(drop=True).reset_index()
ranked_A431.rename(columns={'index':'A431_rank'}, inplace=True)
ranked_A431['A431_rank'] += 1

ranked_1106 = ranked_1106[['Accession', '1106_rank', 'Peptide', 'ensembl ID']]
ranked_A431 = ranked_A431[['Accession', 'A431_rank', 'Peptide', 'ensembl ID']]

merged_df = df.merge(ranked_1106, on=['Accession', 'Peptide', 'ensembl ID']).drop_duplicates('1106_rank')
merged_df = df.merge(ranked_A431, on=['Accession', 'Peptide', 'ensembl ID']).drop_duplicates('A431_rank')

In [22]:
# drop duplicates
merged_df.drop_duplicates('Peptide', inplace=True)

# drop some metadata cols
merged_df.drop(columns=['Pos', 'Modification', 'C0602 Binding Prediction\n(SB = strong binder, WB = weak binder)', 'C0602 Source Based\non Gibbs Clustering'], inplace=True)
merged_df.shape

(10331, 39)

In [23]:
# read in netMHCpan 4.1 results for peptides in merged df
netmhc_4_1 = pd.read_excel("netMHCpan4_1_results.xlsx", header=1)

netmhc_4_1['NB'] = netmhc_4_1['EL_Rank'].apply(lambda x: 'SB' if x <= 0.5 else ('WB' if x <= 2 else None))
netmhc_4_1.rename(columns={'EL_Rank':'netMHCpan4_1', 'NB':'netMHCpan4_1_binding_prediction'}, inplace=True)
netmhc_4_1.drop(columns=['ID', 'core', 'icore', 'Ave', 'EL-score'], inplace=True)

Unnamed: 0,Pos,Peptide,netMHCpan4_1,netMHCpan4_1_binding_prediction
0,0,TRQDHAQQL,0.0066,SB
1,0,SSGPQRLV,4.374,
2,0,GGDSSSGPQRLV,30.3,
3,0,DSSSGPQRLV,14.4174,
4,0,SSSGPQRLV,0.3362,SB


In [24]:
# merge netMHCpan 4.1 with df
merged_df = merged_df.merge(netmhc_4_1, on='Peptide').drop_duplicates()

In [26]:
merged_df.shape

(10331, 42)

In [None]:
# write to csv
merged_df.to_csv("braun2025_peptide_rank.csv", index=False)