In [14]:
import pandas as pd
import numpy as np
import plotly.express as px
import os
import plotly.express as px
import plotly.graph_objects as go

In [5]:
# Dataset configuration: one "_ms_all" peptide file per predictor key.
data_fname = "./{}_peptides_ms_all.csv"
all_datasets = ["mhcflurry", "netmhc"]
# Display names are looked up by position in all_datasets_fname. The pair is
# repeated because a second file per predictor is appended to that list later.
all_datasets_names = ["MHCFlurry2.0", "NetMHCpan4.1"] * 2
all_datasets_fname = list(map(data_fname.format, all_datasets))

In [6]:
# Append the "_ba" file of every predictor after the "_ms_all" entries.
# Paths already present are skipped, so re-running this cell does not grow the
# list; it must stay index-aligned with all_datasets_names for the loader.
data_fname = "./{}_peptides_ba.csv"
ba_fnames = [data_fname.format(x) for x in all_datasets]
all_datasets_fname.extend([fname for fname in ba_fnames if fname not in all_datasets_fname])

In [7]:
# Read every peptide file (tab-separated, no header row) and tag its rows with
# "<display name>_BA" or "<display name>_MS", chosen from the file name.
# Frames are collected first and concatenated once, instead of re-copying the
# accumulated data on every iteration.
loaded_frames = []
for i, ds in enumerate(all_datasets_fname):
    dataset_df = pd.read_csv(ds, header=None, sep="\t")
    if "_ba" in ds:
        # A scalar is broadcast to every row; no per-row apply is needed.
        dataset_df["dataset_name"] = all_datasets_names[i] + "_BA"
    if "_ms_" in ds:
        dataset_df["dataset_name"] = all_datasets_names[i] + "_MS"
    loaded_frames.append(dataset_df)

# pd.concat rejects an empty list, so keep None when there were no files.
# Row indices of the individual files are kept as-is (not renumbered).
all_data_pd = pd.concat(loaded_frames) if loaded_frames else None

In [8]:
# Name the three positional columns read from the files; the fourth label
# belongs to the dataset tag that was added while loading.
all_data_pd = all_data_pd.set_axis(["peptide", "allele", "locus", "dataset_name"], axis=1)

In [9]:
def plot(all_alleles_all_datasets, locus):
    """Build a Plotly figure of per-allele peptide counts for one locus.

    For each of the four dataset names ("MHCFlurry2.0" / "NetMHCpan4.1", each
    with an "_MS" and a "_BA" variant) three traces are added, all carrying the
    dataset name: a dotted line through every allele, dataset-specific line
    markers on the non-zero counts, and open circles on those same points.
    Alleles are ordered along the x axis by descending ``peptide_counts`` of
    the "MHCFlurry2.0_MS" rows; alleles without such a row are placed last.

    Parameters
    ----------
    all_alleles_all_datasets : pandas.DataFrame
        Must provide the columns ``allele``, ``allele_short``, ``dataset_name``
        and ``peptide_counts``. The frame is not modified.
    locus : str
        Compared with the character at index 4 of ``allele``.
        NOTE(review): presumably the locus letter of names shaped like
        "HLA-A..." -- confirm against the input files.

    Returns
    -------
    plotly.graph_objects.Figure
        Figure sized 800x400 with a fixed y range of [-1000, 25000].
    """
    symbols_dict = {
        "MHCFlurry2.0_BA": "line-ne",
        "NetMHCpan4.1_BA": "line-nw",
        "MHCFlurry2.0_MS": "line-ns",
        "NetMHCpan4.1_MS": "line-ew",
    }
    # Both BA datasets share one colour and both MS datasets share another.
    colors_dict = {
        "MHCFlurry2.0_BA": px.colors.qualitative.D3[3],
        "NetMHCpan4.1_BA": px.colors.qualitative.D3[3],
        "MHCFlurry2.0_MS": px.colors.qualitative.D3[0],
        "NetMHCpan4.1_MS": px.colors.qualitative.D3[0],
    }

    # Take an explicit copy: adding columns to a boolean-filtered slice is what
    # makes pandas emit SettingWithCopyWarning.
    locus_mask = all_alleles_all_datasets["allele"].str[4] == locus
    tmp_data = all_alleles_all_datasets[locus_mask].copy()

    # Rank alleles by descending count in the reference dataset. setdefault
    # keeps the first (highest-ranked) position should an allele repeat.
    reference_rows = tmp_data[tmp_data["dataset_name"] == "MHCFlurry2.0_MS"]
    ranked_alleles = reference_rows.sort_values(by="peptide_counts", ascending=False)["allele"]
    allele_rank = {}
    for rank, allele in enumerate(ranked_alleles):
        allele_rank.setdefault(allele, rank)

    tmp_data["symbols"] = tmp_data["dataset_name"].map(symbols_dict)
    tmp_data["color"] = tmp_data["dataset_name"].map(colors_dict)
    # Alleles missing from the reference map to NaN, which sort_values puts last.
    tmp_data["allele_order"] = tmp_data["allele"].map(allele_rank)

    fig = go.Figure()
    for dsn in ["MHCFlurry2.0_MS", "NetMHCpan4.1_MS", "MHCFlurry2.0_BA", "NetMHCpan4.1_BA"]:
        tmp_data_ds = tmp_data[tmp_data.dataset_name == dsn].sort_values(by="allele_order")
        tmp_data_ds_no_zeros = tmp_data_ds[tmp_data_ds["peptide_counts"] != 0]

        # Faint dotted line through every allele, zero counts included.
        fig.add_trace(go.Scatter(
            mode="lines",
            x=tmp_data_ds.allele_short,
            y=tmp_data_ds.peptide_counts,
            marker_symbol=tmp_data_ds.symbols,
            opacity=0.5,
            marker_line_width=1,
            marker_size=0,
            name=dsn,
            marker_color=tmp_data_ds.color,
            line_color=colors_dict[dsn],
            line=dict(dash='dot'),
        ))

        # Dataset-specific line marker on each non-zero count.
        fig.add_trace(go.Scatter(
            mode="markers",
            x=tmp_data_ds_no_zeros.allele_short,
            y=tmp_data_ds_no_zeros.peptide_counts,
            marker_symbol=tmp_data_ds_no_zeros.symbols,
            opacity=1,
            marker_line_width=2,
            marker_size=9,
            name=dsn,
            marker_color=tmp_data_ds_no_zeros.color,
            line_color=colors_dict[dsn],
            line=dict(dash='dot'),
        ))

        # Open circle drawn on the same non-zero points.
        fig.add_trace(go.Scatter(
            mode="markers",
            x=tmp_data_ds_no_zeros.allele_short,
            y=tmp_data_ds_no_zeros.peptide_counts,
            marker_symbol="circle-open",
            opacity=1,
            marker_line_width=2,
            marker_size=9,
            name=dsn,
            marker_color=tmp_data_ds_no_zeros.color,
            line_color=colors_dict[dsn],
            line=dict(dash='dot'),
        ))

    fig.update_layout(
        width=800,
        height=400,
        yaxis_range=[-1000, 25000],
        xaxis=dict(tickangle=45, tick0=0, dtick=1.2),
    )
    return fig

In [11]:
# Rows per (dataset, allele). count() tallies the non-null entries of every
# remaining column, so both "peptide" and "locus" hold counts afterwards.
all_data_counts = all_data_pd.groupby(by=["dataset_name", "allele"]).count().reset_index()

# Percentage of each dataset's peptides that belongs to each allele.
# transform("sum") yields the per-dataset total aligned with the original rows,
# so the result does not have to be re-attached by position after a
# reset_index(). The last column is dropped, as it is not needed here; .copy()
# makes the frame independent before a column is overwritten.
all_data_freqs = all_data_counts[all_data_counts.columns[:-1]].copy()
dataset_totals = all_data_freqs.groupby("dataset_name")["peptide"].transform("sum")
all_data_freqs["peptide"] = all_data_freqs["peptide"] * 100 / dataset_totals

# Same ordering for both tables: dataset name descending, allele ascending.
all_data_freqs = all_data_freqs.sort_values(["dataset_name", "allele"], ascending=[False, True])
all_data_counts = all_data_counts.sort_values(["dataset_name", "allele"], ascending=[False, True])

# Allele name without its first four characters.
all_data_counts["allele_short"] = all_data_counts["allele"].str[4:]
all_data_freqs["allele_short"] = all_data_freqs["allele"].str[4:]

In [12]:
# Keep every allele that has more than 1000 peptides in at least one dataset.
# The per-allele maximum is broadcast back onto the rows in a single grouped
# pass (instead of re-filtering the whole frame for every row), which also
# keeps the alleles in their order of first appearance in all_data_counts.
max_count_per_allele = all_data_counts.groupby("allele")["peptide"].transform("max")
interesting_alleles = all_data_counts.loc[max_count_per_allele > 1000, "allele"].unique()

# One row for every (dataset, interesting allele) pair, including pairs that
# do not occur in the data.
all_alleles_all_datasets = pd.DataFrame(
    {"dataset_name": all_data_counts.dataset_name.unique()}
).merge(pd.DataFrame({"allele": interesting_alleles}), how="cross")
def get_counts(allele, dataset_name, all_data_counts):
    """Return the ``peptide`` value of the first row matching allele and dataset.

    Parameters
    ----------
    allele, dataset_name
        Values compared for equality with the ``allele`` and ``dataset_name``
        columns of ``all_data_counts``.
    all_data_counts : pandas.DataFrame
        Table with the columns ``allele``, ``dataset_name`` and ``peptide``.

    Returns
    -------
    The ``peptide`` entry of the first matching row, or 0 when no row matches.
    """
    is_match = (all_data_counts.dataset_name == dataset_name) & (all_data_counts.allele == allele)
    matching_counts = all_data_counts.loc[is_match, "peptide"]
    if len(matching_counts) == 0:
        return 0
    return matching_counts.iloc[0]
                                                                          
# Fill in the count of every (dataset, allele) pair; get_counts returns 0 for
# pairs that have no row in all_data_counts.
all_alleles_all_datasets["peptide_counts"] = all_alleles_all_datasets.apply(
    lambda row: get_counts(row["allele"], row["dataset_name"], all_data_counts),
    axis=1,
)

# Allele name without its first four characters.
all_alleles_all_datasets["allele_short"] = all_alleles_all_datasets["allele"].map(
    lambda allele_name: allele_name[4:]
)

In [21]:
import warnings
warnings.filterwarnings('ignore')

## Dataset contents locus A

In [30]:
# Locus "A": override the 800 px width set inside plot(), label the y axis and
# show the legend. update_layout returns the same figure, so it can be chained.
fig = plot(all_alleles_all_datasets, "A").update_layout(
    width=1200,
    yaxis_title="#number of peptides",
    showlegend=True,
)
fig

## Dataset contents locus B

In [31]:
# Locus "B": override the 800 px width set inside plot(), label the y axis and
# show the legend. update_layout returns the same figure, so it can be chained.
fig = plot(all_alleles_all_datasets, "B").update_layout(
    width=1300,
    yaxis_title="#number of peptides",
    showlegend=True,
)
fig

## Dataset contents locus C

In [29]:
# Locus "C": keep the 800 px width, label the y axis and show the legend.
# update_layout returns the same figure, so it can be chained.
fig = plot(all_alleles_all_datasets, "C").update_layout(
    width=800,
    yaxis_title="#number of peptides",
    showlegend=True,
)
fig