In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import os

# 1 - Visualize per-allele dataset contents

In [2]:
# datasets
data_fname = "./datasets/{}_peptides_ms_all.csv"
all_datasets = ["mhcflurry", "netmhc"]
all_datasets_names = ["MHCFlurry2.0", "NetMHCpan4.1", "MHCFlurry2.0", "NetMHCpan4.1"]
all_datasets_fname = [data_fname.format(x) for x in all_datasets]

In [3]:
data_fname = "./datasets/{}_peptides_ba.csv"
all_datasets_fname.extend([data_fname.format(x) for x in all_datasets])

In [4]:
all_data_pd = None
for i, ds in enumerate(all_datasets_fname):
    dataset_df =  pd.read_csv(ds, header=None, sep="\t")
    if "_ba" in ds:
        dataset_df["dataset_name"] = dataset_df[0].apply(lambda x: all_datasets_names[i]+"_BA")
    if "_ms_" in ds:
        dataset_df["dataset_name"] = dataset_df[0].apply(lambda x: all_datasets_names[i]+"_MS")
        
    if all_data_pd is None:
        all_data_pd = dataset_df
    else:
        all_data_pd = pd.concat([all_data_pd, dataset_df])

In [5]:
all_data_pd.columns = ["peptide", "allele", "locus", "dataset_name"]
all_data_pd

Unnamed: 0,peptide,allele,locus,dataset_name
0,RYMPQNPCII,HLA-A*24:02,A,MHCFlurry2.0_MS
1,LYQGLLPSL,HLA-A*24:02,A,MHCFlurry2.0_MS
2,YYNPHLLLNTL,HLA-A*24:02,A,MHCFlurry2.0_MS
3,KTNHVFFLL,HLA-A*24:02,A,MHCFlurry2.0_MS
4,TYRNVMEQF,HLA-A*24:02,A,MHCFlurry2.0_MS
...,...,...,...,...
170465,YDGRYWTMWK,HLA-C*08:03,C,NetMHCpan4.1_BA
170466,YEPSQSAQL,HLA-B*55:01,B,NetMHCpan4.1_BA
170467,YKKEQTLK,HLA-B*55:01,B,NetMHCpan4.1_BA
170468,YLRMRRTAAL,HLA-B*15:10,B,NetMHCpan4.1_BA


In [11]:
all_data_freqs = all_data_pd.groupby(by=["dataset_name", "allele"]).count().groupby(["dataset_name"]).apply(lambda x: x*100/x.sum())

In [14]:
all_data_freqs

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,peptide,locus
dataset_name,dataset_name,allele,Unnamed: 3_level_1,Unnamed: 4_level_1
MHCFlurry2.0_BA,MHCFlurry2.0_BA,HLA-A*01:01,3.063406,3.063406
MHCFlurry2.0_BA,MHCFlurry2.0_BA,HLA-A*02:01,9.496377,9.496377
MHCFlurry2.0_BA,MHCFlurry2.0_BA,HLA-A*02:02,2.528986,2.528986
MHCFlurry2.0_BA,MHCFlurry2.0_BA,HLA-A*02:03,3.824275,3.824275
MHCFlurry2.0_BA,MHCFlurry2.0_BA,HLA-A*02:04,0.005435,0.005435
...,...,...,...,...
NetMHCpan4.1_MS,NetMHCpan4.1_MS,HLA-C*15:02,0.575444,0.575444
NetMHCpan4.1_MS,NetMHCpan4.1_MS,HLA-C*15:05,0.032119,0.032119
NetMHCpan4.1_MS,NetMHCpan4.1_MS,HLA-C*16:01,1.274862,1.274862
NetMHCpan4.1_MS,NetMHCpan4.1_MS,HLA-C*16:04,0.017410,0.017410


In [None]:
all_data_counts = all_data_pd.groupby(by=["dataset_name", "allele"]).count().reset_index()

In [389]:
all_data_freqs = all_data_freqs.sort_values(["dataset_name", "allele"], ascending=[False, True])

In [417]:
all_data_counts = all_data_counts.sort_values(["dataset_name", "allele"], ascending=[False, True])
all_data_counts["allele_short"] = all_data_counts["allele"].apply(lambda x: x[4:])

In [418]:
all_data_freqs["allele_short"] = all_data_freqs["allele"].apply(lambda x: x[4:])

In [397]:
#all_data["proc"] = all_data["freq"].apply(lambda x: x*100)
fig_A = px.scatter(all_data_freqs[all_data_freqs.apply(lambda x: x.allele[4]=="A" and x.peptide > 0.1, axis=1)]\
             , x="allele_short", y="peptide", color='dataset_name', width=1500, height=300)
fig_A.update_xaxes(title="<b>HLA Allele</b>", titlefont = dict(size = 18), tickfont = dict(size = 15), tickangle = 45)
fig_A.update_yaxes(title="<b>percent within the dataset</b>", titlefont = dict(size = 18), tickfont = dict(size = 15))
#fig.add_hline(y=1, line_width=2, line_dash="dash", line_color="black")
fig_A.show()

In [402]:
import plotly.graph_objects as go

In [430]:
import plotly.express as px

print(px.colors.qualitative.Vivid)

['rgb(229, 134, 6)', 'rgb(93, 105, 177)', 'rgb(82, 188, 163)', 'rgb(153, 201, 69)', 'rgb(204, 97, 176)', 'rgb(36, 121, 108)', 'rgb(218, 165, 27)', 'rgb(47, 138, 196)', 'rgb(118, 78, 159)', 'rgb(237, 100, 90)', 'rgb(165, 170, 153)']


Unnamed: 0,dataset_name,allele,peptide,locus,allele_short
396,NetMHCpan4.1_MS,HLA-A*01:01,10576,10576,A*01:01
398,NetMHCpan4.1_MS,HLA-A*02:01,19992,19992,A*02:01
399,NetMHCpan4.1_MS,HLA-A*02:03,2068,2068,A*02:03
400,NetMHCpan4.1_MS,HLA-A*02:04,3155,3155,A*02:04
401,NetMHCpan4.1_MS,HLA-A*02:05,1807,1807,A*02:05
...,...,...,...,...,...
125,MHCFlurry2.0_BA,HLA-C*08:02,135,135,C*08:02
127,MHCFlurry2.0_BA,HLA-C*12:02,27,27,C*12:02
128,MHCFlurry2.0_BA,HLA-C*12:03,172,172,C*12:03
129,MHCFlurry2.0_BA,HLA-C*14:02,243,243,C*14:02


In [574]:
interesting_alleles = all_data_counts[all_data_counts.allele\
                                      .apply(lambda x: any(all_data_counts[all_data_counts.allele==x].peptide > 1000))]\
.allele.unique()
all_alleles_all_datasets = pd.DataFrame({"dataset_name": all_data_counts.dataset_name.unique()}).merge( \
pd.DataFrame({"allele": interesting_alleles}) ,\
         how="cross")
def get_counts(allele, dataset_name, all_data_counts):
    tmp = all_data_counts[(all_data_counts.allele == allele)\
                    & (all_data_counts.dataset_name == dataset_name)].peptide
    if tmp.shape[0]>0:
        return tmp.iloc[0]
    else:
        return 0
                                                                          
all_alleles_all_datasets["peptide_counts"] = all_alleles_all_datasets.apply(lambda x: \
                                                                            get_counts(x.allele, x.dataset_name, \
                                                                                      all_data_counts), axis=1)

In [575]:

all_alleles_all_datasets["allele_short"] = all_alleles_all_datasets["allele"].apply(lambda x: x[4:])

In [565]:
tmp_data

Unnamed: 0,dataset_name,allele,peptide_counts,allele_short,symbols,color,allele_order
120,NetMHCpan4.1_MS,HLA-C*01:02,1700,C*01:02,line-ew,"rgb(55,126,184)",15
121,NetMHCpan4.1_MS,HLA-C*02:02,2733,C*02:02,line-ew,"rgb(55,126,184)",12
122,NetMHCpan4.1_MS,HLA-C*02:10,34,C*02:10,line-ew,"rgb(55,126,184)",24
123,NetMHCpan4.1_MS,HLA-C*03:03,2961,C*03:03,line-ew,"rgb(55,126,184)",4
124,NetMHCpan4.1_MS,HLA-C*03:04,2893,C*03:04,line-ew,"rgb(55,126,184)",9
...,...,...,...,...,...,...,...
763,MHCFlurry2.0_BA,HLA-C*03:01,0,C*03:01,line-ne,"rgb(228,26,28)",22
764,MHCFlurry2.0_BA,HLA-C*03:02,0,C*03:02,line-ne,"rgb(228,26,28)",18
765,MHCFlurry2.0_BA,HLA-C*03:13,0,C*03:13,line-ne,"rgb(228,26,28)",25
766,MHCFlurry2.0_BA,HLA-C*04:03,0,C*04:03,line-ne,"rgb(228,26,28)",19


In [670]:
def plot(all_alleles_all_datasets, locus):
    tmp_data = all_alleles_all_datasets[all_alleles_all_datasets.apply(lambda x: x.allele[4]==locus, axis=1)]
    symbols_dict = {"MHCFlurry2.0_BA": "line-ne", \
                    "NetMHCpan4.1_BA": "line-nw", \
                    "MHCFlurry2.0_MS": "line-ns", \
                    "NetMHCpan4.1_MS": "line-ew" }
    colors_dict = {"MHCFlurry2.0_BA": px.colors.qualitative.D3[3], \
                    "NetMHCpan4.1_BA": px.colors.qualitative.D3[3], \
                    "MHCFlurry2.0_MS": px.colors.qualitative.D3[0], \
                    "NetMHCpan4.1_MS": px.colors.qualitative.D3[0]}
    allele_order_df = tmp_data[tmp_data["dataset_name"]=="MHCFlurry2.0_MS"][["allele", "peptide_counts"]]\
    .sort_values(by="peptide_counts", ascending=False).reset_index(drop=True).reset_index()[["allele", "index"]]
    tmp_data["symbols"] = tmp_data.dataset_name.apply(lambda x: symbols_dict[x])
    tmp_data["color"] = tmp_data.dataset_name.apply(lambda x: colors_dict[x])
    tmp_data["allele_order"] = tmp_data.allele.apply(lambda x: allele_order_df["index"][allele_order_df.allele==x].iloc[0])
    #tmp_data = tmp_data.sort_values(by=["peptide_counts", "allele_short"], ascending=[False, True])
    fig = go.Figure()
    for dsn in ["MHCFlurry2.0_MS", "NetMHCpan4.1_MS", "MHCFlurry2.0_BA", "NetMHCpan4.1_BA"]:
        tmp_data_ds = tmp_data[tmp_data.dataset_name == dsn].sort_values(by="allele_order")
        tmp_data_ds_no_zeros = tmp_data_ds[tmp_data_ds["peptide_counts"]!=0]
        #tmp_data_ds = tmp_data_ds.sort_values(by="allele_short")
        '''
        fig.add_trace(go.Bar(x=tmp_data_ds.allele_short, y=tmp_data_ds.peptide_counts,#/tmp_data_ds.peptide_counts.sum(), \
                             name = dsn, marker_color = colors_dict[dsn]\
                            # marker_symbol=tmp_data_ds.symbols, opacity=0.5, \
                             #marker_line_width=1, marker_size=0, marker_color = tmp_data_ds.color,\
                             #line_color = colors_dict[dsn], line=dict(dash='dot')
                          ))
        '''
        fig.add_trace(go.Scatter(mode="lines", x=tmp_data_ds.allele_short, y=tmp_data_ds.peptide_counts, \
                                 marker_symbol=tmp_data_ds.symbols, opacity=0.5, \
                                 marker_line_width=1, marker_size=0, name = dsn, marker_color = tmp_data_ds.color,\
                                 line_color = colors_dict[dsn], line=dict(dash='dot')
                              ))

        fig.add_trace(go.Scatter(mode="markers", x=tmp_data_ds_no_zeros.allele_short, y=tmp_data_ds_no_zeros.peptide_counts, \
                             marker_symbol=tmp_data_ds_no_zeros.symbols, opacity=1, \
                             marker_line_width=2, marker_size=9, name = dsn, marker_color = tmp_data_ds_no_zeros.color,\
                             line_color = colors_dict[dsn], line=dict(dash='dot')
                          ))
        fig.add_trace(go.Scatter(mode="markers", x=tmp_data_ds_no_zeros.allele_short, y=tmp_data_ds_no_zeros.peptide_counts, \
                         marker_symbol="circle-open", opacity=1, \
                         marker_line_width=2, marker_size=9, name = dsn, marker_color = tmp_data_ds_no_zeros.color,\
                         line_color = colors_dict[dsn], line=dict(dash='dot')
                      ))
        
                              # marker_line_width=2, marker_size=15,
                              # hovertemplate="name: %{y}%{x}<br>number: %{marker.symbol}<extra></extra>"))
    fig.update_layout(width=800, height=400, yaxis_range=[-1000, 25000], \
                      #barmode='stack',\
                      xaxis=dict(tickangle=45,tick0=0, dtick=1.2))
    return fig

In [671]:
fig = plot(all_alleles_all_datasets, "C")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [672]:
fig

In [673]:
fig.write_image("scatterplot_C_alleles.svg", width=800, height=400)

In [656]:
plot(all_alleles_all_datasets, "B")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [584]:
fig.write_image("scatterplot_A_alleles.svg", width=1100, height=400)

In [585]:
tmp_data = all_alleles_all_datasets[all_alleles_all_datasets.apply(lambda x: x.allele[4]=="B", axis=1)]
symbols_dict = {"MHCFlurry2.0_BA": "line-ne", \
                "NetMHCpan4.1_BA": "line-nw", \
                "MHCFlurry2.0_MS": "line-ns", \
                "NetMHCpan4.1_MS": "line-ew" }
colors_dict = {"MHCFlurry2.0_BA": px.colors.qualitative.Set1[0], \
                "NetMHCpan4.1_BA": px.colors.qualitative.Set1[0], \
                "MHCFlurry2.0_MS": px.colors.qualitative.Set1[1], \
                "NetMHCpan4.1_MS": px.colors.qualitative.Set1[1]}
allele_order_df = tmp_data[tmp_data["dataset_name"]=="MHCFlurry2.0_MS"][["allele", "peptide_counts"]]\
.sort_values(by="peptide_counts", ascending=False).reset_index(drop=True).reset_index()[["allele", "index"]]
tmp_data["symbols"] = tmp_data.dataset_name.apply(lambda x: symbols_dict[x])
tmp_data["color"] = tmp_data.dataset_name.apply(lambda x: colors_dict[x])
tmp_data["allele_order"] = tmp_data.allele.apply(lambda x: allele_order_df["index"][allele_order_df.allele==x].iloc[0])
#tmp_data = tmp_data.sort_values(by=["peptide_counts", "allele_short"], ascending=[False, True])
fig = go.Figure()
for dsn in tmp_data.dataset_name.unique():
    tmp_data_ds = tmp_data[tmp_data.dataset_name == dsn].sort_values(by="allele_order")
    tmp_data_ds_no_zeros = tmp_data_ds[tmp_data_ds["peptide_counts"]!=0]
    #tmp_data_ds = tmp_data_ds.sort_values(by="allele_short")
    
    fig.add_trace(go.Scatter(mode="lines", x=tmp_data_ds.allele_short, y=tmp_data_ds.peptide_counts, \
                             marker_symbol=tmp_data_ds.symbols, opacity=0.5, \
                             marker_line_width=1, marker_size=0, name = dsn, marker_color = tmp_data_ds.color,\
                             line_color = colors_dict[dsn], line=dict(dash='dot')
                          ))
    
    fig.add_trace(go.Scatter(mode="markers", x=tmp_data_ds_no_zeros.allele_short, y=tmp_data_ds_no_zeros.peptide_counts, \
                         marker_symbol=tmp_data_ds_no_zeros.symbols, opacity=1, \
                         marker_line_width=2, marker_size=9, name = dsn, marker_color = tmp_data_ds_no_zeros.color,\
                         line_color = colors_dict[dsn], line=dict(dash='dot')
                      ))
    fig.add_trace(go.Scatter(mode="markers", x=tmp_data_ds_no_zeros.allele_short, y=tmp_data_ds_no_zeros.peptide_counts, \
                     marker_symbol="circle-open", opacity=1, \
                     marker_line_width=2, marker_size=9, name = dsn, marker_color = tmp_data_ds_no_zeros.color,\
                     line_color = colors_dict[dsn], line=dict(dash='dot')
                  ))
                          # marker_line_width=2, marker_size=15,
                          # hovertemplate="name: %{y}%{x}<br>number: %{marker.symbol}<extra></extra>"))
fig.update_layout(width=1300, height=400, yaxis_range=[-1000, 25000], xaxis=dict(tickangle=45,\
                                                                                tick0=0, dtick=1.2))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [586]:
fig.write_image("scatterplot_B_alleles.svg", width=1300, height=400)

In [587]:
tmp_data = all_alleles_all_datasets[all_alleles_all_datasets.apply(lambda x: x.allele[4]=="C", axis=1)]
symbols_dict = {"MHCFlurry2.0_BA": "line-ne", \
                "NetMHCpan4.1_BA": "line-nw", \
                "MHCFlurry2.0_MS": "line-ns", \
                "NetMHCpan4.1_MS": "line-ew" }
colors_dict = {"MHCFlurry2.0_BA": px.colors.qualitative.Set1[0], \
                "NetMHCpan4.1_BA": px.colors.qualitative.Set1[0], \
                "MHCFlurry2.0_MS": px.colors.qualitative.Set1[1], \
                "NetMHCpan4.1_MS": px.colors.qualitative.Set1[1]}
allele_order_df = tmp_data[tmp_data["dataset_name"]=="MHCFlurry2.0_MS"][["allele", "peptide_counts"]]\
.sort_values(by="peptide_counts", ascending=False).reset_index(drop=True).reset_index()[["allele", "index"]]
tmp_data["symbols"] = tmp_data.dataset_name.apply(lambda x: symbols_dict[x])
tmp_data["color"] = tmp_data.dataset_name.apply(lambda x: colors_dict[x])
tmp_data["allele_order"] = tmp_data.allele.apply(lambda x: allele_order_df["index"][allele_order_df.allele==x].iloc[0])
#tmp_data = tmp_data.sort_values(by=["peptide_counts", "allele_short"], ascending=[False, True])
fig = go.Figure()
for dsn in tmp_data.dataset_name.unique():
    tmp_data_ds = tmp_data[tmp_data.dataset_name == dsn].sort_values(by="allele_order")
    tmp_data_ds_no_zeros = tmp_data_ds[tmp_data_ds["peptide_counts"]!=0]
    #tmp_data_ds = tmp_data_ds.sort_values(by="allele_short")
    
    fig.add_trace(go.Scatter(mode="lines", x=tmp_data_ds.allele_short, y=tmp_data_ds.peptide_counts, \
                             marker_symbol=tmp_data_ds.symbols, opacity=0.5, \
                             marker_line_width=1, marker_size=0, name = dsn, marker_color = tmp_data_ds.color,\
                             line_color = colors_dict[dsn], line=dict(dash='dot')
                          ))
    
    fig.add_trace(go.Scatter(mode="markers", x=tmp_data_ds_no_zeros.allele_short, y=tmp_data_ds_no_zeros.peptide_counts, \
                         marker_symbol=tmp_data_ds_no_zeros.symbols, opacity=1, \
                         marker_line_width=2, marker_size=9, name = dsn, marker_color = tmp_data_ds_no_zeros.color,\
                         line_color = colors_dict[dsn], line=dict(dash='dot')
                      ))
    fig.add_trace(go.Scatter(mode="markers", x=tmp_data_ds_no_zeros.allele_short, y=tmp_data_ds_no_zeros.peptide_counts, \
                     marker_symbol="circle-open", opacity=1, \
                     marker_line_width=2, marker_size=9, name = dsn, marker_color = tmp_data_ds_no_zeros.color,\
                     line_color = colors_dict[dsn], line=dict(dash='dot')
                  ))
                          # marker_line_width=2, marker_size=15,
                          # hovertemplate="name: %{y}%{x}<br>number: %{marker.symbol}<extra></extra>"))
fig.update_layout(width=700, height=400, yaxis_range=[-1000, 25000], xaxis=dict(tickangle=45,\
                                                                                tick0=0, dtick=1.2))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [588]:
fig.write_image("scatterplot_C_alleles.svg", width=700, height=400)

In [36]:
fig_A.write_image("barplot_A_alleles.svg", width=1500, height=600)

In [58]:
#all_data["proc"] = all_data["freq"].apply(lambda x: x*100)
fig_B = px.bar(all_data_freqs[all_data_freqs.apply(lambda x: x.allele[4]=="B" and x.peptide > 0.1, axis=1)]\
             , x="allele_short", y="peptide", color='dataset_name', barmode="group", width=1500, height=300)
fig_B.update_xaxes(title="<b>HLA Allele</b>", titlefont = dict(size = 18), tickfont = dict(size = 15), tickangle = 45)
fig_B.update_yaxes(title="<b>percent within the dataset</b>", titlefont = dict(size = 18), tickfont = dict(size = 15))
#fig.add_hline(y=1, line_width=2, line_dash="dash", line_color="black")
fig_B.update_layout(showlegend=False, yaxis=dict(range=[0,11]))
fig_B.show()

In [63]:
fig_B.write_image("barplot_B_alleles.svg", width=1500, height=300)

In [62]:
#all_data["proc"] = all_data["freq"].apply(lambda x: x*100)
fig_C = px.bar(all_data_freqs[all_data_freqs.apply(lambda x: x.allele[4]=="C" and x.peptide > 0.1, axis=1)]\
             , x="allele_short", y="peptide", color='dataset_name', barmode="group", width=700, height=300)
fig_C.update_xaxes(title="<b>HLA Allele</b>", titlefont = dict(size = 18), tickfont = dict(size = 15), tickangle = 45)
fig_C.update_yaxes(title="<b>percent within the dataset</b>", titlefont = dict(size = 18), tickfont = dict(size = 15))
#fig.add_hline(y=1, line_width=2, line_dash="dash", line_color="black")
fig_C.update_layout(showlegend=False, yaxis=dict(range=[0,5]))
fig_C.show()

In [201]:
fig_C.write_image("barplot_C_alleles.svg", width=650, height=300)

## Visualize dataset balance our AFND curated data - IEDB pop coverage tool 

In [17]:
#result_files = [tmp for tmp in os.listdir("output_dataset_balance_afnd") if "pop_cov" in tmp]
#ideal_pc_files = [tmp for tmp in os.listdir("iedb_pc_results") if tmp[-8:]=="ideal.txt"]
#result_files = result_files[1:]

In [217]:
result_df_afnd = pd.read_csv("./output_dataset_balance_afnd/all_results.csv")
result_df_afnd = result_df_afnd[["population", "dataset", "js_div"]]

In [218]:
result_df_afnd

Unnamed: 0,population,dataset,js_div
0,American Samoa,hlathena,0.620448
1,American Samoa,mhcflurry,0.425913
2,American Samoa,netmhc,0.503264
3,Australia Cape York Peninsula Aborigine,hlathena,0.605182
4,Australia Cape York Peninsula Aborigine,mhcflurry,0.448087
...,...,...,...
766,Zambia Lusaka,mhcflurry,0.273066
767,Zambia Lusaka,netmhc,0.377838
768,Zimbabwe Harare Shona,hlathena,0.437495
769,Zimbabwe Harare Shona,mhcflurry,0.229575


In [3]:
afnd_df = pd.read_csv("AFND_data_locus_all.csv")

In [4]:
pop_continent_map = afnd_df.groupby(by=["Population", "country", "continent"]).sum().reset_index()
#pop_continent_map["Population"] = pop_continent_map["Population"].apply(lambda x: x.replace(" ", "").replace("/", ""))
#result_df_afnd.population = result_df_afnd.population.apply(lambda x: x.replace("_", "").replace(" ", "").replace("/", ""))

In [5]:
def find_country_continent(pop_continent_map, population):
    if population in pop_continent_map.Population.unique():
        return pop_continent_map[pop_continent_map.Population == population].country.iloc[0], \
        pop_continent_map[pop_continent_map.Population == population].continent.iloc[0]
    else:
        return None, None

In [222]:
result_df_afnd["country"] = result_df_afnd.population.apply(lambda x: find_country_continent(pop_continent_map, x)[0])
result_df_afnd["continent"] =result_df_afnd.population.apply(lambda x: find_country_continent(pop_continent_map, x)[1])

In [16]:
#all_results = all_results.sort_values(by=['dataset', "population"], ascending=[False, True])

In [223]:
result_df_afnd["dataset balance"] = result_df_afnd.js_div.apply(lambda x: 1-x)

In [224]:
result_df_afnd = result_df_afnd.sort_values(by=["dataset"], ascending=[False])

In [305]:
fig = px.box(result_df_afnd, x="continent", y="dataset balance", color="dataset",width=800, height=500, points="all")
fig.add_hline(y=1, line_width=2, line_dash="dash", line_color="red")
fig.show()

In [226]:
fig.write_image("dataset_afnd_balance.svg", width=800, height=500)

In [17]:
global_south_77 = pd.read_csv("global_south_77.txt", header=None)
global_south_77.columns = ["country"]
global_south_77 = global_south_77.append({"country": "China"}, ignore_index=True)


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [18]:
def map_population_to_gs_gn(population):
    if global_south_77.country.apply(lambda x: x in population).any():
        if "Caucasian" in population:
            return "Global North"
        return "Global South"
    else:
        if "African" in population or "Aborigine" in population or "Asian" in population \
        or "Vietnam" in population \
        or "Hispanic" in population or "Mexican" in population or "Vietnamese" in population\
        or "Maori" in population or "Polynesians" in population or "Sao Tome Island" in population:
            return "Global South"
        return "Global North"

In [21]:
gn_gs_map = { pop: map_population_to_gs_gn(pop) for pop in afnd_df.Population.unique()}

In [22]:
gn_gs_map

{'American Samoa': 'Global South',
 'Australia Cape York Peninsula Aborigine': 'Global South',
 'Australia Groote Eylandt Aborigine': 'Global South',
 'Australia Kimberly Aborigine': 'Global South',
 'Australia New South Wales Caucasian': 'Global North',
 'Australia Yuendumu Aborigine': 'Global South',
 'Bolivia/Chile Aymara NA-DHS_13 (G)': 'Global South',
 'Bolivia/Peru Quechua NA-DHS_12 (G)': 'Global South',
 'Brazil Barra Mansa Rio State Black': 'Global South',
 'Brazil Barra Mansa Rio State Caucasian': 'Global North',
 'Brazil Mixed': 'Global South',
 'Brazil Puyanawa': 'Global South',
 'Brazil Rio de Janeiro Black': 'Global South',
 'Brazil Rio de Janeiro Caucasian': 'Global North',
 'Brazil Rio de Janeiro Parda': 'Global South',
 'Brazil Terena': 'Global South',
 'Bulgaria Romani': 'Global North',
 'Cameroon Baka Pygmy': 'Global South',
 'Cameroon Beti': 'Global South',
 'Cameroon Sawa': 'Global South',
 'Canada Chipewyan NA-DHS_2 (G)': 'Global North',
 'Canada Cree NA-DHS_3 (G)'

In [232]:
result_df_afnd["GlobalSouth/GlobalNorth"] = result_df_afnd.population.apply(lambda x: gn_gs_map[x])

In [302]:
fig = px.violin(result_df_afnd, x="GlobalSouth/GlobalNorth", y="dataset balance", points="all", \
             color="dataset",width=650, height=500, orientation="v")
fig.add_hline(y=1, line_width=2, line_dash="dash", line_color="red")
fig.show()

In [180]:
fig.write_image("dataset_afnd_balance_gs_gn.svg", width=650, height=500)

## Visualize dataset balance IEDB ethnicities - IEDB pop coverage tool 

In [6]:
import os

In [7]:
result_df = pd.read_csv("./output_dataset_balance_iedb/all_results.csv")
result_df = result_df[["population", "dataset", "js_div"]]
result_df

FileNotFoundError: [Errno 2] No such file or directory: './output_dataset_balance_iedb/all_results.csv'

In [264]:
iedb_pop_map = pd.read_csv("./IEDB_population_map.csv")
iedb_pop_map = iedb_pop_map[["area", "country", "population", "ethnicity"]]
iedb_pop_map

Unnamed: 0,area,country,population,ethnicity
0,East Asia,Japan,Japan Oriental,Oriental
1,East Asia,Korea; South,Korea; South Oriental,Oriental
2,East Asia,Mongolia,Mongolia Oriental,Oriental
3,Northeast Asia,China,China Oriental,Oriental
4,Northeast Asia,Hong Kong,Hong Kong Oriental,Oriental
...,...,...,...,...
156,Oceania,Kiribati,Kiribati Micronesian,Micronesian
157,Oceania,New Zealand,New Zealand Polynesian,Polynesian
158,Oceania,Fiji,Fiji Melanesian,Melanesian
159,Oceania,Nauru,Nauru Micronesian,Micronesian


In [265]:
iedb_pop_map["population_no_suffix"] = iedb_pop_map.population.apply(lambda x: " ".join(x.split()[:-1]))

In [266]:
def match_population_to_iedb_map(population, iedb_map_df, column):
    
    try1 = iedb_map_df[iedb_map_df.population_no_suffix==population]
    if try1.shape[0] > 0:
        return try1.iloc[0][column]
    
    try2 = iedb_map_df[iedb_map_df.population==population]
    if try2.shape[0] > 0:
        return try2.iloc[0][column]
    return "?"

In [267]:
result_df["area"] = result_df.population.apply(lambda x: match_population_to_iedb_map(x, iedb_pop_map, "area"))
result_df["country"] = result_df.population.apply(lambda x: match_population_to_iedb_map(x, iedb_pop_map, "country"))
result_df["ethnicity"] = result_df.population.apply(lambda x: match_population_to_iedb_map(x, iedb_pop_map, "ethnicity"))

In [268]:
result_df[result_df["area"]=="?"].population.unique()

array(['Amerindian', 'Arab', 'Asian', 'Australian Aborigines',
       'Austronesian', 'Black', 'Caucasoid', 'Central Africa',
       'East Africa', 'East Asia', 'Europe', 'Hispanic', 'Jew', 'Kurd',
       'Melanesian', 'Mestizo', 'Mixed', 'North Africa', 'North America',
       'Northeast Asia', 'Oceania', 'Oriental', 'Other', 'Persian',
       'Polynesian', 'Siberian', 'South America', 'South Asia',
       'Southeast Asia', 'Southwest Asia', 'West Africa', 'West Indies',
       'World'], dtype=object)

In [269]:
result_df = result_df[~(result_df["area"]=="?")].reset_index()

In [270]:
result_df["dataset balance"] = result_df.js_div.apply(lambda x: 1-x)

In [271]:
result_df

Unnamed: 0,index,population,dataset,js_div,area,country,ethnicity,dataset balance
0,0,American Samoa,hlathena,0.623349,Oceania,American Samoa,Polynesian,0.376651
1,1,American Samoa,mhcflurry,0.433333,Oceania,American Samoa,Polynesian,0.566667
2,2,American Samoa,netmhc,0.509325,Oceania,American Samoa,Polynesian,0.490675
3,3,American Samoa Polynesian,netmhc,0.509325,Oceania,American Samoa,Polynesian,0.490675
4,13,Australia,hlathena,0.572915,Oceania,Australia,Caucasoid,0.427085
...,...,...,...,...,...,...,...,...
359,458,Zimbabwe,mhcflurry,0.243802,East Africa,Zimbabwe,Black,0.756198
360,459,Zimbabwe,netmhc,0.350945,East Africa,Zimbabwe,Black,0.649055
361,460,Zimbabwe Black,hlathena,0.447735,East Africa,Zimbabwe,Black,0.552265
362,461,Zimbabwe Black,mhcflurry,0.243802,East Africa,Zimbabwe,Black,0.756198


In [272]:
result_df = result_df.sort_values(by=["ethnicity", "dataset"], ascending=[True, False])

In [288]:
fig = px.box(result_df[~result_df.ethnicity.isin(["Mixed", "Mulato", "Mestizo", "Jew", \
                                                  "Siberian", "Hispanic", "Persian", "Other"])], \
             x="ethnicity", y="dataset balance", color="dataset",width=1000, height=500, points="all")
fig.add_hline(y=1, line_width=2, line_dash="dash", line_color="red")
fig.show()

In [301]:
fig = px.violin(result_df, \
             x="area", y="dataset balance", color="dataset",width=1200, height=500, points="all")
fig.add_hline(y=1, line_width=2, line_dash="dash", line_color="red")
fig.show()

In [290]:
coarse_grained_continents = {
"Oceania": "Oceania",
"South America": "South America",
"Europe": "Europe",
"West Africa": "Africa",
"North Africa": "Africa",
"South Africa": "Africa",
"East Africa": "Africa",
"Central Africa": "Africa",
"East Asia": "Asia",
"Northeast Asia": "Asia",
"Southwest Asia": "Asia",
"Southeast Asia": "Asia",
"South Asia": "Asia",
"North America": "North America"}

In [291]:
result_df["continent"] = result_df["area"].apply(lambda x: coarse_grained_continents[x])

In [300]:
fig = px.violin(result_df, \
             x="continent", y="dataset balance", color="dataset",width=800, height=500, points="all")
fig.add_hline(y=1, line_width=2, line_dash="dash", line_color="red")
fig.show()

In [294]:
global_south_77 = pd.read_csv("global_south_77.txt", header=None)
global_south_77.columns = ["country"]
global_south_77 = global_south_77.append({"country": "China"}, ignore_index=True)


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [295]:
def map_population_to_gs_gn_iedb(population):
    if global_south_77.country.apply(lambda x: x in population).any():
        if "Caucasoid" in population:
            return "Global North"
        return "Global South"
    else:
        if "African" in population or "Aborigine" in population or "Asian" in population \
        or "Vietnam" in population \
        or "Hispanic" in population or "Mexican" in population or "Vietnamese" in population\
        or "Black" in population or "Amerindian" in population or "Hispanic" in population \
        or "Austronesian" in population or "Austronesian" in population \
        or "Maori" in population or "Polynesians" in population or "Sao Tome Island" in population:
            return "Global South"
        return "Global North"

In [296]:
result_df["GlobalSouth/GlobalNorth"] = result_df.population.apply(lambda x: map_population_to_gs_gn_iedb(x))

In [299]:
fig = px.violin(result_df[~result_df.ethnicity.isin(["Mixed", "Mulato", "Mestizo", "Other"])], \
             x="GlobalSouth/GlobalNorth", y="dataset balance", color="dataset",width=800, height=500, points="all")
fig.add_hline(y=1, line_width=2, line_dash="dash", line_color="red")
fig.show()

## Visualize PC90

In [100]:
### Worldbank country income classifications

# from here: https://datahelpdesk.worldbank.org/knowledgebase/articles/906519-world-bank-country-and-lending-groups
#  `current classification by income in XLSX format`
world_bank_mapping = pd.read_csv("WorldBank_CLASS.csv", encoding='latin-1')
world_bank_mapping

Unnamed: 0,Economy,Code,Region,Income group,Lending category,Other (EMU or HIPC)
0,Aruba,ABW,Latin America & Caribbean,High income,,
1,Afghanistan,AFG,South Asia,Low income,IDA,HIPC
2,Angola,AGO,Sub-Saharan Africa,Lower middle income,IBRD,
3,Albania,ALB,Europe & Central Asia,Upper middle income,IBRD,
4,Andorra,AND,Europe & Central Asia,High income,,
...,...,...,...,...,...,...
213,Kosovo,XKX,Europe & Central Asia,Upper middle income,IDA,
214,"Yemen, Rep.",YEM,Middle East & North Africa,Low income,IDA,
215,South Africa,ZAF,Sub-Saharan Africa,Upper middle income,IBRD,
216,Zambia,ZMB,Sub-Saharan Africa,Low income,IDA,HIPC


In [135]:
def map_country_to_world_bank(country, wb_map):
    
    #adjust some country names
    if country == "England":
        country = "United Kingdom"
    if country == "Gaza":
        country = "West Bank and Gaza"
    if country == "Hong Kong":
        country = "Hong Kong SAR, China"
    if country == "Iran":
        country = "Iran, Islamic Rep."
    if country == "Ireland Northern" or country == "Ireland South":
        country = "Ireland"
    if country == "Russia":
        country = "Russian Federation"
    if country == "South Korea":
        country = "Korea, Rep."
    if country == "Taiwan":
        country = "Taiwan, China"
    if country == "Venezuela":
        country = "Venezuela, RB"
    
    wb_row = wb_map[wb_map["Economy"]==country]
    if wb_row.shape[0] > 0:
        return wb_row.iloc[0]["Income group"],wb_row.iloc[0]["Region"]
    else:
        return None, None

In [136]:
population_wb_income = pd.DataFrame({"population": pc90_results.population,
              "income_group": pc90_results.country.apply(lambda x: map_country_to_world_bank(x, world_bank_mapping)[0]),
              "region": pc90_results.country.apply(lambda x: map_country_to_world_bank(x, world_bank_mapping)[1])
             })
population_wb_income

Unnamed: 0,population,income_group,region
0,American Samoa,Upper middle income,East Asia & Pacific
1,American Samoa,Upper middle income,East Asia & Pacific
2,Australia Cape York Peninsula Aborigine,High income,East Asia & Pacific
3,Australia Cape York Peninsula Aborigine,High income,East Asia & Pacific
4,Australia Groote Eylandt Aborigine,High income,East Asia & Pacific
...,...,...,...
509,Vietnam Kinh,Lower middle income,East Asia & Pacific
510,Zambia Lusaka,Low income,Sub-Saharan Africa
511,Zambia Lusaka,Low income,Sub-Saharan Africa
512,Zimbabwe Harare Shona,Lower middle income,Sub-Saharan Africa


In [137]:
population_wb_income = population_wb_income.drop_duplicates().reset_index(drop=True)

In [284]:
dir_location_ms = "./pc90_afnd_ms_all_datasets"

In [285]:
dir_location_ba = "./pc90_afnd_ba_datasets"

In [286]:
pc90_ba_results = pd.read_csv(dir_location_ba+"/all_results.csv")
pc90_ms_results = pd.read_csv(dir_location_ms+"/all_results.csv")
pc90_ba_results["dataset"] = pc90_ba_results["dataset"].apply(lambda x: x+"_BA")
pc90_ms_results["dataset"] = pc90_ms_results["dataset"].apply(lambda x: x+"_MS")

In [287]:
#pc90_results = pd.read_csv(dir_location+"/all_results.csv")
pc90_results = pd.concat([pc90_ms_results, pc90_ba_results])

In [288]:
pc90_results["country"] = pc90_results.population.apply(lambda x: find_country_continent(pop_continent_map, x)[0])
pc90_results["continent"] =pc90_results.population.apply(lambda x: find_country_continent(pop_continent_map, x)[1])
pc90_results["income"] =  pc90_results.population.apply(lambda x: population_wb_income[population_wb_income.population == \
                                                                                      x].iloc[0].income_group)
pc90_results["wb_region"] =  pc90_results.population.apply(lambda x: population_wb_income[population_wb_income.population == \
                                                                                      x].iloc[0].region)

In [260]:
pc90_results[pc90_results["income"]=="Low income"].population.unique()

array(['Mali Bandiagara', 'Uganda Kampala', 'Uganda Kampala pop 2',
       'Zambia Lusaka'], dtype=object)

In [156]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(pc90_results[(pc90_results["income"]=="High income") & 
                 (pc90_results["dataset"]=="mhcflurry")].sort_values(by="pc90_scaled").reset_index(drop=True))

                                            population    dataset      pc90  \
0                         Australia Kimberly Aborigine  mhcflurry    548.32   
1                         Australia Yuendumu Aborigine  mhcflurry    690.80   
2                                 Singapore Riau Malay  mhcflurry   2729.25   
3                             Singapore SGVP Malay MAS  mhcflurry   2936.49   
4                                       Singapore Thai  mhcflurry   3262.34   
5                                    USA NMDP Filipino  mhcflurry   3436.62   
6              Israel Ashkenazi and Non Ashkenazi Jews  mhcflurry   4052.33   
7                                  Singapore Javaneses  mhcflurry   4186.07   
8                                  USA NMDP Vietnamese  mhcflurry   4281.06   
9                           USA African American pop 3  mhcflurry   4747.93   
10                                    USA NMDP African  mhcflurry   4871.37   
11                  Australia Groote Eylandt Aborigi

In [261]:
population_counts = pc90_results[["population", "country"]].drop_duplicates().reset_index(drop=True).\
                    groupby(by="country").count().reset_index()[["country", "population"]].sort_values("population", \
                                                                                                        ascending=[False])

In [262]:
population_counts[:20]

Unnamed: 0,country,population
58,United States,46
11,China,27
54,Taiwan,17
26,India,12
37,Mexico,11
21,Germany,11
6,Brazil,10
12,Colombia,10
13,Costa Rica,6
44,Papua New Guinea,6


In [251]:
tmp_pop_country = pc90_results[["population", "country"]].drop_duplicates().reset_index(drop=True)
tmp_pop_country[tmp_pop_country["country"]=="South Africa"]

Unnamed: 0,population,country
177,South Africa Caucasians,South Africa
178,South Africa Natal Tamil,South Africa
179,South Africa Worcester,South Africa
180,South African Black,South Africa
182,South African Mixed ancestry,South Africa


In [341]:
pc90_usa = pc90_results[pc90_results.population.apply(lambda x: "USA" in x)]

In [267]:
pc90_australia = pc90_results[pc90_results.population.apply(lambda x: "Australia" in x)]

In [268]:
pc90_brazil = pc90_results[pc90_results.country == "Brazil"]

In [352]:
def usa_population_ancestry(population):
    if "Caucasian" in population \
    or "Italy" in population or "European" in population :
        return "USA European"
    if "African" in population or "Caribbean" in population\
        or "Black" in population:
        return "USA African American"
    if "Asian" in population or "Japanese" in population \
        or "Korean" in population or "Vietnamese" in population:
        return "USA Asian"
    if "Hispanic" in population:
        return "USA Hispanic"
    return "Other"

def australia_population_ancs(population):
    if "Aborigine" in population:
        return "Aborigine"
    if "Caucasian" in population:
        return "European"
    return "Other"

def brazil_population_ancs(population):
    if "Black" in population:
        return "African"
    if "Caucasian" in population:
        return "European"
    if "Puyanawa" or "Terena" or "Ticuna" in population:
        return "Indigenous"
    return "Other"

In [270]:
pc90_brazil["ancestry"] = pc90_brazil.population.apply(lambda x: brazil_population_ancs(x))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [271]:
pc90_australia["ancestry"] = pc90_australia.population.apply(lambda x: australia_population_ancs(x))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [353]:
pc90_usa["ancestry"] = pc90_usa.population.apply(lambda x: usa_population_ancestry(x))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [299]:
pc90_results["PC90 (%)"] = pc90_results.pc90_scaled.apply(lambda x: x*100)

In [308]:
pc90_results["income-two"] = pc90_results["income"].apply(lambda x: "High and upper middle income" \
                                                          if x in ["High income", "Upper middle income"] \
                                                         else "Low and Lower middle income")

In [325]:
pc90_results = pc90_results[~pc90_results["income"].isna()]

In [326]:
income_ranks = {"High income": 1, \
               "Upper middle income": 2,\
               "Lower middle income": 3, \
               "Low income": 4}
pc90_results["income-rank"] = pc90_results["income"].apply(lambda x: income_ranks[x])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [337]:
dataset_names = {"netmhc_MS": "NetMHCpan4.1_MS", \
               "mhcflurry_MS": "MHCFlurry2.0_MS",\
               "netmhc_BA": "NetMHCpan4.1_BA", \
               "mhcflurry_BA": "MHCFlurry2.0_BA"}
dataset_ranks = {"netmhc_MS": 3, \
               "mhcflurry_MS": 4,\
               "netmhc_BA": 1, \
               "mhcflurry_BA": 2}
pc90_results["dataset_name"] = pc90_results["dataset"].apply(lambda x: dataset_names[x])
pc90_results["dataset_ranks"] = pc90_results["dataset"].apply(lambda x: dataset_ranks[x])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [712]:
fig = px.box(pc90_results.sort_values(by=["income-rank", "dataset_ranks"]), #pc90_brazil[~(pc90_brazil["ancestry"]=="Other")], \
             x="dataset_name", y="PC90 (%)", color="income",width=1000, height=500, points="all",\
              hover_data=["population"], \
            color_discrete_sequence=px.colors.qualitative.Safe)
#fig.add_hline(y=1, line_width=2, line_dash="dash", line_color="red")
fig.update_layout(yaxis_range=[0, 9.5],
                  boxgroupgap=0.4, # update
                  boxgap=0.1)
fig.show()

In [593]:
fig.write_image("PC90_incomes.svg", width=1000, height=500)

In [366]:
fig = px.box(pc90_results.sort_values(by=["income-rank", "dataset_ranks"]), #pc90_brazil[~(pc90_brazil["ancestry"]=="Other")], \
             x="dataset_name", y="PC90 (%)", color="wb_region",width=1000, height=500, #points="all",\
              hover_data=["population"], \
            color_discrete_sequence=px.colors.qualitative.Vivid)
#fig.add_hline(y=1, line_width=2, line_dash="dash", line_color="red")
fig.show()

In [594]:
ancs_ranks = {"USA African American": 1, \
               "USA Asian": 2,\
               "USA Hispanic": 3, \
               "USA European": 4, \
             "Other": 5}
pc90_usa["ansc_rank"] = pc90_usa["ancestry"].apply(lambda x: ancs_ranks[x])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [769]:
fig = px.box(pc90_usa[~(pc90_usa["ancestry"]=="Other")].sort_values(by=["dataset_ranks", "ansc_rank"], \
                                                                    ascending = [True, False]), #pc90_results[pc90_results.country == "Mexico"], #pc90_brazil[~(pc90_brazil["ancestry"]=="Other")], \
             x="dataset_name", y="PC90 (%)", color="ancestry",width=900, height=500, points="all",
              hover_data=["population"], \
            color_discrete_sequence=px.colors.sequential.ice[2::2])
#fig.add_hline(y=1, line_width=2, line_dash="dash", line_color="red")
fig.update_layout(yaxis_range=[0, 9.5])
fig.show()

In [770]:
fig.write_image("PC90_usa_ancs.svg", width=900, height=500)

In [768]:
fig = px.scatter(pc90_australia, #[~(pc90_usa["ancestry"]=="Other")], \
             x="population", y="pc90_scaled", color="dataset_name",width=700, height=500)
#fig.add_hline(y=1, line_width=2, line_dash="dash", line_color="red")
fig.show()

ValueError: Value of 'color' is not the name of a column in 'data_frame'. Expected one of ['population', 'dataset', 'pc90', 'average_hits', 'pc90_scaled', 'average_hits_scaled', 'country', 'continent', 'income', 'wb_region', 'ancestry'] but received: dataset_name

In [694]:
region_ranks_map = {
    "Europe": .0,
    "North America": 1,
    "South America": 2,
    "Asia": 3,
    "Oceania": 4,
    "Africa": 5   
}
pc90_results["continent_ranks"] = pc90_results["continent"].apply(lambda x: region_ranks_map[x])

In [725]:
afdn_regions = pd.read_csv("./country_region_map.csv")

In [726]:
afdn_regions[afdn_regions["region"]=="Oceania"].country.unique()

array([nan, 'American Samoa', 'Borneo', 'Bouvet Island',
       'British Indian Ocean Territory', 'Christmas Island',
       'Cocos Islands', 'Cook Islands', 'East Timor',
       'Federal States of Micronesia', 'Fiji', 'French Polynesia',
       'French Southern Territories', 'Guam',
       'Heard and Mc Donald Islands', 'Indonesia', 'Kiribati',
       'Madagascar', 'Maldives', 'Marshall Islands', 'Nauru',
       'New Caledonia', 'New Zealand', 'Niue', 'Norfolk Island',
       'Northern Mariana Islands', 'Palau', 'Papua New Guinea',
       'Philippines', 'Pitcairn Islands', 'Samoa', 'Solomon Islands',
       'Timor-Leste', 'Tokelau', 'Tonga', 'Tuvalu', 'Vanuatu',
       'Wallis and Futuna Islands'], dtype=object)

In [733]:
def map_afnd_region(x, afdn_regions):
    tmp =  afdn_regions[afdn_regions["country"]==x].region
    if tmp.shape[0] > 0:
        return tmp.iloc[0]
    else:
        if "Kosovo" in x or "Ireland" in x:
            return "Europe"
        return None

In [734]:
pc90_results["afnd_region"] = pc90_results.country.apply(lambda x: map_afnd_region(x, afdn_regions))

In [760]:
fig = px.box(pc90_results.sort_values(by=["dataset_ranks", "continent_ranks"], ascending = [True, True]), \
             x="dataset_name", y="PC90 (%)", color="afnd_region", width=1000, height=500, #points="all", \
             #boxmode="overlay",\ 
            hover_data=["population"], color_discrete_sequence=px.colors.qualitative.Dark24)
#fig.add_hline(y=1, line_width=2, line_dash="dash", line_color="red")
'''
fig = px.box(pc90_usa[~(pc90_usa["ancestry"]=="Other")].sort_values(by=["dataset_ranks", "ansc_rank"], \
                                                                    ascending = [True, False]), #pc90_results[pc90_results.country == "Mexico"], #pc90_brazil[~(pc90_brazil["ancestry"]=="Other")], \
             x="dataset_name", y="PC90 (%)", color="ancestry",width=900, height=500, points="all",
              hover_data=["population"], marker_line\
            color_discrete_sequence=px.colors.sequential.ice[4::2])
'''
#fig.add_hline(y=1, line_width=2, line_dash="dash", line_color="red")
fig.update_layout(yaxis_range=[0, 9.5],
                  boxgroupgap=0.5, #boxmode = "overlay",# update
                  boxgap=0.2)
fig.show()

In [761]:
fig.write_image("PC90_global_regions.svg", width=900, height=500)

In [94]:
pc90_results["GlobalSouth/GlobalNorth"] = pc90_results.population.apply(lambda x: gn_gs_map[x])

In [95]:
fig = px.box(pc90_results, x="GlobalSouth/GlobalNorth", y="pc90", \
             color="dataset",width=650, height=500, orientation="v")
fig.show()

In [293]:
pc90_results[(pc90_results["dataset"] == "mhcflurry_BA") & \
            (pc90_results["income"] == "Lower middle income")].sort_values(by=["pc90_scaled"])

Unnamed: 0,population,dataset,pc90,average_hits,pc90_scaled,average_hits_scaled,country,continent,income,wb_region
214,India West Coast Parsi,mhcflurry_BA,419.25,4431.76,0.002243,0.023708,India,Asia,Lower middle income,South Asia
34,Cameroon Baka Pygmy,mhcflurry_BA,1136.59,6551.16,0.00608,0.035046,Cameroon,Africa,Lower middle income,Sub-Saharan Africa
38,Cameroon Sawa,mhcflurry_BA,1616.95,8132.72,0.00865,0.043507,Cameroon,Africa,Lower middle income,Sub-Saharan Africa
508,Vietnam Kinh,mhcflurry_BA,2667.85,10716.41,0.014272,0.057329,Vietnam,Asia,Lower middle income,East Asia & Pacific
238,Kenya,mhcflurry_BA,3141.33,11446.8,0.016805,0.061236,Kenya,Africa,Lower middle income,Sub-Saharan Africa
240,Kenya Luo,mhcflurry_BA,3359.97,12345.44,0.017975,0.066043,Kenya,Africa,Lower middle income,Sub-Saharan Africa
404,Tanzania Maasai,mhcflurry_BA,3730.11,13300.18,0.019955,0.071151,Tanzania,Africa,Lower middle income,Sub-Saharan Africa
512,Zimbabwe Harare Shona,mhcflurry_BA,3743.36,11747.58,0.020026,0.062845,Zimbabwe,Africa,Lower middle income,Sub-Saharan Africa
506,Vietnam Hanoi Kinh pop 2,mhcflurry_BA,3854.98,10982.36,0.020623,0.058752,Vietnam,Asia,Lower middle income,East Asia & Pacific
302,Papua New Guinea East New Britain Rabaul,mhcflurry_BA,4143.94,8188.42,0.022169,0.043805,Papua New Guinea,Oceania,Lower middle income,East Asia & Pacific


In [12]:
pd.read_csv(file, nrows=1, skiprows=1, sep="\t")

Unnamed: 0,population/area,coverage,average_hit,pc90
0,American Samoa,100.0%,5926.3,3013.05


In [None]:

#pop_continent_map["Population"] = pop_continent_map["Population"].apply(lambda x: x.replace(" ", "").replace("/", ""))
#result_df_afnd.population = result_df_afnd.population.apply(lambda x: x.replace("_", "").replace(" ", "").replace("/", ""))