In [None]:


import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import upsetplot

def parse_input(path='meta/COMBO PRO Banked Samples Aliquot Clinical Data.xlsx'):
    """Read every sheet of the aliquot clinical workbook into one table.

    Parameters
    ----------
    path : str, optional
        Workbook to read. Defaults to the path this notebook has always used,
        so existing ``parse_input()`` calls are unaffected.

    Returns
    -------
    pandas.DataFrame
        The rows of all sheets stacked in sheet order, with an added ``loc``
        column holding the name of the sheet each row came from. Each sheet
        keeps its own row index, so index labels repeat across sheets.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when the workbook contains no sheets.
    """
    sheets = []
    # Open the workbook once and close it when done; the previous version left
    # the ExcelFile handle open and re-opened the file for every sheet.
    with pd.ExcelFile(path) as workbook:
        for sheet_name in workbook.sheet_names:
            sheet = workbook.parse(sheet_name)
            sheet['loc'] = sheet_name
            sheets.append(sheet)

    return pd.concat(sheets)
        
def fetch_meta():
    """Load the banked-plasma clinical sheet and derive plotting helpers.

    Returns
    -------
    meta : pandas.DataFrame
        The sheet restricted to rows that have a ``TB Classification``.
        ``COMBO Plasma Box Number`` and ``TB Classification`` are converted to
        categoricals, and their integer category codes are stored in ``code``
        and ``TB code`` respectively.
    col_colors : pandas.Series
        One colour string per row of the sheet as read (i.e. before rows
        without a TB classification are dropped), indexed by ``COMBO ID``.
        Rows with a missing box number, or whose box falls beyond the ninth
        distinct box, get NaN because the palette has nine entries.
    """
    palette = ["#E64B35B2", "#4DBBD5B2", "#00A087B2", "#3C5488B2", "#F39B7FB2",
               "#8491B4B2", "#91D1C2B2", "#DC0000B2", "#7E6148B2"]
    palette = [colour.lower() for colour in palette]

    meta = pd.read_excel('meta/COMBO PRO Banked Plasma Samples Clinical.xlsx')

    # Sort the distinct boxes so that each box always receives the same colour.
    # Iterating a set (as before) gave an order that could change between
    # kernel sessions, silently reshuffling the colours.
    boxes = sorted(meta['COMBO Plasma Box Number'].dropna().unique(), key=str)
    enc = dict(zip(boxes, palette))
    col_colors = meta['COMBO Plasma Box Number'].map(enc)
    col_colors.index = meta['COMBO ID']

    meta['COMBO Plasma Box Number'] = pd.Categorical(
        meta['COMBO Plasma Box Number'])
    meta['TB Classification'] = pd.Categorical(
        meta['TB Classification'])

    # Codes are taken before the dropna below; missing classifications get
    # code -1 and are removed right after.
    meta['TB code'] = meta['TB Classification'].cat.codes
    meta['code'] = meta['COMBO Plasma Box Number'].cat.codes
    meta = meta.dropna(subset=['TB Classification'])
    return meta, col_colors



def preprocess_metadata(flt):
    """Build sample metadata and per-sample colours for a subset of samples.

    Parameters
    ----------
    flt : iterable
        ``COMBO ID`` values to keep; rows of the combined workbook whose ID is
        not in ``flt`` are discarded.

    Returns
    -------
    meta : pandas.DataFrame
        The retained rows that have a ``TB Classification``, with
        ``COMBO Plasma Box Number`` converted to a categorical and its integer
        codes in ``code``; ``TB code`` holds the category codes of
        ``TB Classification``.
    col_colors : pandas.Series
        One colour string per retained row (before rows without a TB
        classification are dropped), indexed by ``COMBO ID``.
    """
    palette = ["#E64B35B2", "#4DBBD5B2", "#00A087B2", "#3C5488B2", "#F39B7FB2",
               "#8491B4B2", "#91D1C2B2", "#DC0000B2", "#7E6148B2"]
    palette = [colour.lower() for colour in palette]

    meta = parse_input()

    # The colour table is built from ALL boxes, before filtering, and in sorted
    # order so a box keeps its colour regardless of `flt` or kernel session.
    # Iterating a set (as before) made the assignment order arbitrary.
    boxes = sorted(meta['COMBO Plasma Box Number'].dropna().unique(), key=str)
    enc = dict(zip(boxes, palette))

    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of the combined table (SettingWithCopyWarning).
    meta = meta[meta['COMBO ID'].isin(flt)].copy()

    col_colors = meta['COMBO Plasma Box Number'].map(enc)
    col_colors.index = meta['COMBO ID']

    meta['COMBO Plasma Box Number'] = pd.Categorical(
        meta['COMBO Plasma Box Number'])
    # Missing classifications get code -1 and are dropped below.
    meta['TB code'] = pd.Categorical(meta['TB Classification']).codes
    meta['code'] = meta['COMBO Plasma Box Number'].cat.codes
    meta = meta.dropna(subset=['TB Classification'])
    return meta, col_colors


## peptides per run/proteins per run

def plot_run_order(df, meta, suf):
    '''
    Tabulate identifications per sample in acquisition (run) order.

    Despite the name, nothing is drawn here; the returned table is meant to be
    plotted as a line plot by the caller.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format report with a ``File.Name`` column plus, for
        ``suf == 'pep'``, ``Precursor.Id`` / ``Precursor.Quantity`` and, for
        ``suf == 'prot'``, ``Protein.Group`` / ``Genes.Quantity``.
    meta : pandas.DataFrame
        Must provide ``COMBO ID`` and ``COMBO Plasma Box Number``.
    suf : str
        ``'pep'`` to count precursors per file, ``'prot'`` to count protein
        groups per file.

    Returns
    -------
    grp : pandas.DataFrame
        One row per (sample, box) with columns ``rid`` (1-based position in
        run order), ``batch`` (box label with ``'PRO '`` removed), ``max``
        (highest count over the files of that sample), ``run_nr`` (run number
        minus the smallest run number) and ``batch_cl`` (text of ``batch``
        before ``'Plasma'``). Samples without a box in ``meta`` are dropped by
        the groupby.
    mp : dict
        Sample id -> run number string, both parsed from the file name.

    Raises
    ------
    ValueError
        If ``suf`` is neither ``'pep'`` nor ``'prot'``.
    '''
    if suf == 'pep':
        index_col, value_col = 'Precursor.Id', 'Precursor.Quantity'
    elif suf == 'prot':
        index_col, value_col = 'Protein.Group', 'Genes.Quantity'
    else:
        # Previously fell through and failed later with UnboundLocalError.
        raise ValueError("suf must be 'pep' or 'prot', got {!r}".format(suf))

    piv = pd.pivot_table(df, index=index_col,
                         columns='File.Name', values=value_col)
    # Non-missing entries per file; to_frame() names the single column 0.
    grp = piv.count().to_frame()

    # File names look like <dir>\<sample id>_..._<run number>.d
    stems = [x.split('\\')[-1].replace('.d', '').split('_')
             for x in list(grp.index)]
    grp['run_nr'] = [parts[-1] for parts in stems]
    grp['ids'] = [parts[0] for parts in stems]
    grp['batch'] = grp['ids'].map(
        dict(zip(meta['COMBO ID'], meta['COMBO Plasma Box Number'])))
    mp = dict(zip(grp['ids'], grp['run_nr']))

    grp = grp.groupby(['ids', 'batch'])[0].max().reset_index()
    grp.columns = ['rid', 'batch', 'max']

    # The groupby above removes the run_nr column, so it is rebuilt from the
    # sample -> run number lookup. Reading grp['run_nr'] directly, as the
    # earlier version did, raised KeyError.
    grp['run_nr'] = grp['rid'].map(mp).astype(int)
    grp['run_nr'] = grp['run_nr'] - grp['run_nr'].min()

    grp['batch'] = [x.replace('PRO ', '') for x in grp['batch']]
    # Order numerically; sorting the run number strings would put '10' before '9'.
    grp = grp.sort_values(by='run_nr', kind='stable')
    grp['batch_cl'] = [x.split('Plasma')[0] for x in grp['batch']]
    grp['rid'] = list(range(1, grp.shape[0] + 1))
    return grp, mp



# Read the full report; the Protein.Group -> Genes lookup is taken from the
# unfiltered table, then only rows with Lib.PG.Q.Value <= 0.01 are kept.
df = pd.read_csv('data/COMBO_V3.tsv', sep='\t')
mp = dict(zip(df['Protein.Group'], df['Genes']))
df = df[df['Lib.PG.Q.Value'].le(0.01)]

# Only the metadata table is needed here; the colour series is discarded.
meta, _ = fetch_meta()

# tmp = []
# for suf in ['pep', 'prot']:
#     grp, id2run_nr = plot_run_order(df, meta, suf)
#     grp['type'] = suf
#     tmp.append(grp)

# tmp = pd.concat(tmp)
#tmp.to_csv('meta/info_pep_level_run.csv')


In [None]:
import seaborn as sns
import pandas as pd
import numpy as np

# Multiplier for each concentration unit, relative to ng/L (ng/L maps to 1).
# This must be an explicit mapping: the earlier version zipped two *sets*,
# whose iteration order is arbitrary, so a unit could be paired with the
# wrong factor.
kf = {'g/L': 10**9, 'mg/L': 10**6, 'ng/L': 1, 'µg/L': 10**3}

# Kept under their old names in case another cell reads them.
kk = set(kf)
cc = set(kf.values())


dd = pd.read_excel('meta/molarity_plasma_antibody.xlsx')
# Flag entries whose gene name appears in the filtered report loaded earlier.
dd['detected'] = dd['GN'].isin(set(df['Genes']))
dd.dropna(inplace=True)
# 'Conc' is split on spaces: first token is the amount, second is the unit.
dd['unit'] = [kf[x.split(' ')[1]] for x in dd['Conc']]
dd['conc'] = [x.split(' ')[0] for x in dd['Conc']]
dd.to_csv('Fig1F.csv')
dd

In [None]:
from mypy import biostat as bst
import pandas as pd
import statsmodels as stats
import numpy as np

def fc_pv_calc(subdf):
    """Compare 'Confirmed TB' against 'Unlikely TB' within one group of rows.

    Parameters
    ----------
    subdf : pandas.DataFrame
        Needs a ``TB Classification`` column and a numeric ``value`` column.
        Rows with any other classification are ignored.

    Returns
    -------
    pandas.DataFrame
        A single row with two columns: column 0 is the mean ``value`` of the
        confirmed rows minus the mean of the unlikely rows, column 1 is the
        p-value of ``scipy.stats.ttest_ind`` on those two sets of values.
    """
    # Imported locally under its own alias: the notebook binds the name
    # `stats` to statsmodels at module level.
    from scipy import stats as sp_stats

    labels = subdf['TB Classification']
    confirmed = subdf.loc[labels == 'Confirmed TB', 'value']
    unlikely = subdf.loc[labels == 'Unlikely TB', 'value']

    mean_diff = confirmed.mean() - unlikely.mean()
    p_value = sp_stats.ttest_ind(confirmed, unlikely)[1]
    return pd.DataFrame([mean_diff, p_value]).T

# Long-format protein table; only the columns used downstream are kept.
dd = pd.read_csv('processed/protein_level_melted_merged.csv')
dd = dd.loc[:, ['Protein.Group', 'variable', 'value',
                'COMBO Plasma Box Number', 'Age', 'TB Classification',
                'HIV Status', 'Genes']]

# One fc_pv_calc result row per (protein group, gene). reset_index() turns the
# two group keys and the inner row index into columns, hence the 'idx' name.
stats_df = dd.groupby(['Protein.Group', 'Genes']).apply(fc_pv_calc)
stats_df = stats_df.reset_index()
stats_df.columns = ['PID', 'GN', 'idx', 'Log2FC', 'p']

stats_df['q'] = bst.multiple_testing_correction(stats_df['p'].values)
del stats_df['idx']

stats_df['logq'] = -np.log10(stats_df['q'])
# Flagged when q is below 0.05 and the absolute mean difference exceeds 0.5.
stats_df['issign'] = stats_df['q'].lt(0.05) & stats_df['Log2FC'].abs().gt(0.5)

stats_df.to_csv('figures/fig2/data/Confirmed_vs_Unlikely.csv', index=False)


In [None]:
# Keep rows with q <= 0.05. The .copy() makes the result an independent frame,
# so adding the 'uu' column below does not write into a slice of the full
# table (which triggers SettingWithCopyWarning).
stats_df = stats_df[stats_df['q'] <= 0.05].copy()

# Positive difference -> 'up'; zero, negative or NaN -> 'down'.
stats_df['uu'] = ['up' if x > 0 else 'down' for x in stats_df['Log2FC']]

for x in 'up', 'down':
    stats_df[stats_df['uu'] == x].to_csv('figures/fig2/data/{}.csv'.format(x))

# Placed last so the notebook displays the counts; as a mid-cell expression
# the result was computed and then discarded.
stats_df.groupby(['uu']).size()

In [None]:
## Fig1E
# A run is kept only if it has more than this many distinct Protein.Ids rows
# after filtering. Threshold carried over unchanged from the original cell.
MIN_IDS_PER_RUN = 442

df = pd.read_csv('data/COMBO_V3.tsv', sep='\t')
mp = dict(zip(df['Protein.Group'], df['Genes']))

# Filter and de-duplicate without in-place mutation of a slice.
df = df[df['Lib.PG.Q.Value'] <= 0.01].drop_duplicates(['Protein.Ids', 'Run'])

# Rows per run; the unnamed size() result becomes column 0.
counts = df.groupby(['Run']).size().to_frame()
counts = counts.sort_values(0)
counts = counts[counts[0] > MIN_IDS_PER_RUN].reset_index()

# .copy() so the 'dummy' column is added to an independent frame rather than
# to a view of the filtered table (avoids SettingWithCopyWarning).
df = df[df['Run'].isin(counts['Run'])].copy()
df['dummy'] = 1

# Protein.Ids x Run table: 1 where the protein was seen in the run, NaN otherwise.
count_table = pd.pivot_table(
    data=df, columns='Run', index='Protein.Ids', values='dummy')
count_table.to_csv('Fig1E.csv')

In [None]:
## Fig2C
from upsetplot import from_contents, plot
from upsetplot import UpSet
from matplotlib import cm

# Location is the text of the box label before 'Plasma'.
meta['Location'] = [x.split('Plasma')[0] for x in meta['COMBO Plasma Box Number']]
id_to_location = dict(zip(meta['COMBO ID'], meta['Location']))

# Work on a copy: assigning into a column selection of `df` raises
# SettingWithCopyWarning and may or may not write through to `df`.
toplot = df[['Run', 'Protein.Ids']].copy()

# The sample id is the part of the run name before the first underscore.
toplot['Run'] = [x.split('_')[0] for x in toplot['Run']]
toplot['Run'] = toplot['Run'].map(id_to_location)
# Runs whose sample id is not in `meta` map to NaN and are removed here.
toplot = toplot.drop_duplicates(subset=['Run', 'Protein.Ids']).dropna()

# Location -> list of Protein.Ids seen there. groupby yields the locations in
# sorted order, unlike the previous loop over a set.
dd = {loc: list(ids) for loc, ids in toplot.groupby('Run')['Protein.Ids']}

toplot = from_contents(dd)

fig = plt.figure(figsize=(2.5, 4))
# Intersections ordered by size, each annotated with its percentage.
upset = UpSet(toplot, sort_by='cardinality', show_percentages=True)

upset.plot(fig=fig)

#plt.savefig('figures/fig1/upset.pdf', bbox_inches='tight', dpi=800)
#plt.savefig('figures/fig1/upset.svg', bbox_inches='tight', dpi=800)

# NOTE(review): with both savefig calls commented out, closing the figure here
# means this cell neither saves nor shows it - confirm that is intended.
plt.close()

In [None]:
import pandas as pd
import numpy as np

# Fig2E

def fetch_meta():
    """Load the banked-plasma clinical sheet and derive plotting helpers.

    Returns
    -------
    meta : pandas.DataFrame
        The sheet restricted to rows that have a ``TB Classification``.
        ``COMBO Plasma Box Number`` and ``TB Classification`` are converted to
        categoricals, and their integer category codes are stored in ``code``
        and ``TB code`` respectively.
    col_colors : pandas.Series
        One colour string per row of the sheet as read (i.e. before rows
        without a TB classification are dropped), indexed by ``COMBO ID``.
        Rows with a missing box number, or whose box falls beyond the ninth
        distinct box, get NaN because the palette has nine entries.
    """
    palette = ["#E64B35B2", "#4DBBD5B2", "#00A087B2", "#3C5488B2", "#F39B7FB2",
               "#8491B4B2", "#91D1C2B2", "#DC0000B2", "#7E6148B2"]
    palette = [colour.lower() for colour in palette]

    meta = pd.read_excel('meta/COMBO PRO Banked Plasma Samples Clinical.xlsx')

    # Sort the distinct boxes so that each box always receives the same colour.
    # Iterating a set (as before) gave an order that could change between
    # kernel sessions, silently reshuffling the colours.
    boxes = sorted(meta['COMBO Plasma Box Number'].dropna().unique(), key=str)
    enc = dict(zip(boxes, palette))
    col_colors = meta['COMBO Plasma Box Number'].map(enc)
    col_colors.index = meta['COMBO ID']

    meta['COMBO Plasma Box Number'] = pd.Categorical(
        meta['COMBO Plasma Box Number'])
    meta['TB Classification'] = pd.Categorical(
        meta['TB Classification'])

    # Codes are taken before the dropna below; missing classifications get
    # code -1 and are removed right after.
    meta['TB code'] = meta['TB Classification'].cat.codes
    meta['code'] = meta['COMBO Plasma Box Number'].cat.codes
    meta = meta.dropna(subset=['TB Classification'])
    return meta, col_colors


def cv(grp):
    """Coefficient of variation of the ``value`` column of ``grp``.

    Computed as the standard deviation with ``ddof=0`` divided by the mean.
    """
    values = grp['value']
    spread = values.std(ddof=0)
    centre = values.mean()
    return spread / centre


# Wide protein table -> long format: one row per (Protein.Group, sample column).
protlevel = pd.read_csv('processed/protein_level.csv')
protlevel = pd.melt(protlevel, id_vars='Protein.Group')
meta, _ = fetch_meta()

# Location is the text of the box label before 'Plasma'; sample columns whose
# name is not a COMBO ID in `meta` get NaN and drop out of the per-location groups.
meta['Location'] = [x.split('Plasma')[0] for x in meta['COMBO Plasma Box Number']]
protlevel['loc'] = protlevel['variable'].map(dict(zip(meta['COMBO ID'], meta['Location'])))

# CV per protein within each location, and per protein over all samples.
# apply(cv) returns an unnamed Series, so reset_index() names the CV column 0.
loc_cv = protlevel.groupby(['loc', 'Protein.Group']).apply(cv).reset_index()
tot_cv = protlevel.groupby(['Protein.Group']).apply(cv).reset_index()
tot_cv['loc'] = 'Total'

mrg_cv = pd.concat([loc_cv, tot_cv])
mrg_cv.to_csv('figures/fig1/data/cv_all.csv')

# Mean CV per location. Only the CV column (0) is averaged: the frame also
# holds the string column 'Protein.Group', which makes a frame-wide .mean()
# raise on pandas >= 2, and the positional `axis` argument that used to be
# passed to groupby is deprecated.
mrg_cv.groupby('loc')[[0]].mean()

In [None]:
## Fig2B
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt


def theme_Publication():
    """Apply the figure styling used for the publication panels.

    Sets the seaborn style, context and palette, then overrides a fixed set
    of matplotlib rcParams (font, default figure size, axes, legend and
    subplot margins). Changes global plotting state; returns None.
    """
    sns.set_style("ticks")
    sns.set_context("paper", font_scale=1.2)
    sns.set_palette("colorblind")

    # Applied after the seaborn calls so these values take precedence.
    plt.rcParams.update({
        "font.family": "Helvetica",
        "figure.figsize": [1.5, 1.5],
        "axes.linewidth": 1,
        "axes.edgecolor": "black",
        "legend.frameon": False,
        "legend.loc": "lower center",
        "legend.framealpha": 1,
        "legend.fontsize": 9,
        "legend.title_fontsize": 9,
        "axes.grid": False,
        # Axes fill the whole figure area.
        "figure.subplot.left": 0,
        "figure.subplot.right": 1,
        "figure.subplot.bottom": 0,
        "figure.subplot.top": 1,
    })

theme_Publication()


# Lowest reported value per protein group within each location, on a log10 scale.
dd = pd.read_csv('processed/ProteomicsDataReport.csv')
# Location is the first space-separated token of the box label.
dd['loc'] = [box.split(' ')[0] for box in list(dd['COMBO Plasma Box Number'])]
dd = dd.groupby(['Protein.Group', 'loc'])['value'].min().to_frame().reset_index()
dd['value'] = np.log10(dd['value'])

# One colour per location (hex strings with an alpha channel).
cols = {
    'Gambia': '#3b499240',
    'PRO': '#008b4540',
    'Peru': '#ee000040',
    'Uganda': '#63187940',
}

dd.to_csv('Fig2B.csv')

# Empirical CDF of the log10 minima, one curve per location.
sns.ecdfplot(data=dd, x="value",
             alpha=1,
             hue=list(dd['loc']),
             palette=cols)

fig = plt.gcf()
fig.set_size_inches(1.5, 1.5)  # width and height in inches
# Integer ticks 1..5 on the x-axis
plt.xticks(range(1, 6, 1))

# Written as a PDF with dpi=600
plt.savefig("figures/fig1/ecdf.pdf", dpi=600, bbox_inches='tight')

plt.show()
