In [5]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell executes, so edits to local helper modules take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# from IPython.core.display import display, HTML, clear_output
# display(HTML("<style>.container { width:100% !important; }</style>"))

In [7]:
import pandas as pd
import numpy as np
from scipy import optimize
from scipy.stats import sem

import os
import sys

import matplotlib.pyplot as plt
import matplotlib.ticker
from venn import venn, pseudovenn
import seaborn as sns
from matplotlib.collections import PathCollection

from Bio import SeqIO, SeqUtils, Seq
from Bio.SeqUtils.ProtParam import ProteinAnalysis

from addict import Dict
import json

In [8]:
# Extend the module search path so the project-local helper modules can be imported.
# NOTE(review): presumably `plotting` lives in the parent directory — confirm.
sys.path.append("../")
import plotting as my_plot

# NOTE(review): presumably `proteomics_functions` lives in ./mean_field_elect/ — confirm.
sys.path.append("./mean_field_elect/")
import proteomics_functions as prot_fun

# Load data

In [9]:
# dfs[resin][frac]     -> CHO host-cell-protein rows of one SWATH sheet
# dfs_mab[resin][frac] -> rows of the same sheet whose name contains 'Custom' (the mAb)
dfs = Dict()
dfs_mab = Dict()

# Open each workbook in a context manager so the file handle is closed once its sheets are parsed.
with pd.ExcelFile('./data/Chase SWATH 20221128.xlsx') as file:
    dfs.cq.g12 = file.parse('CaptoQ 1G12')
    dfs.cq.eluate = file.parse('CaptoQ Eluate')
    dfs.hq.a5 = file.parse('Poros50 HQ 1A05!') # short list method
    dfs.hq.g12 = file.parse('Poros50 HQ 1G12')
    dfs.hq.eluate = file.parse('Poros50 HQ Eluate')
    dfs.xq.g12 = file.parse('PorosXQ 1G12')
    dfs.xq.eluate = file.parse('PorosXQ Eluate')

with pd.ExcelFile('./data/Chase SWATH 20230220.xlsx') as file:
    dfs.cq.a5 = file.parse('CaptoQFT 1A05')
    dfs.cq.b12 = file.parse('CaptoQFT 1B12')
    dfs.cq.d9 = file.parse('CaptoQFT 1D09')
    dfs.hq.b12 = file.parse('Poros50HQFT 1B12!') # short list method
    dfs.hq.d9 = file.parse('Poros50HQFT 1D09')
    dfs.xq.a5 = file.parse('PorosXQFT 1A05!') # short list method
    # NOTE(review): the next two sheets are labelled "short list method" but, unlike the
    # other short-list sheets, their names carry no trailing '!' — confirm the labels.
    dfs.xq.b12 = file.parse('PorosXQFT 1B12') # short list method
    dfs.xq.d9 = file.parse('PorosXQFT 1D09') # short list method

# Names assigned positionally to the columns that remain after NaN-containing columns are dropped
swath_columns = ['accn', 'name', 'rep1_log2_norm_area', 'rep2_log2_norm_area', 'rep3_log2_norm_area',
                 'prot_mw', 'rep1_ng', 'rep2_ng', 'rep3_ng', 'ave_ng', 'cv']

# Reference mass (ug) used for the ng -> ppm conversion below
reference_mass_ug = 90.91

for resin in ['cq', 'hq', 'xq']:
    for frac in dfs[resin].keys():
        # dropna returns a new frame, so the steps below never write into a slice of another frame
        df_sheet = dfs[resin][frac].dropna(axis='columns')
        df_sheet.columns = swath_columns
        df_sheet['ave_ppm'] = df_sheet['ave_ng']/reference_mass_ug * 1e3 # x ng/90.91 ug * 1e-3 ug/ng * 1e6 ppm

        # Get mAb as a separate df
        is_mab = df_sheet['name'].str.contains('Custom')
        dfs_mab[resin][frac] = df_sheet[is_mab].reset_index(drop=True)

        # Select only CHO HCPs
        is_cho_hcp = df_sheet['name'].str.contains('Cricetulus griseus')
        dfs[resin][frac] = df_sheet[is_cho_hcp].reset_index(drop=True)

# Assemble a flat df with ave_ppm values

In [10]:
# Collect {accession: protein name} over every resin/fraction table.
# dict.update keeps the first-seen ordering of accessions and the last-seen name.
names = {}

for source in dfs.keys():
    for frac in dfs[source].keys():
        df = dfs[source][frac]
        names.update(zip(df['accn'], df['name']))

# One row per accession; building from items() also works when no accession was found at all.
df_master = pd.DataFrame(list(names.items()), columns=['accn', 'name'])

# Add one '<source>_<frac>' column of ave_ppm values per table, aligned on accession.
# Accessions absent from a table get NaN. The column is created even for an empty table,
# so later column selections cannot fail on a missing column.
for source in dfs.keys():
    for frac in dfs[source].keys():
        df = dfs[source][frac]
        # keep='last' mirrors a row-by-row overwrite if an accession is listed more than once
        ppm_by_accn = df.drop_duplicates(subset='accn', keep='last').set_index('accn')['ave_ppm']
        df_master[f'{source}_{frac}'] = df_master['accn'].map(ppm_by_accn)

In [11]:
# Fix the column order: identifiers first, then the five fractions of each resin
# (a5, b12, d9, g12, eluate) for cq, xq and hq in turn.
master_column_order = ['accn', 'name']
master_column_order += ['cq_a5', 'cq_b12', 'cq_d9', 'cq_g12', 'cq_eluate']
master_column_order += ['xq_a5', 'xq_b12', 'xq_d9', 'xq_g12', 'xq_eluate']
master_column_order += ['hq_a5', 'hq_b12', 'hq_d9', 'hq_g12', 'hq_eluate']

df_master = df_master[master_column_order]

In [12]:
# Write the table twice: first with NaN wherever an accession has no value in a fraction,
# then again after every NaN has been replaced by 0. df_master keeps the zero-filled
# version for the cells that follow.
df_master.to_csv('./generated_tables/swath_master_df_ppm_with_na.csv', index=False)
df_master = df_master.fillna(0)
df_master.to_csv('./generated_tables/swath_master_df_ppm.csv', index=False)

# Add biophysical properties

In [44]:
# Load a previously generated DDA master table; the visible cells only read its `accn` column.
df_dda = pd.read_csv('./generated_tables/df_master_dda_with_bare_properties.csv')

In [45]:
# Accessions present in the SWATH table but absent from the DDA table.
# The DDA accessions are put in a set once, so each membership test is O(1)
# instead of rebuilding and scanning a list for every SWATH accession.
dda_accn = set(df_dda.accn)
new_accn = [accn for accn in df_master.accn if accn not in dda_accn]
len(new_accn)

387

In [46]:
# # Get new html links to look up sequences

# html = ''
# cnt = 0

# for i, a in enumerate(new_accn):
#     if i % 200 == 0:
#         print(html[:-1], '\n'*2)
#         html = 'https://www.ncbi.nlm.nih.gov/protein/'        
#     html += a + ','
#     cnt += 1
    
# print(html[:-1], '\n'*2)

In [47]:
# Replacement residue for each uncertain amino-acid code
my_sub_rules = {'B':'D', 'Z':'E', 'X':'A', 'J':'L'}

# sequences: {record id: sequence object}
# subbed_ids: record id appended once per uncertain code that was replaced in that record
sequences = {}
subbed_ids = []

for record in SeqIO.parse("./data/sequences_all.fasta", "fasta"):
    for uncertain_aa, replacement_aa in my_sub_rules.items():
        if uncertain_aa not in record.seq:
            continue
        substituted = str(record.seq).replace(uncertain_aa, replacement_aa)
        record.seq = Seq.Seq(substituted)
        subbed_ids.append(record.id)
    sequences[record.id] = record.seq

In [48]:
# Every accession in df_master must have a sequence in the FASTA-derived dictionary.
# Membership is tested directly on the dict (O(1)) rather than on a key list rebuilt per row.
missing = [accn for accn in df_master.accn if accn not in sequences]
assert len(missing) == 0, f'{len(missing)} accession(s) have no sequence, e.g. {missing[:5]}'

In [49]:
# Get pI and mass dictionaries {accession:pI/mass}
pI_vals = {}
masses = {}

for p_id, seq in sequences.items():
    pI, is_solved = prot_fun.get_pI(seq)
    assert is_solved
    pI_vals[p_id] = pI
    masses[p_id] = SeqUtils.molecular_weight(seq, seq_type='protein')

  improvement from the last ten iterations.


In [50]:
# Add sequences, pI values, and masses to df_master.
# Each column is assigned in one step instead of cell-by-cell inside iterrows();
# direct dict indexing still raises KeyError if an accession has no entry.
accn_in_order = list(df_master['accn'])
df_master['sequence'] = [str(sequences[accn]) for accn in accn_in_order]
df_master['pI'] = [pI_vals[accn] for accn in accn_in_order]
df_master['mass'] = [masses[accn] for accn in accn_in_order]

In [51]:
# Charge-related lookups keyed by accession, all evaluated at pH 7.0
net_charges = {}
net_neg_charges = {}
net_pos_charges = {}
charge_densities = {}
charge_densities_neg = {}
charge_densities_pos = {}

for accession, protein_seq in sequences.items():
    # get_charge returns six values when charge_contributions=True; unpack them
    # straight into the matching dictionaries in the order they are returned.
    (net_charges[accession],
     net_neg_charges[accession],
     net_pos_charges[accession],
     charge_densities[accession],
     charge_densities_neg[accession],
     charge_densities_pos[accession]) = prot_fun.get_charge(pH=7.0, seq=protein_seq, charge_contributions=True)

In [52]:
# Add these biophysical properties to df_master.
# Columns are assigned whole (in the listed order) rather than cell-by-cell inside
# iterrows(); direct dict indexing still raises KeyError for an unknown accession.
charge_lookups = {
    'net_charge': net_charges,
    'net_charge_neg': net_neg_charges,
    'net_charge_pos': net_pos_charges,
    'charge_dens_C_m2': charge_densities,
    'charge_dens_neg_C_m2': charge_densities_neg,
    'charge_dens_pos_C_m2': charge_densities_pos,
}

accn_in_order = list(df_master['accn'])
for column_name, lookup in charge_lookups.items():
    df_master[column_name] = [lookup[accn] for accn in accn_in_order]

In [53]:
# Save the ppm table together with the sequence, pI, mass and charge columns added above.
df_master.to_csv('./generated_tables/swath_master_df_ppm_with_properties.csv', index=False)

# Get error estimate for total mass

In [72]:
# Load the Bradford error table; the next cell reads its `resin`, `frac` and `bradford_rsd` columns.
df_brad = pd.read_csv('./generated_tables/bradford_errors.csv')

In [73]:
# For every resin/fraction, combine the Bradford relative standard deviation with the
# replicate-to-replicate relative standard deviation of the summed SWATH mass.
for resin in ['cq', 'hq', 'xq']:
    for frac in dfs[resin].keys():
        # Boolean mask of the Bradford rows belonging to this resin/fraction pair
        is_sample = (df_brad.resin == resin) & (df_brad.frac == frac)
        brad_rsd = df_brad.loc[is_sample, 'bradford_rsd'].iloc[0]
        df = dfs[resin][frac]

        # Total mass from the averaged column, and sample standard deviation (ddof=1)
        # of the three per-replicate totals
        m_ave = df['ave_ng'].sum()
        m_stdev = np.std([df['rep1_ng'].sum(), df['rep2_ng'].sum(), df['rep3_ng'].sum()], ddof=1, axis=0)
        swath_rsd = m_stdev / m_ave

        # Relative errors are combined in quadrature
        total_rsd = np.sqrt(brad_rsd**2 + swath_rsd**2)

        # .loc is the accessor that accepts a boolean mask; .at is meant for a single
        # scalar row/column label and is not reliable with a mask.
        df_brad.loc[is_sample, 'swath_rsd'] = swath_rsd
        df_brad.loc[is_sample, 'total_rsd'] = total_rsd

In [74]:
# Save the Bradford table, now extended with the swath_rsd and total_rsd columns.
df_brad.to_csv('./generated_tables/bradford_and_swath_errors.csv', index=False)