In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
from scipy import optimize
from scipy.stats import sem

import os
import sys

import matplotlib.pyplot as plt
import matplotlib.ticker
from venn import venn, pseudovenn
import seaborn as sns
from matplotlib.collections import PathCollection

from Bio import SeqIO, SeqUtils
from Bio.SeqUtils.ProtParam import ProteinAnalysis

from addict import Dict
import json


In [4]:
sys.path.append("../")
import plotting as my_plot

sys.path.append("/home/chase/my_repos/Misc_electrostatics/sim_vs_exp/mean_field_elect/")
import proteomics_functions as prot_fun
import base_classes
import morbidelli as m

# Load data

In [6]:
folder = './data/'
dfs = Dict()

file = pd.ExcelFile(folder + 'CH_CrossDigestNative_DDA.xlsx')
dfs.native.hccf.feed  = file.parse('HCCF Feed')
dfs.native.hccf.large = file.parse('HCCF Large Agg')
dfs.native.hccf.small = file.parse('HCCF Small Agg')
dfs.native.hccf.mab   = file.parse('HCCF mAb')
dfs.native.hccf.lmw1  = file.parse('HCCF LMW 1')

file = pd.ExcelFile(folder + 'CH_CrossDigestStandard_DDA.xlsx')
dfs.std.pafvin.feed  = file.parse('PAFVIN feed')
dfs.std.pafvin.large = file.parse('PAFVIN Large Agg')
dfs.std.pafvin.small = file.parse('PAFVIN Small Agg')
dfs.std.pafvin.mab   = file.parse('PAFVIN mAb')

file = pd.ExcelFile(folder + 'CH_20220421_HCCF DDA zero.xlsx')
dfs.std.hccf.feed  = file.parse('HCCF Feed')
dfs.std.hccf.large = file.parse('HCCF Large Agg')
dfs.std.hccf.small = file.parse('HCCF Small Agg')
dfs.std.hccf.mab   = file.parse('HCCF mAb')
dfs.std.hccf.lmw1  = file.parse('HCCF LMW1')
dfs.std.hccf.lmw2  = file.parse('HCCF LMW2')

file = pd.ExcelFile(folder + 'CH_20220421_PAFVIN DDA zero.xlsx')
dfs.native.pafvin.feed  = file.parse('PAFVIN Feed')
dfs.native.pafvin.large = file.parse('PAFVIN Large Agg')
dfs.native.pafvin.small = file.parse('PAFVIN Small Agg')
dfs.native.pafvin.mab   = file.parse('PAFVIN mAb')

# Remove non-CHO and reversed (decoy) proteins

In [14]:
def remove_non_CHO_and_reversed(df):
    df = df[df['Name'].str.contains('Cricetulus griseus')]
    df_rev = df[df['Accession'].str.contains('RRRRR')]
    df = df[~df['Accession'].str.contains('RRRRR')]
    df.reset_index(inplace=True, drop=True)
    df_rev.reset_index(inplace=True, drop=True)
    return df, df_rev

In [15]:
dfs_rev = Dict()

for digest in dfs.keys():
    for source in dfs[digest].keys():
        for frac in dfs[digest][source].keys():
            dfs[digest][source][frac], df_rev = remove_non_CHO_and_reversed(dfs[digest][source][frac])
            if not df_rev.empty:
                dfs_rev[digest][source][frac] = df_rev

## What confidence criteria were given to reversed proteins?

In [16]:
for digest in dfs_rev.keys():
    for source in dfs_rev[digest].keys():
        for frac in dfs_rev[digest][source].keys():
            df = dfs_rev[digest][source][frac]
            print(source, '\t', frac, '\t', df.at[0, 'Unused'], '\t', df.at[0, 'Peptides(95%)'], '\t', digest)

hccf 	 feed 	 1.24 	 1 	 native
hccf 	 large 	 0.01 	 1 	 native
hccf 	 small 	 1.68 	 1 	 native
hccf 	 lmw1 	 0.0 	 1 	 native
pafvin 	 feed 	 2.0 	 1 	 native
pafvin 	 mab 	 2.0 	 1 	 native
pafvin 	 feed 	 2.0 	 1 	 std
pafvin 	 small 	 2.0 	 1 	 std
pafvin 	 mab 	 0.0 	 1 	 std
hccf 	 large 	 2.0 	 1 	 std
hccf 	 lmw2 	 2.0 	 1 	 std


# Select HCPs with $\geq 2$ peptides

In [17]:
for digest in dfs.keys():
    for source in dfs[digest].keys():
        for frac in dfs[digest][source].keys():
            df = dfs[digest][source][frac]
            df.drop(df[df['Peptides(95%)'] == 1].index, inplace=True)
            df.reset_index(inplace=True, drop=True)

# Deal with the problem of $Unused \; score = 0$

Problem:  among multiple possible HCPs, only one needs to be selected, but this can lead to the appearance of different HCPs in different samples that should truly be treated as the same HCP.

I plan to try to minimize the total number of HCPs in the union by selecting among HCP possibilities based on some ranking (eg. most confident identifications elswhere, etc.).

In [18]:
accn_groups = []
single_ids = []

for digest in dfs.keys():
    for source in dfs[digest].keys():
        for frac in dfs[digest][source].keys():
            df = dfs[digest][source][frac]
            if len(df) > 0:
                n_vals = list(set(df.N))
                n_vals.sort()
                for n in n_vals:
                    df_prot = df[df.N == n]
                    accn_groups.append(list(df_prot.Accession))
                    if len(df_prot) == 1:
                        single_ids.append([digest, source, frac, df_prot.iloc[0]['Accession'], df_prot.iloc[0]['Unused'], df_prot.iloc[0]['Peptides(95%)']])
                    
unique_accn_groups = list(set(map(tuple, accn_groups)))
df_singles = pd.DataFrame(single_ids, columns=['digest', 'source', 'frac', 'accn', 'unused', 'peptides'])

In [19]:
# Get frequency counts for HCPs
hcp_freq = {}

for digest in dfs.keys():
    for source in dfs[digest].keys():
        for frac in dfs[digest][source].keys():
            df = dfs[digest][source][frac]
            for a in df.Accession:
                if a in hcp_freq.keys():
                    hcp_freq[a] += 1
                else:
                    hcp_freq[a] = 1

In [20]:
# Select "winner" HCPs
# 1) Are there multiple options? If not, the selection is unnecessary.
# 2) If there are multiple options, narrow it down by taking the HCPs with the highest number of peptides. Then, are any of the HCPs found elsewhere as a singlet? 
#    If so, select the singlet that was identified with the greatest confidence (based on number of peptides, then the unused score)
# 3) If none of the degenerate HCPs are ever found as singlets, check that I haven't already selected any of the HCPs (from 4 and 5 below).
#    If previous selections have been made, take the one that has the higher frequency of previous selection, 
#    and if they all have the same frequency of previous selection, take the top one from the list of ordered accn ids.
# 4) If none of the HCPs have been previously selected, does any one of the HCPs appear at a higher frequency? If so, select that one.
# 5) If not, take the top entry in the list of ordered accn ids. 

# May need to unbias the bias toward large HCPs

already_selected = {}

for digest in dfs.keys():
    for source in dfs[digest].keys():
        for frac in dfs[digest][source].keys():
            df = dfs[digest][source][frac]
            df['selection'] = False
            if len(df) > 0:
                n_vals = list(set(df.N))
                n_vals.sort()
                for n in n_vals:
                    df_prot = df[df.N == n]
                    if len(df_prot) == 1:
                        index = df_prot.index[0]
                        df.at[index, 'selection'] = True
                    else:
                        df_prot = df_prot[df_prot['Peptides(95%)'] == df_prot['Peptides(95%)'].max()]
                        df_single_test = df_singles[df_singles.accn.isin(list(df_prot.Accession))].copy()
                        if len(df_single_test) > 0:
                            df_single_test.sort_values(by=['peptides', 'unused'], inplace=True, ignore_index=True, ascending=False)
                            accn_selection = df_single_test.at[0, 'accn']
                            index = df_prot.loc[df_prot.Accession == accn_selection].index[0]
                            df.at[index, 'selection'] = True
                        else:
                            df_already = df_prot[df_prot.Accession.isin(already_selected.keys())].copy()
                            if len(df_already) == 0:
                                temp = []
                                for a in df_prot.Accession:
                                    temp.append([a, hcp_freq[a]])
                                df_freq = pd.DataFrame(temp, columns=['accn', 'freq'])
                                df_freq.sort_values(by=['freq', 'accn'], inplace=True, ignore_index=True, ascending=False)
                                accn_selection = df_freq.at[0, 'accn']
                                index = df_prot.loc[df_prot.Accession == accn_selection].index[0]
                                df.at[index, 'selection'] = True
                                already_selected[accn_selection] = 1
                            else:
                                df_b = df_already.copy()
                                for i, cont in df_already.iterrows():
                                    df_already.at[i, 'freq'] = already_selected[cont.Accession]
                                df_already.sort_values(by=['freq', 'Accession'], inplace=True, ignore_index=True, ascending=False)
                                accn_selection = df_already.at[0, 'Accession']
                                index = df_prot.loc[df_prot.Accession == accn_selection].index[0]
                                df.at[index, 'selection'] = True
                                already_selected[accn_selection] += 1                         

In [21]:
# Check
for digest in dfs.keys():
    for source in dfs[digest].keys():
        for frac in dfs[digest][source].keys():
            df = dfs[digest][source][frac]
            assert df.selection.sum() - len(list(set(df.N))) == 0
            if len(df) > 0:
                n_vals = list(set(df.N))
                n_vals.sort()
                for n in n_vals:
                    df_prot = df[df.N == n]
                    assert df_prot.selection.sum() == 1            

In [22]:
# # Only for reference

# # Get all accn ids
# all_accn = []
# for digest in dfs.keys():
#     for source in dfs[digest].keys():
#         for frac in dfs[digest][source].keys():
#             df = dfs[digest][source][frac]
#             if len(df) > 0:
#                 all_accn.append(list(df.Accession))
                    
# all_accn = list(set([item for sublist in all_accn for item in sublist]))
# all_accn.sort()

# # Get dictionary of alternative accn ids (at least in one of the degenerate HCP options)
# aliases_dic = {}
# for accn in all_accn:
#     temp = [a for a in unique_accn_groups if accn in a]
#     temp = list(set([item for sublist in temp for item in sublist]))
#     temp.remove(accn) # N.B.
#     temp.sort()
#     aliases_dic[accn] = temp

# # Instances where HCPs are unique in the given dataframe but not unique elsewhere
# dfs_alias_hits = Dict() 
# for digest in dfs.keys():
#     for source in dfs[digest].keys():
#         for frac in dfs[digest][source].keys():
#             df = dfs[digest][source][frac]
#             if len(df) > 0:
#                 df_nonzero = df[df.Unused > 0]
#                 accn_list = list(df_nonzero.Accession)
#                 temp = []
#                 for a in accn_list:
#                     aliases = aliases_dic[a]
#                     for alias in aliases:
#                         if alias in accn_list:
#                             temp.append([a, alias])
#                 dfs_alias_hits[digest][source][frac] = pd.DataFrame(temp, columns=['accn_1', 'accn_2'])
#                 print(digest, source, frac, len(dfs_alias_hits[digest][source][frac]))

# Get set of all proteins and a master dataframe with their locations

In [23]:
names = {}

for digest in dfs.keys():
    for source in dfs[digest].keys():
        for frac in dfs[digest][source].keys():
            df = dfs[digest][source][frac]
            df = df[df.selection]
            for i, cont in df.iterrows():
                names[cont['Accession']] = cont['Name']
                
df_master_1 = pd.DataFrame.from_dict(names, orient='index')
df_master_1.reset_index(inplace=True)
df_master_1.columns = ['accession', 'name']

for digest in dfs.keys():
    for source in dfs[digest].keys():
        for frac in dfs[digest][source].keys():
            df = dfs[digest][source][frac]
            df = df[df.selection]
            for i, cont in df_master_1.iterrows():
                df_master_1.at[i, f'{digest}_{source}_{frac}'] = cont.accession in list(df['Accession'])

In [24]:
# For reference - if I had just selected the first species in each degenerate case

names = {}

for digest in dfs.keys():
    for source in dfs[digest].keys():
        for frac in dfs[digest][source].keys():
            df = dfs[digest][source][frac]
            df = df[df.Unused > 0]
            for i, cont in df.iterrows():
                names[cont['Accession']] = cont['Name']
                
df_master_dummy = pd.DataFrame.from_dict(names, orient='index')
df_master_dummy.reset_index(inplace=True)
df_master_dummy.columns = ['accession', 'name']

for digest in dfs.keys():
    for source in dfs[digest].keys():
        for frac in dfs[digest][source].keys():
            df = dfs[digest][source][frac]
            df = df[df.Unused > 0]
            for i, cont in df_master_dummy.iterrows():
                df_master_dummy.at[i, f'{digest}_{source}_{frac}'] = cont.accession in list(df['Accession'])
                
len(df_master_dummy), len(df_master_1), (len(df_master_dummy) - len(df_master_1))/len(df_master_1)

(3124, 2668, 0.17091454272863568)

# Add sequences, pI values, and masses to the dataframe

In [25]:
# # Get new html links to look up sequences

# html = ''
# cnt = 0

# for i, a in enumerate(df_master_1.accession):
#     if i % 200 == 0:
#         print(html[:-1], '\n'*2)
#         html = 'https://www.ncbi.nlm.nih.gov/protein/'        
#     html += a + ','
#     cnt += 1
    
# print(html[:-1], '\n'*2)

In [26]:
# My substitution rules for uncertain amino acids
my_sub_rules =\
{
    'B':'D',
    'Z':'E',
    'X':'A',
    'J':'L'
}

# Get sequence dictionary {accession:sequence_object}
sequences = {}
subbed_ids = []

for r in SeqIO.parse("./data/sequences.fasta", "fasta"):
    for aa in my_sub_rules.keys(): # uncertain amino acids
        if aa in r.seq:
            r.seq = r.seq.replace(aa, my_sub_rules[aa])
            subbed_ids.append(r.id)
    sequences[r.id] = r.seq

In [27]:
# missing = []
# for i, cont in df_master_1.iterrows():
#     if cont.accession not in list(sequences.keys()):
#         missing.append(cont.accession)

In [28]:
# for accn in df_master_1.accession:
#     assert accn in sequences.keys()

In [29]:
# Get pI and mass dictionaries {accession:pI/mass}
pI_vals = {}
masses = {}

for p_id, seq in sequences.items():
    pI, is_solved = prot_fun.get_pI(seq)
    assert is_solved
    pI_vals[p_id] = pI
    masses[p_id] = SeqUtils.molecular_weight(seq, seq_type='protein')

  improvement from the last ten iterations.


In [30]:
# Add sequences, pI values, and masses to df_master_1
for i, cont in df_master_1.iterrows():
    df_master_1.at[i, 'sequence'] = str(sequences[cont.accession])
    df_master_1.at[i, 'pI'] = pI_vals[cont.accession]
    df_master_1.at[i, 'mass'] = masses[cont.accession]

In [31]:
# Get other biophysical property dictionaries (assuming pH 7.0)
net_charges, net_neg_charges, net_pos_charges, charge_densities, charge_densities_neg, charge_densities_pos = {}, {}, {}, {}, {}, {}

for p_id, seq in sequences.items():
    net_charge, net_neg_charge, net_pos_charge, charge_dens, charge_dens_neg, charge_dens_pos = prot_fun.get_charge(pH=7.0, seq=seq, charge_contributions=True)
    net_charges[p_id] = net_charge
    net_neg_charges[p_id] = net_neg_charge
    net_pos_charges[p_id] = net_pos_charge
    charge_densities[p_id] = charge_dens
    charge_densities_neg[p_id] = charge_dens_neg
    charge_densities_pos[p_id] = charge_dens_pos

In [32]:
# Add these biophysical properties to df_master_1
for i, cont in df_master_1.iterrows():
    df_master_1.at[i, 'net_charge'] = net_charges[cont.accession]
    df_master_1.at[i, 'net_charge_neg'] = net_neg_charges[cont.accession]
    df_master_1.at[i, 'net_charge_pos'] = net_pos_charges[cont.accession]
    df_master_1.at[i, 'charge_dens_C_m2'] = charge_densities[cont.accession]
    df_master_1.at[i, 'charge_dens_neg_C_m2'] = charge_densities_neg[cont.accession]
    df_master_1.at[i, 'charge_dens_pos_C_m2'] = charge_densities_pos[cont.accession]

In [33]:
# Check for duplicate sequences (i.e. degeneracy of accession numbers corresponding to a single sequence)
sequence_dict = {}
for i, s in enumerate(df_master_1.sequence):
    if s in sequence_dict.keys():
        sequence_dict[s].append(i)
    else:
        sequence_dict[s] = [i]
        
dup_seq = [s for s, indeces in sequence_dict.items() if len(indeces) > 1]
dup_seq_indeces = [indeces for n, indeces in sequence_dict.items() if len(indeces) > 1]

print(dup_seq, dup_seq_indeces)

[] []


In [34]:
# Get cysteine content
for i, cont in df_master_1.iterrows():
    x = ProteinAnalysis(str(sequences[cont.accession]))
    df_master_1.at[i, 'cysteine_cont_percent'] = x.get_amino_acids_percent()['C'] * 100
    df_master_1.at[i, 'cysteine_num'] = x.count_amino_acids()['C']

In [35]:
# Drop sequences for readability
try:
    df_master_1.drop(columns=['sequence'], inplace=True)
except:
    pass

# Combine entries with identically repeated names

In [36]:
hcp_names = {}
for i, n in enumerate(df_master_1.name):
    if n in hcp_names.keys():
        hcp_names[n].append(i)
    else:
        hcp_names[n] = [i]
        
dup_names = [n for n, indeces in hcp_names.items() if len(indeces) > 1]
dup_indeces = [indeces for n, indeces in hcp_names.items() if len(indeces) > 1]
dup_indeces_flat = [item for sublist in dup_indeces for item in sublist]

n_dup_max = max([len(sublist) for sublist in dup_indeces])

In [37]:
for indeces in dup_indeces:
    df_temp = df_master_1.iloc[indeces]
    for sample in ['native_hccf_feed', 'native_hccf_large',
                   'native_hccf_small', 'native_hccf_mab', 'native_hccf_lmw1',
                   'native_pafvin_feed', 'native_pafvin_large', 'native_pafvin_small',
                   'native_pafvin_mab', 'std_pafvin_feed', 'std_pafvin_large',
                   'std_pafvin_small', 'std_pafvin_mab', 'std_hccf_feed', 'std_hccf_large',
                   'std_hccf_small', 'std_hccf_mab', 'std_hccf_lmw1', 'std_hccf_lmw2']:
        if df_temp[sample].sum() > 1:
            print(indeces, '\t', sample,  '\t', df_temp.at[indeces[0], 'name'])
    

[90, 2616] 	 std_hccf_large 	 elongation factor 1-alpha 1 [Cricetulus griseus]
[650, 1005] 	 native_hccf_feed 	 10 kDa heat shock protein, mitochondrial [Cricetulus griseus]
[650, 1005] 	 native_hccf_lmw1 	 10 kDa heat shock protein, mitochondrial [Cricetulus griseus]
[650, 1005] 	 std_hccf_lmw2 	 10 kDa heat shock protein, mitochondrial [Cricetulus griseus]
[772, 1015] 	 native_hccf_feed 	 beta-hexosaminidase subunit beta isoform X2 [Cricetulus griseus]
[772, 1015] 	 native_hccf_small 	 beta-hexosaminidase subunit beta isoform X2 [Cricetulus griseus]
[772, 1015] 	 native_pafvin_large 	 beta-hexosaminidase subunit beta isoform X2 [Cricetulus griseus]


In [38]:
# df_master_1.iloc[dup_indeces_flat]

In [39]:
unique_names = df_master_1.iloc[~df_master_1.index.isin(dup_indeces_flat)].copy()
unique_names.reset_index(inplace=True, drop=True)
unique_names.rename(columns={'accession':'accession_0', 'pI':'pI_0', 'mass':'mass_0', 'net_charge':'net_charge_0', 
                             'net_charge_neg':'net_charge_neg_0', 'net_charge_pos':'net_charge_pos_0',
                             'charge_dens_C_m2':'charge_dens_C_m2_0', 'charge_dens_neg_C_m2':'charge_dens_neg_C_m2_0', 
                             'charge_dens_pos_C_m2':'charge_dens_pos_C_m2_0', 'cysteine_cont_percent':'cysteine_cont_percent_0',
                             'cysteine_num':'cysteine_num_0'}, inplace=True)

for n in dup_names:
    df = df_master_1.iloc[hcp_names[n]].copy()
    df.reset_index(inplace=True, drop=True)
    combined_entry = {}
    
    for i, cont in df.iterrows():
        for column in ['accession', 'pI', 'mass', 'net_charge', 'net_charge_neg', 'net_charge_pos',
                       'charge_dens_C_m2', 'charge_dens_neg_C_m2', 'charge_dens_pos_C_m2',
                       'cysteine_cont_percent', 'cysteine_num']:
            combined_entry[f'{column}_{i}'] = [cont[column]]
        
        if i == 0:
            combined_entry['name'] = [cont['name']]
    
    for sample in ['native_hccf_feed', 'native_hccf_large',
                   'native_hccf_small', 'native_hccf_mab', 'native_hccf_lmw1',
                   'native_pafvin_feed', 'native_pafvin_large', 'native_pafvin_small',
                   'native_pafvin_mab', 'std_pafvin_feed', 'std_pafvin_large',
                   'std_pafvin_small', 'std_pafvin_mab', 'std_hccf_feed', 'std_hccf_large',
                   'std_hccf_small', 'std_hccf_mab', 'std_hccf_lmw1', 'std_hccf_lmw2']:
        combined_entry[sample] = [True in list(df[sample])]
    
    df_temp = pd.DataFrame(combined_entry)
    unique_names = pd.concat([unique_names, df_temp], ignore_index=True)

In [40]:
pI_columns = []
mass_columns = []
net_charge_columns = []
net_charge_neg_columns = []
net_charge_pos_columns = []
charge_dens_C_m2_columns = []
charge_dens_neg_C_m2_columns = []
charge_dens_pos_C_m2_columns = []
cysteine_cont_percent_columns = []
cysteine_num_columns = []

for i in range(n_dup_max):
    pI_columns.append(f'pI_{i}')
    mass_columns.append(f'mass_{i}')
    net_charge_columns.append(f'net_charge_{i}')
    net_charge_neg_columns.append(f'net_charge_neg_{i}')
    net_charge_pos_columns.append(f'net_charge_pos_{i}')
    charge_dens_C_m2_columns.append(f'charge_dens_C_m2_{i}')
    charge_dens_neg_C_m2_columns.append(f'charge_dens_neg_C_m2_{i}')
    charge_dens_pos_C_m2_columns.append(f'charge_dens_pos_C_m2_{i}')
    cysteine_cont_percent_columns.append(f'cysteine_cont_percent_{i}')
    cysteine_num_columns.append(f'cysteine_num_{i}')
    
unique_names['pI_mean'] = unique_names[pI_columns].mean(axis=1)
unique_names['mass_mean'] = unique_names[mass_columns].mean(axis=1)
unique_names['net_charge_mean'] = unique_names[net_charge_columns].mean(axis=1)
unique_names['net_charge_neg_mean'] = unique_names[net_charge_neg_columns].mean(axis=1)
unique_names['net_charge_pos_mean'] = unique_names[net_charge_pos_columns].mean(axis=1)
unique_names['charge_dens_C_m2_mean'] = unique_names[charge_dens_C_m2_columns].mean(axis=1)
unique_names['charge_dens_neg_C_m2_mean'] = unique_names[charge_dens_neg_C_m2_columns].mean(axis=1)
unique_names['charge_dens_pos_C_m2_mean'] = unique_names[charge_dens_pos_C_m2_columns].mean(axis=1)
unique_names['cysteine_cont_percent_mean'] = unique_names[cysteine_cont_percent_columns].mean(axis=1)
unique_names['cysteine_num_mean'] = unique_names[cysteine_num_columns].mean(axis=1)

# Save master dataframe

In [41]:
df_master = unique_names
for i, cont in df_master.iterrows():
    text = cont['name'].replace(' [Cricetulus griseus]', '')
    df_master.at[i, 'desc_lower'] = text.lower()

In [42]:
df_master = df_master[['accession_0', 'name', 'std_hccf_feed', 'std_hccf_large',
                       'std_hccf_small', 'std_hccf_mab', 'std_hccf_lmw1', 'std_hccf_lmw2',
                       'native_hccf_feed', 'native_hccf_large', 'native_hccf_small', 
                       'native_hccf_mab', 'native_hccf_lmw1', 'std_pafvin_feed', 'std_pafvin_large',
                       'std_pafvin_small', 'std_pafvin_mab', 'native_pafvin_feed', 
                       'native_pafvin_large', 'native_pafvin_small', 'native_pafvin_mab', 
                       'pI_mean', 'mass_mean', 'cysteine_cont_percent_mean', 'cysteine_num_mean',
                       'net_charge_mean', 'net_charge_neg_mean', 'net_charge_pos_mean', 
                       'charge_dens_C_m2_mean', 'charge_dens_neg_C_m2_mean', 'charge_dens_pos_C_m2_mean',
                       'pI_0', 'mass_0', 'net_charge_0', 'net_charge_neg_0',
                       'net_charge_pos_0', 'charge_dens_C_m2_0', 'charge_dens_neg_C_m2_0',
                       'charge_dens_pos_C_m2_0', 'accession_1', 'pI_1', 'mass_1',
                       'net_charge_1', 'net_charge_neg_1', 'net_charge_pos_1',
                       'charge_dens_C_m2_1', 'charge_dens_neg_C_m2_1',
                       'charge_dens_pos_C_m2_1', 'accession_2', 'pI_2', 'mass_2',
                       'net_charge_2', 'net_charge_neg_2', 'net_charge_pos_2',
                       'charge_dens_C_m2_2', 'charge_dens_neg_C_m2_2',
                       'charge_dens_pos_C_m2_2', 
                       'cysteine_cont_percent_0', 'cysteine_num_0', 'cysteine_cont_percent_1', 'cysteine_num_1',
                       'cysteine_cont_percent_2', 'cysteine_num_2',
                       'desc_lower']]

In [43]:
for i, cont in df_master.iterrows():
    desc_lower_2 = cont.desc_lower[:]
    desc_lower_2 = desc_lower_2.replace('-', ' ')
    desc_lower_2 = desc_lower_2.replace('_', ' ')
    desc_lower_2 = desc_lower_2.replace(',', '')
    df_master.at[i, 'desc_lower_2'] = desc_lower_2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_master.at[i, 'desc_lower_2'] = desc_lower_2


In [44]:
df_master.to_csv('./generated_tables/location_master_with_properties.csv', index=False)
np.savetxt('./generated_tables/master_accn.txt', df_master.accession.values, fmt='%s')

AttributeError: 'DataFrame' object has no attribute 'accession'

In [45]:
df_master_pared_down = df_master.copy()

In [46]:
df_master_pared_down.rename(columns={'accession_0':'accession'}, inplace=True)
df_master_pared_down.drop(columns=['pI_0', 'mass_0', 'net_charge_0', 'net_charge_neg_0', 'net_charge_pos_0',
                       'charge_dens_C_m2_0', 'charge_dens_neg_C_m2_0',
                       'charge_dens_pos_C_m2_0', 'accession_1', 'pI_1', 'mass_1',
                       'net_charge_1', 'net_charge_neg_1', 'net_charge_pos_1',
                       'charge_dens_C_m2_1', 'charge_dens_neg_C_m2_1',
                       'charge_dens_pos_C_m2_1', 'accession_2', 'pI_2', 'mass_2',
                       'net_charge_2', 'net_charge_neg_2', 'net_charge_pos_2',
                       'charge_dens_C_m2_2', 'charge_dens_neg_C_m2_2',
                       'charge_dens_pos_C_m2_2', 'cysteine_cont_percent_0', 'cysteine_num_0',
                       'cysteine_cont_percent_1', 'cysteine_num_1', 'cysteine_cont_percent_2',
                       'cysteine_num_2'], inplace=True)

In [47]:
df_master_pared_down.to_csv('./generated_tables/location_master_with_only_mean_properties.csv', index=False)

# Unite native and standard digest results

In [48]:
df_master_united = df_master_pared_down.copy()

In [49]:
for source_frac in ['hccf_feed', 'hccf_large', 'hccf_small', 'hccf_mab', 'hccf_lmw1', 'pafvin_feed', 'pafvin_large', 'pafvin_small', 'pafvin_mab']:
    df_master_united[source_frac] = False
    indeces = df_master_united[(df_master_united[f'std_{source_frac}']) | (df_master_united[f'native_{source_frac}'])].index
    df_master_united.loc[indeces, source_frac] = True
    
df_master_united.rename(columns={'std_hccf_lmw2':'hccf_lmw2'}, inplace=True)
df_master_united.drop(columns=['std_hccf_feed', 'std_hccf_large',
                       'std_hccf_small', 'std_hccf_mab', 'std_hccf_lmw1',
                       'native_hccf_feed', 'native_hccf_large', 'native_hccf_small', 
                       'native_hccf_mab', 'native_hccf_lmw1', 'std_pafvin_feed', 'std_pafvin_large',
                       'std_pafvin_small', 'std_pafvin_mab', 'native_pafvin_feed', 
                       'native_pafvin_large', 'native_pafvin_small', 'native_pafvin_mab'], inplace=True)

df_master_united = df_master_united[['accession', 'name', 'hccf_feed', 'hccf_large', 'hccf_small', 'hccf_mab', 'hccf_lmw1', 'hccf_lmw2', 
                       'pafvin_feed', 'pafvin_large', 'pafvin_small', 'pafvin_mab',
                       'pI_mean', 'mass_mean', 'net_charge_mean',
                       'net_charge_neg_mean', 'net_charge_pos_mean', 'charge_dens_C_m2_mean',
                       'charge_dens_neg_C_m2_mean', 'charge_dens_pos_C_m2_mean', 'desc_lower', 'desc_lower_2']]

In [50]:
df_master_united.to_csv('./generated_tables/location_master_united_with_only_mean_properties.csv', index=False)