In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from scipy import optimize
from scipy.stats import sem

import os
import sys

import matplotlib.pyplot as plt
import matplotlib.ticker
from matplotlib.patches import Rectangle
from matplotlib.colors import ListedColormap, BoundaryNorm

from venn import venn, pseudovenn
import seaborn as sns

from Bio import SeqIO, SeqUtils

from addict import Dict

import math

In [3]:
sys.path.append("../")
import plotting as my_plot

sys.path.append("/home/chase/my_repos/Misc_electrostatics/sim_vs_exp/mean_field_elect/")
import proteomics_functions as prot_fun

In [4]:
def clean_data(df_con):
    """Add normalised name columns to a literature table, in place.

    Each entry of ``df_con['name']`` is lower-cased, has the check mark,
    non-breaking space and thin space characters normalised, has the
    footnote letters that follow a closing parenthesis (``') c'``, ``') d'``,
    ``') e'``) removed, and is then split into a description and an optional
    parenthesised acronym.

    Columns written
    ---------------
    desc_lower : str
        Text before the first ``'('`` with trailing whitespace removed, or
        the whole cleaned string when there is no ``'('`` (or the string
        starts with one).
    acronym : str or False
        Text between the first ``'('`` and the last ``')'`` (or the end of
        the string when no ``')'`` follows); ``False`` when the name has no
        ``'('``.
    desc_lower_2 : str
        ``desc_lower`` with ``'-'`` and ``'_'`` replaced by spaces and
        commas removed.

    Parameters
    ----------
    df_con : pandas.DataFrame
        Table with a ``'name'`` column. Missing names are treated as empty
        strings. The frame is modified in place.

    Returns
    -------
    None
    """
    # Applied in this order; order matters (e.g. the check mark must be
    # removed before the trailing footnote letters can be matched).
    substitutions = (
        ('✓', ''),
        ('\xa0', ' '),
        ('\u2009', ' '),
        (') c', ')'),
        (') d', ')'),
        (') e', ')'),
        (' - ', '-'),
        ('-β1', ''),
    )

    desc_lower_col, acronym_col, desc_lower_2_col = [], [], []

    for raw_name in df_con['name']:
        text = '' if pd.isna(raw_name) else str(raw_name).lower()
        for old, new in substitutions:
            text = text.replace(old, new)
        text = text.strip()

        open_idx = text.find('(')

        # Description: everything before the first '(' without relying on a
        # single separating space being present.
        if open_idx > 0:
            desc_lower = text[:open_idx].rstrip()
        else:
            desc_lower = text

        # Acronym: stop at the last ')' instead of blindly dropping the
        # final character, so unterminated or trailing text is not mangled.
        if open_idx >= 0:
            close_idx = text.rfind(')')
            if close_idx > open_idx:
                acronym = text[open_idx + 1:close_idx]
            else:
                acronym = text[open_idx + 1:]
        else:
            acronym = False

        desc_lower_2 = desc_lower.replace('-', ' ')
        desc_lower_2 = desc_lower_2.replace('_', ' ')
        desc_lower_2 = desc_lower_2.replace(',', '')

        desc_lower_col.append(desc_lower)
        acronym_col.append(acronym)
        desc_lower_2_col.append(desc_lower_2)

    # Whole-column assignment: the columns always exist afterwards (even for
    # an empty frame) and no per-cell dtype upcasting is needed.
    df_con['desc_lower'] = desc_lower_col
    df_con['acronym'] = acronym_col
    df_con['desc_lower_2'] = desc_lower_2_col
    return

# Biophorum
10.1002/bit.27808
- Arbitrary list of difficult-to-remove species
- Cited list of high-risk species

In [5]:
# Biophorum difficult-to-remove list: give the raw columns short fixed names,
# derive the normalised name columns, and store the cleaned table.
biophorum_columns = ['name', 'mw_kDa', 'pI', 'uniprot', 'n_aa']
df_biophorum = pd.read_csv('./lit_data/dirty/biophorum_dirty.csv')
df_biophorum.columns = biophorum_columns  # positional: must match the file's column count
clean_data(df_biophorum)  # adds desc_lower / acronym / desc_lower_2 in place
df_biophorum.to_csv('./lit_data/biophorum_clean.csv', index=False)

In [6]:
# Biophorum high-risk list: same treatment as the other tables, with its own
# positional column names.
risk_columns = ['name', 'function', 'impact', 'impact_type', 'references']
df_risk = pd.read_csv('./lit_data/dirty/biophorum_high_risk_dirty.csv')
df_risk.columns = risk_columns  # positional: must match the file's column count
clean_data(df_risk)  # adds desc_lower / acronym / desc_lower_2 in place
df_risk.to_csv('./lit_data/biophorum_high_risk_clean.csv', index=False)

# Molden
10.1080/19420862.2021.1955811
- In at least one of 29 commercial antibody products

In [7]:
# Molden table: map the two raw headers onto the column names used by the
# other tables. The keys are matched literally (including the spelling and
# the trailing space); by default ``rename`` ignores keys that are absent.
molden_header_map = {'Accession Number (Uniport) ':'uniprot', 'Protein Name':'name'}
df_molden = pd.read_csv('./lit_data/dirty/molden_dirty.csv').rename(columns=molden_header_map)
clean_data(df_molden)  # adds desc_lower / acronym / desc_lower_2 in place
df_molden.to_csv('./lit_data/molden_clean.csv', index=False)

# Falkenberg
10.1002/btpr.2788
- In 5 / 6 mAb drug substances

In [8]:
# Falkenberg table 4: read the raw table, derive the normalised name columns
# in place, and store the cleaned copy.
falk_dirty_csv = './lit_data/dirty/falkenberg_table_4_dirty.csv'
falk_clean_csv = './lit_data/falkenberg_table_4_clean.csv'
df_falk = pd.read_csv(falk_dirty_csv)
clean_data(df_falk)
df_falk.to_csv(falk_clean_csv, index=False)

# Kreimer
10.1021/acs.analchem.6b04892
- In drug substance of one mAb

In [9]:
# Kreimer table 1: read the raw table, derive the normalised name columns in
# place, and store the cleaned copy.
kreimer_dirty_csv = './lit_data/dirty/kreimer_table_1_dirty.csv'
kreimer_clean_csv = './lit_data/kreimer_table_1_clean.csv'
df = pd.read_csv(kreimer_dirty_csv)
clean_data(df)
df.to_csv(kreimer_clean_csv, index=False)

# Zhang
10.1002/btpr.2272
- Co-purification in Pro A in 15 mAbs

In [10]:
# Zhang table 2: read the raw table, derive the normalised name columns in
# place, and store the cleaned copy.
zhang_dirty_csv = './lit_data/dirty/zhang_table_2_dirty.csv'
zhang_clean_csv = './lit_data/zhang_table_2_clean.csv'
df = pd.read_csv(zhang_dirty_csv)
clean_data(df)
df.to_csv(zhang_clean_csv, index=False)

# Levy 2014 
10.1002/bit.25158
- Product association with at least one of 5 mAbs

In [11]:
# Levy 2014: read the raw table, derive the normalised name columns in place,
# and store the cleaned copy.
levy_2014_dirty_csv = './lit_data/dirty/levy_2014_dirty.csv'
levy_2014_clean_csv = './lit_data/levy_2014_clean.csv'
df = pd.read_csv(levy_2014_dirty_csv)
clean_data(df)
df.to_csv(levy_2014_clean_csv, index=False)

# Levy 2016
10.1002/bit.25882
- Coelution during polishing operations or association with mAb

In [12]:
# Levy 2016: read the raw table, derive the normalised name columns in place,
# and store the cleaned copy.
levy_2016_dirty_csv = './lit_data/dirty/levy_2016_dirty.csv'
levy_2016_clean_csv = './lit_data/levy_2016_clean.csv'
df = pd.read_csv(levy_2016_dirty_csv)
clean_data(df)
df.to_csv(levy_2016_clean_csv, index=False)

# Aboulaich 2014
10.1002/btpr.1948
- Association with one of four mAbs

In [13]:
# Aboulaich table 1: read the raw table, derive the normalised name columns
# in place, and store the cleaned copy.
aboulaich_dirty_csv = './lit_data/dirty/aboulaich_table_1_dirty.csv'
aboulaich_clean_csv = './lit_data/aboulaich_table_1_clean.csv'
df = pd.read_csv(aboulaich_dirty_csv)
clean_data(df)
df.to_csv(aboulaich_clean_csv, index=False)

# Consolidation

In [14]:
# One record per literature source: (label, DOI, cleaned CSV). Keeping the
# three fields together prevents the parallel lists from drifting apart.
# NOTE(review): './lit_data/aboulaich_table_1_clean.csv' is not listed here —
# confirm that leaving it out of the consolidation is intended.
sources = [
    ('Biophorum DTR',           '10.1002/bit.27808',             './lit_data/biophorum_clean.csv'),
    ('Biophorum high-risk',     '10.1002/bit.27808',             './lit_data/biophorum_high_risk_clean.csv'),
    ('Levy 2014 - Association', '10.1002/bit.25158',             './lit_data/levy_2014_clean.csv'),
    ('Zhang 2016 - Pro A',      '10.1002/btpr.2272',             './lit_data/zhang_table_2_clean.csv'),
    ('Levy 2016 - Coelution',   '10.1002/bit.25882',             './lit_data/levy_2016_clean.csv'),
    ('Molden 2021 - 29 DS',     '10.1080/19420862.2021.1955811', './lit_data/molden_clean.csv'),
    ('Falkenberg 2019 - 6 DS',  '10.1002/btpr.2788',             './lit_data/falkenberg_table_4_clean.csv'),
    ('Kreimer 2017 - 1 DS',     '10.1021/acs.analchem.6b04892',  './lit_data/kreimer_table_1_clean.csv'),
]

papers = [label for label, _, _ in sources]
dois = [source_doi for _, source_doi, _ in sources]
dfs = [pd.read_csv(csv_path) for _, _, csv_path in sources]

# Individual handles, in the same order as ``sources``.
(df_biophorum, df_risk, df_levy_4, df_zhang,
 df_levy_6, df_molden, df_falk, df_kreimer) = dfs

In [15]:
# Stack the per-paper tables into one long table: one row per
# (protein entry, paper) pair, tagged with the paper label and DOI.
name_list, desc_lower_list, desc_lower_2_list, paper_list, doi_list = [], [], [], [], []

for (df_paper, paper, doi) in zip(dfs, papers, dois):
    # Columns are read by key. Attribute access such as ``row.name`` on a
    # row Series returns the row's index label, not the 'name' column.
    name_list.extend(df_paper['name'])
    desc_lower_list.extend(df_paper['desc_lower'])
    desc_lower_2_list.extend(df_paper['desc_lower_2'])
    paper_list.extend([paper] * len(df_paper))
    doi_list.extend([doi] * len(df_paper))

df = pd.DataFrame({'name':name_list, 'desc_lower':desc_lower_list, 'desc_lower_2':desc_lower_2_list,
                   'paper':paper_list, 'doi':doi_list})
# The default RangeIndex is written as the first (unnamed) column.
df.to_csv('./lit_data/combined.csv')