In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from scipy import optimize
from scipy.stats import sem

import os
import sys

import matplotlib.pyplot as plt
import matplotlib.ticker
from matplotlib.patches import Rectangle
from matplotlib.colors import ListedColormap, BoundaryNorm

from venn import venn, pseudovenn
import seaborn as sns

from Bio import SeqIO, SeqUtils

from addict import Dict

import math

In [3]:
sys.path.append("../")
import plotting as my_plot

sys.path.append("./mean_field_elect/")
import proteomics_functions as prot_fun

# Prepare SWATH data

In [7]:
# Load the SWATH master table and add `desc_lower_2`, a normalized copy of the
# `name` column: the species tag is removed, the text is lower-cased, '-' and
# '_' become spaces, and commas are dropped.
# NOTE(review): presumably this is the key that prot_fun.get_name_matches
# compares against the literature tables -- confirm in proteomics_functions.
df_master = pd.read_csv('./generated_tables/swath_master_df_ppm.csv')

# Vectorized `.str` operations replace the former row-by-row iterrows() loop.
# `regex=False` keeps every pattern literal; the square brackets in the species
# tag would otherwise be parsed as a regex character class.
# Rows with a missing `name` now yield NaN instead of raising AttributeError.
df_master['desc_lower_2'] = (
    df_master['name']
    .str.replace(' [Cricetulus griseus]', '', regex=False)
    .str.lower()
    .str.replace('-', ' ', regex=False)
    .str.replace('_', ' ', regex=False)
    .str.replace(',', '', regex=False)
)

# High-risk HCPs 

In [6]:
# Generate the high-risk candidate list: keep the master-table rows that were
# matched to an entry of the high-risk literature table, copy that entry's
# annotation columns onto them, and save the result for manual review.
path = './lit_data/biophorum_high_risk_clean.csv'
df_lit = pd.read_csv(path)

# NOTE(review): this call is expected to add a `high_risk_name` column to
# df_master holding the matched literature name -- confirm in
# proteomics_functions.get_name_matches.
prot_fun.get_name_matches(df_master, path, 'high_risk')
df_risk = df_master[df_master.high_risk_name.notnull()].copy().reset_index(drop=True)

# One literature row per normalized name; keeping the first occurrence mirrors
# the `.iloc[0]` pick of the former row-by-row lookup.
lit_by_name = (
    df_lit
    .dropna(subset=['desc_lower_2'])
    .drop_duplicates(subset='desc_lower_2')
    .set_index('desc_lower_2')
)

# Fail with the offending names instead of an opaque positional IndexError.
unmatched = ~df_risk['high_risk_name'].isin(lit_by_name.index)
if unmatched.any():
    raise KeyError(
        f"high_risk_name values not found in {path}: "
        f"{df_risk.loc[unmatched, 'high_risk_name'].tolist()}"
    )

for col in ['impact', 'impact_type', 'function']:
    df_risk[col] = df_risk['high_risk_name'].map(lit_by_name[col])

df_risk.to_csv('./generated_tables/high_risk_swath_unreviewed.csv', index=False)

# Difficult to remove

**Re-run** the *Prepare SWATH data* section before running this section, so that `df_master` is reloaded from the CSV file.

In [8]:
# Run the name matching of df_master against every literature table, one call
# per (csv path, label) pair, in the order listed.
# NOTE(review): with name_append='' the helper apparently stores each result in
# a df_master column named exactly after the label (the next cell indexes
# df_master by these labels) -- confirm in proteomics_functions.
literature_sources = [
    ('./lit_data/biophorum_clean.csv', 'Biophorum DTR'),
    ('./lit_data/biophorum_high_risk_clean.csv', 'Biophorum high-risk'),
    ('./lit_data/levy_2014_clean.csv', 'Levy 2014 - Association'),
    ('./lit_data/zhang_table_2_clean.csv', 'Zhang 2016 - Pro A'),
    ('./lit_data/levy_2016_clean.csv', 'Levy 2016 - Coelution'),
    ('./lit_data/molden_clean.csv', 'Molden 2021 - 29 DS'),
    ('./lit_data/falkenberg_table_4_clean.csv', 'Falkenberg 2019 - 6 DS'),
    ('./lit_data/kreimer_table_1_clean.csv', 'Kreimer 2017 - 1 DS'),
    ('./lit_data/aboulaich_table_1_clean.csv', 'Aboulaich 2014 - Association'),
]

for lit_csv, paper_label in literature_sources:
    prot_fun.get_name_matches(df_master, lit_csv, paper_label, name_append='')

In [9]:
# Labels of the literature sources; each one is used below as a df_master
# column name.
papers = [
    'Biophorum DTR',
    'Biophorum high-risk',
    'Levy 2014 - Association',
    'Zhang 2016 - Pro A',
    'Levy 2016 - Coelution',
    'Molden 2021 - 29 DS',
    'Falkenberg 2019 - 6 DS',
    'Kreimer 2017 - 1 DS',
    'Aboulaich 2014 - Association',
]

# Keep every row that has a non-null entry in at least one paper column.
# Selecting by the whole `papers` list (instead of nine hand-indexed OR terms)
# means an entry added to the list can no longer be left out of the filter.
matched_in_any_paper = df_master[papers].notnull().any(axis=1)
df_dtr = df_master[matched_in_any_paper].reset_index(drop=True)

# Save the unreviewed candidate table.
df_dtr.to_csv('./generated_tables/difficult_to_remove_swath_unreviewed.csv', index=False)

# Ad hoc - after updating the ppm values 
Use this section when the ppm values are updated after the name matches have already been reviewed.

In [16]:
# Reload the current master table together with the two reviewed tables, which
# still hold the previous (wrong) ppm values.
df_master, df_risk, df_dtr = (
    pd.read_csv(csv_path)
    for csv_path in (
        './generated_tables/swath_master_df_ppm.csv',
        './generated_tables/high_risk_swath_reviewed_prev.csv',            # previous (wrong) ppm values
        './generated_tables/difficult_to_remove_swath_reviewed_prev.csv',  # previous (wrong) ppm values
    )
)

In [18]:
# Overwrite the ppm columns of both reviewed tables with the values from the
# re-generated master table, matching rows on the accession column `accn`.
ppm_cols = ['hccf_feed', 'hccf_large', 'hccf_small', 'hccf_mab', 'hccf_spf1', 'hccf_spf2',
            'pavin_feed', 'pavin_large', 'pavin_small']

# One master row per accession; keeping the first occurrence mirrors the
# `.iloc[0]` pick of the former row-by-row lookup.
master_by_accn = (
    df_master
    .dropna(subset=['accn'])
    .drop_duplicates(subset='accn')
    .set_index('accn')
)

for df_reviewed in (df_risk, df_dtr):
    # Fail with the offending accessions instead of an opaque positional
    # IndexError when a reviewed protein is absent from the master table.
    absent = ~df_reviewed['accn'].isin(master_by_accn.index)
    if absent.any():
        raise KeyError(
            f"accn values missing from df_master: {df_reviewed.loc[absent, 'accn'].tolist()}"
        )
    # Keyed lookup per column replaces the per-cell boolean-mask search.
    for col in ppm_cols:
        df_reviewed[col] = df_reviewed['accn'].map(master_by_accn[col])

In [19]:
# NOTE(review): both writes are commented out, so running this cell saves
# nothing. Uncommenting them would write df_risk and df_dtr to the "reviewed"
# CSV files named below.
# df_risk.to_csv('./generated_tables/high_risk_swath_reviewed.csv', index=False)
# df_dtr.to_csv('./generated_tables/difficult_to_remove_swath_reviewed.csv', index=False)

# Ad hoc - after receiving PAVIN mAb data

In [5]:
# Reload the current master table together with the two reviewed tables.
# NOTE(review): judging by the file names, the reviewed tables were saved
# before the PAVIN mAb data were available; the original trailing comments
# ("previous (wrong) ppm values") look copied from the earlier ad hoc section
# -- confirm.
df_master, df_risk, df_dtr = (
    pd.read_csv(csv_path)
    for csv_path in (
        './generated_tables/swath_master_df_ppm.csv',
        './generated_tables/high_risk_swath_reviewed_no_pavin_mAb.csv',
        './generated_tables/difficult_to_remove_swath_reviewed_no_pavin_mAb.csv',
    )
)

In [6]:
# Overwrite (or add) the ppm columns of both reviewed tables, now including
# `pavin_mab`, with the values from the master table, matching rows on the
# accession column `accn`.
ppm_cols = ['hccf_feed', 'hccf_large', 'hccf_small', 'hccf_mab', 'hccf_spf1', 'hccf_spf2',
            'pavin_feed', 'pavin_large', 'pavin_small', 'pavin_mab']

# One master row per accession; keeping the first occurrence mirrors the
# `.iloc[0]` pick of the former row-by-row lookup.
master_by_accn = (
    df_master
    .dropna(subset=['accn'])
    .drop_duplicates(subset='accn')
    .set_index('accn')
)

for df_reviewed in (df_risk, df_dtr):
    # Fail with the offending accessions instead of an opaque positional
    # IndexError when a reviewed protein is absent from the master table.
    absent = ~df_reviewed['accn'].isin(master_by_accn.index)
    if absent.any():
        raise KeyError(
            f"accn values missing from df_master: {df_reviewed.loc[absent, 'accn'].tolist()}"
        )
    # Keyed lookup per column replaces the per-cell boolean-mask search.
    for col in ppm_cols:
        df_reviewed[col] = df_reviewed['accn'].map(master_by_accn[col])

In [7]:
# NOTE(review): both writes are commented out, so running this cell saves
# nothing. Uncommenting them would write df_risk and df_dtr (with the
# `pavin_mab` column filled in above) to the "reviewed" CSV files named below.
# df_risk.to_csv('./generated_tables/high_risk_swath_reviewed.csv', index=False)
# df_dtr.to_csv('./generated_tables/difficult_to_remove_swath_reviewed.csv', index=False)