In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Stretch the notebook container to the full browser width.
# `display`, `HTML` and `clear_output` are imported from the public
# `IPython.display` module; importing them from `IPython.core.display`
# is deprecated and fails on recent IPython releases.
from IPython.display import display, HTML, clear_output
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
import pandas as pd
import numpy as np
from scipy import optimize
from scipy.stats import sem

import os
import sys

import matplotlib.pyplot as plt
import matplotlib.ticker
from matplotlib.patches import Rectangle
from matplotlib.colors import ListedColormap, BoundaryNorm

from venn import venn, pseudovenn
import seaborn as sns

from Bio import SeqIO, SeqUtils

from addict import Dict

import math

In [4]:
# Make the local helper modules (`plotting`, `proteomics_functions`) importable.
# The directory defaults to the original hard-coded location but can be
# overridden with the PYTHON_FUNCTIONS_DIR environment variable, so the notebook
# can run on another machine without editing this cell.
helper_dir = os.environ.get("PYTHON_FUNCTIONS_DIR", "/home/chase/codes/python_functions/")
if helper_dir not in sys.path:  # avoid piling up duplicates when the cell is re-run
    sys.path.append(helper_dir)
import plotting as my_plot

import proteomics_functions as prot_fun

# Prepare SWATH data

In [23]:
# Load the SWATH master table and derive a normalised protein description,
# `desc_lower_2`, from the `name` column:
#   1. drop the ' [Cricetulus griseus]' species tag,
#   2. lower-case,
#   3. turn '-' and '_' into spaces,
#   4. remove commas.
# NOTE(review): presumably this is the key the literature tables are matched
# on (they carry a `desc_lower_2` column as well) - confirm in prot_fun.
df_master = pd.read_csv('./generated_tables/swath_master_df_ppm.csv')

# Vectorised string ops replace the row-by-row loop. `regex=False` is required:
# the species tag contains '[' and ']', which are regex metacharacters.
# Missing names propagate as NaN instead of raising AttributeError.
df_master['desc_lower_2'] = (
    df_master['name']
    .str.replace(' [Cricetulus griseus]', '', regex=False)
    .str.lower()
    .str.replace('-', ' ', regex=False)
    .str.replace('_', ' ', regex=False)
    .str.replace(',', '', regex=False)
)

# High-risk HCPs 

In [9]:
# # Generate the high-risk candidate list
# path = './lit_data/biophorum_high_risk_clean.csv'
# df_lit = pd.read_csv(path)
# prot_fun.get_name_matches(df_master, path, 'high_risk')
# df_risk = df_master[df_master.high_risk.notnull()].copy()
# df_risk.reset_index(drop=True, inplace=True)

# for i, cont in df_risk.iterrows():
#     for col in ['impact', 'impact_type', 'function']:
#         data = df_lit.loc[df_lit.desc_lower_2 == cont.high_risk, col].iloc[0] 
#         df_risk.at[i, col] = data
        
# df_risk.to_csv('./generated_tables/high_risk_swath_unreviewed.csv', index=False)

# Difficult-to-remove HCPs

In [24]:
# Run the name matching once per literature table, in the same order as before.
# Each entry pairs a cleaned literature CSV with the label passed to
# `prot_fun.get_name_matches`.
# NOTE(review): the helper appears to add a column named after the label to
# `df_master` in place (later cells read those columns) - confirm in prot_fun.
literature_tables = [
    ('./lit_data/biophorum_clean.csv', 'Biophorum DTR'),
    ('./lit_data/biophorum_high_risk_clean.csv', 'Biophorum high-risk'),
    ('./lit_data/levy_2014_clean.csv', 'Levy 2014 - Association'),
    ('./lit_data/zhang_table_2_clean.csv', 'Zhang 2016 - Pro A'),
    ('./lit_data/levy_2016_clean.csv', 'Levy 2016 - Coelution'),
    ('./lit_data/molden_clean.csv', 'Molden 2021 - 29 DS'),
    ('./lit_data/falkenberg_table_4_clean.csv', 'Falkenberg 2019 - 6 DS'),
    ('./lit_data/kreimer_table_1_clean.csv', 'Kreimer 2017 - 1 DS'),
    ('./lit_data/aboulaich_table_1_clean.csv', 'Aboulaich 2014 - Association'),
]

for lit_path, column_label in literature_tables:
    prot_fun.get_name_matches(df_master, lit_path, column_label)

In [27]:
# Literature sources; each label is also the name of a column in `df_master`.
papers = ['Biophorum DTR', 'Biophorum high-risk', 'Levy 2014 - Association', 'Zhang 2016 - Pro A', 
          'Levy 2016 - Coelution', 'Molden 2021 - 29 DS', 'Falkenberg 2019 - 6 DS', 'Kreimer 2017 - 1 DS',
          'Aboulaich 2014 - Association']

# Keep every protein that has a non-null entry for at least one source.
# Deriving the mask from `papers` means a source added to the list is picked up
# automatically, instead of needing another hand-written `|` term.
in_any_paper = df_master[papers].notnull().any(axis=1)

# To make the review easier - this requires manual attention
# Column order for the review sheet: description first, then one column per
# source (same order as `papers`), then the match flags, identifiers and the
# per-sample columns.
review_columns = (
    ['desc_lower_2']
    + papers
    + ['perfect_match', 'contains_ubiquitin', 'contains_actin',
       'accn', 'name', 'cq_a5', 'cq_b12', 'cq_d9', 'cq_g12', 'cq_eluate',
       'xq_a5', 'xq_b12', 'xq_d9', 'xq_g12', 'xq_eluate', 'hq_a5', 'hq_b12',
       'hq_d9', 'hq_g12', 'hq_eluate']
)

# Sort first and reset the index afterwards, so `df_dtr` ends up with a clean
# 0..n-1 index in sorted order. Row order, and therefore the CSV, is unchanged.
df_dtr = (
    df_master.loc[in_any_paper]
    .sort_values(by=['perfect_match', 'contains_ubiquitin', 'contains_actin', 'desc_lower_2'])
    .reset_index(drop=True)
    .loc[:, review_columns]
)

df_dtr.to_csv('./generated_tables/difficult_to_remove_swath_unreviewed.csv', index=False)

In [42]:
# Add risk data - do this before manual review next time
#
# Copies the `impact`, `impact_type` and `function` columns of the BioPhorum
# high-risk table onto the manually reviewed difficult-to-remove table. Rows
# are linked through the 'Biophorum high-risk' column of the reviewed table,
# which holds the `desc_lower_2` value of the matching literature entry.

# Spelled out here (same value as papers[1] in the cell that builds df_dtr) so
# this cell runs on a fresh kernel without depending on that cell.
high_risk_col = 'Biophorum high-risk'
risk_cols = ['impact', 'impact_type', 'function']

df = pd.read_csv('./generated_tables/difficult_to_remove_swath_reviewed_no_risk_data.csv')
df_lit = pd.read_csv('./lit_data/biophorum_high_risk_clean.csv')

# Rows of the reviewed table that matched a high-risk entry.
# (Variable name kept as-is in case other cells reference it.)
risk_indeces = df[df[high_risk_col].notnull()].index
risk_names = df.loc[risk_indeces, high_risk_col]

# One lookup row per literature name. Keeping the first occurrence reproduces
# the `.iloc[0]` choice of the original per-row search.
lit_lookup = (
    df_lit.dropna(subset=['desc_lower_2'])
    .drop_duplicates(subset='desc_lower_2')
    .set_index('desc_lower_2')
)

# Fail loudly, and say which names are affected, when the manual review left a
# name that no longer exists in the literature table. Previously this surfaced
# as an uninformative IndexError.
unmatched = risk_names[~risk_names.isin(lit_lookup.index)].unique().tolist()
if unmatched:
    raise KeyError(f'No entry in biophorum_high_risk_clean.csv for: {unmatched}')

# Fill only the high-risk rows; all other rows stay NaN in the new columns.
for col in risk_cols:
    df.loc[risk_indeces, col] = risk_names.map(lit_lookup[col])
        
# df.to_csv('./generated_tables/difficult_to_remove_swath_reviewed.csv', index=False)

# Save high-risk as a separate dataframe for the article

In [9]:
# df = pd.read_csv('./generated_tables/difficult_to_remove_swath_reviewed.csv')
# df[df['Biophorum high-risk'].notnull()].to_csv('./generated_tables/just_high_risk_swath_reviewed.csv')