In [1]:
import pandas as pd

# read data
resist = pd.read_csv('~/data0118/scoary_summary/resist.csv', index_col = 0, header = 0)
gold_anno = pd.read_pickle('/home/hermuba/data0118/goldstandard/ec_rmplasmid_node_anno_df')

# subsetting genes 
resist_clusters = resist.columns.tolist()
resist_subset = gold_anno.loc[gold_anno['cluster'].isin(resist_clusters)]

In [2]:
resist_subset.shape # total 5340 resistant genes

(5340, 20)

In [13]:
resist_subset.index

Index(['562.10576.con.0026_20|562.10576', '562.10576.con.0029_10|562.10576',
       '562.10576.con.0029_16|562.10576', '562.10576.con.0029_19|562.10576',
       '562.10576.con.0029_44|562.10576', '562.10576.con.0029_46|562.10576',
       '562.10576.con.0029_6|562.10576', '562.10576.con.0033_20|562.10576',
       '562.10576.con.0037_1|562.10576', '562.10576.con.0041_12|562.10576',
       ...
       'NMWW01000332_5|562.19090', 'NMWW01000332_7|562.19090',
       'NMWW01000332_8|562.19090', 'NMWW01000332_9|562.19090',
       'NMWW01000333_10|562.19090', 'NMWW01000339_1|562.19090',
       'NMWW01000349_6|562.19090', 'NMWW01000392_6|562.19090',
       'NMWW01000413_7|562.19090', 'NMWW01000488_6|562.19090'],
      dtype='object', length=5340)

In [3]:
# how many has domain annotation: a lot better than GO --> might provide more clue than GO 
resist_subset['domain'].count()/resist_subset.shape[0]

0.61385767790262169

In [11]:
# when calculating domain network I had absense presence pattern of those domains for each gene
domain_abs_file = '~/data0118/domain/domain_abs_rm_plasmid'

domain_abs = pd.read_csv(domain_abs_file, header = 0, index_col = 0)

In [44]:
# domain names annotation from interpro
domain_anno_file = '~/data0118/interpro/all'
from Genome.goldstandard_pair.parse_interpro_out import parse
domain_anno = parse(domain_anno_file)

In [45]:
domain_anno = domain_anno.loc[domain_anno['ipr_accession'].notnull()]
domain_anno.drop_duplicates(subset = 'ipr_accession', inplace = True)
ipr_name_mapper = domain_anno.set_index('ipr_accession')['ipr_describe']

In [46]:
ipr_name_mapper.shape # map ipr accession to name

(8543,)

In [12]:
domain_abs.shape # super big file nah

(18601, 8543)

In [27]:
cont_df = pd.DataFrame(columns = domain_abs.columns, index = ['tp', 'tn', 'fp', 'fn', 'odds'])
no_resist_gene = resist_subset.shape[0]
no_genes = domain_abs.shape[0]
def contingency_table(ipr):
    tp = domain_abs.loc[resist_subset.index, ipr].sum() # how many resistant genes have that domain
    p = domain_abs[ipr].sum()
    fp = p - tp # with that domain, but not resistant gene
    
    n = no_genes - p # total genes without that domain
    fn = no_resist_gene - tp # resistant gene without that domain
    tn = n - fn
    odds = (tp+tn)/(fp+fn)
    cont_df.loc[:, ipr] = [tp, tn, fp, fn, odds]

In [28]:
# run for all domain
_ = [contingency_table(ipr) for ipr in domain_abs.columns]

In [29]:
cont_df

Unnamed: 0,IPR021908,IPR022369,IPR014730,IPR007495,IPR007420,IPR010574,IPR013584,IPR017162,IPR040761,IPR011789,...,IPR033718,IPR003774,IPR009288,IPR010917,IPR002792,IPR009308,IPR011130,IPR022737,IPR001661,IPR005074
tp,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,1.0
tn,13261.0,13260.0,13255.0,13260.0,13259.0,13260.0,13260.0,13261.0,13259.0,13260.0,...,13258.0,13260.0,13259.0,13245.0,13258.0,13260.0,13260.0,13260.0,13258.0,13249.0
fp,0.0,1.0,6.0,1.0,2.0,1.0,1.0,0.0,2.0,1.0,...,3.0,1.0,2.0,16.0,3.0,1.0,1.0,1.0,3.0,12.0
fn,5339.0,5339.0,5340.0,5340.0,5340.0,5340.0,5340.0,5339.0,5340.0,5340.0,...,5340.0,5340.0,5340.0,5335.0,5340.0,5340.0,5340.0,5340.0,5340.0,5339.0
odds,2.483986,2.483333,2.479424,2.482681,2.482029,2.482681,2.482681,2.483986,2.482029,2.482681,...,2.481378,2.482681,2.482029,2.476173,2.481378,2.482681,2.482681,2.482681,2.481378,2.476173


In [30]:
# narrow down
cont_df.loc[:, cont_df.loc['tp'] > 0]
# domain might be too rare to perform statistic analysis??

Unnamed: 0,IPR021908,IPR022369,IPR017162,IPR025955,IPR018117,IPR027477,IPR022532,IPR036442,IPR002053,IPR025285,...,IPR003615,IPR036667,IPR019995,IPR003663,IPR000551,IPR037171,IPR014735,IPR002104,IPR010917,IPR005074
tp,1.0,1.0,1.0,4.0,3.0,1.0,1.0,3.0,1.0,1.0,...,21.0,1.0,1.0,1.0,4.0,10.0,2.0,53.0,5.0,1.0
tn,13261.0,13260.0,13261.0,13252.0,13234.0,13253.0,13259.0,13250.0,13260.0,13249.0,...,13146.0,13252.0,13260.0,13255.0,13251.0,13211.0,13257.0,13082.0,13245.0,13249.0
fp,0.0,1.0,0.0,9.0,27.0,8.0,2.0,11.0,1.0,12.0,...,115.0,9.0,1.0,6.0,10.0,50.0,4.0,179.0,16.0,12.0
fn,5339.0,5339.0,5339.0,5336.0,5337.0,5339.0,5339.0,5337.0,5339.0,5339.0,...,5319.0,5339.0,5339.0,5339.0,5336.0,5330.0,5338.0,5287.0,5335.0,5339.0
odds,2.483986,2.483333,2.483986,2.480075,2.467748,2.478773,2.482681,2.478123,2.483333,2.476173,...,2.423077,2.478123,2.483333,2.480075,2.479424,2.457435,2.482029,2.403037,2.476173,2.476173


In [47]:
cont_df.loc['odds', :].sort_values(ascending = False).head() # not helpful at all, all be 2

IPR025668    2.487252
IPR003325    2.486598
IPR012933    2.485945
IPR008490    2.485945
IPR006142    2.485945
Name: odds, dtype: float64

In [56]:
# most prevalent domain among resistant genes
prev_domain = cont_df.loc['tp', :].sort_values(ascending = False) 
pd.concat([ipr_name_mapper, prev_domain[:20]], join = 'inner', axis = 1).sort_values(by = 'tp', ascending = False)# what are the top 100 prevalent domains

Unnamed: 0,ipr_describe,tp
IPR027417,P-loop containing nucleoside triphosphate hydr...,217.0
IPR036388,Winged helix-like DNA-binding domain superfamily,107.0
IPR036390,Winged helix DNA-binding domain superfamily,80.0
IPR009057,Homeobox-like domain superfamily,75.0
IPR010982,"Lambda repressor-like, DNA-binding domain supe...",74.0
IPR029063,S-adenosyl-L-methionine-dependent methyltransf...,58.0
IPR011010,"DNA breaking-rejoining enzyme, catalytic core",58.0
IPR013762,"Integrase-like, catalytic domain superfamily",55.0
IPR012337,Ribonuclease H-like superfamily,54.0
IPR002104,"Integrase, catalytic domain",53.0


In [62]:
# what domains are more predominantly in resistant genes
pred_domain = (cont_df.loc['tp', :]/cont_df.loc['fp', :]).sort_values(ascending = False)
pred_domain.name = 'tp_fp'
ratio = pd.concat([ipr_name_mapper, pred_domain, prev_domain.loc[pred_domain.index]], join = 'inner', axis = 1).sort_values(by = 'tp_fp', ascending = False)# what are the top 100 prevalent domains
ratio.loc[ratio['tp'] > 2]

Unnamed: 0,ipr_describe,tp_fp,tp
IPR021767,Transposon Tn21 modulator protein,inf,3.0
IPR004111,"Tetracycline repressor TetR, C-terminal",inf,3.0
IPR003012,"Tetracycline transcriptional regulator, TetR",inf,3.0
IPR013423,Conserved hypothetical protein CHP02594,inf,3.0
IPR020404,Protein of unknown function DUF2713,inf,3.0
IPR018602,Gp37/putative cytoplasmic protein STM4215,inf,3.0
IPR025140,Putative 2/3 transmembrane domain holin,inf,3.0
IPR003586,Hint domain C-terminal,inf,4.0
IPR030934,Intein C-terminal splicing region,inf,4.0
IPR006142,Intein,inf,4.0


In [None]:
cont_df.to_pickle('domain_cont_table')