In [31]:
### input: .hhr files from all vs all ECF comparison
### output: MCL clustering of ECF based on prob x qcov matrix
### output: Cytoscape ECF networks (full and limited to ECFs clustering together with CC-annotated ECFs)

In [32]:
import glob
import sys
import pandas as pd
sys.path.append('/Users/kszczepaniak/Code/phage-pipeline-env/phage-pipeline')
from lib_phage.ecf_finder_wrapper import load_and_filter_data
from csb.bio.io import HHOutputParser

# Base directory of the ECF comparison run. Sub-paths are built from it by plain
# string concatenation, so the trailing slash is required.
work_dir       = '/Users/kszczepaniak/Data/Phage/preliminary-results/ecf-compare/'
# Directory holding the hit table, Cytoscape files and MCL input/output used below.
output_dirpath = work_dir + 'output/prot-families/all-by-all/'
# Per-protein annotation table; it is read with sep='\t' further down.
# NOTE(review): all three are absolute, user-specific paths - consider a configurable base dir.
annot_filepath = '/Users/kszczepaniak/Data/Phage/Bogna/annotation.data.for.rafal.new.cov.0.8.txt'

In [106]:
### get all hhr data (parse to csv file)

def build_hhr_table(work_dir):
    """Build a table of results from hhr files.

    Every ``*.hhr`` file found in
    ``<work_dir>intermediate/prot-families/all-by-all`` is parsed (in sorted
    filename order) and each hit is written as one comma-separated line to
    ``<work_dir>output/prot-families/all-by-all/table-ecf-hhr.txt``.  The
    output file is overwritten and starts with a header line.

    Parameters
    ----------
    work_dir : str
        Base directory.  Sub-paths are appended by string concatenation, so it
        must end with a path separator.

    Returns
    -------
    None
        The table is only written to disk.

    FIXME: should go into package.
    """

    output_hhblits_dirpath = work_dir + 'intermediate/prot-families/all-by-all'
    hhr_table_filpath = '{}/table-ecf-hhr.txt'.format(work_dir + 'output/prot-families/all-by-all')

    # The context manager closes the table even if a .hhr file fails to parse,
    # so a partial run no longer leaks the open handle.
    with open(hhr_table_filpath, 'w') as ftable:
        ftable.write('qname,qstart,qend,qlength,sname,sstart,send,slength,pident,bitscore,eval,prob,pval\n') # write header

        for fhhr in sorted(glob.glob(output_hhblits_dirpath + '/*.hhr')):
            # query name = file name up to its first dot
            qname = fhhr.split('/')[-1].split('.')[0]
            parser = HHOutputParser()  # a fresh parser per file, as before
            for hit in parser.parse_file(fhhr):
                fields = [
                    qname, hit.qstart, hit.qend, hit.qlength,
                    'ecf_' + hit.id, hit.start, hit.end, hit.length,
                    int(hit.identity), hit.score, hit.evalue,
                    hit.probability * 100,  # stored as a percentage in the table
                    hit.pvalue,
                ]
                ftable.write(','.join(str(field) for field in fields) + '\n')
    
# (Re)generate table-ecf-hhr.txt on disk; the file is opened in 'w' mode, so any previous table is overwritten.
build_hhr_table(work_dir)

In [3]:
### read the hit table (table-ecf-hhr.txt) back into a DataFrame and display it
hhr_table = pd.read_csv(
    output_dirpath + 'table-ecf-hhr.txt',  # comma-separated, which is read_csv's default
)
hhr_table

Unnamed: 0,qname,qstart,qend,qlength,sname,sstart,send,slength,pident,bitscore,eval,prob,pval
0,ecf_reprseq00025_0,1,277,277,ecf_reprseq00025_0,1,277,277,45,958.0,6.000000e-146,100.0,2.000000e-149
1,ecf_reprseq00025_0,2,188,277,ecf_reprseq00025_1,5,142,138,69,67.9,9.800000e-07,96.7,3.300000e-10
2,ecf_reprseq00025_0,157,189,277,ecf_reprseq14277_0,65,95,31,23,51.6,2.400000e-04,94.5,8.000000e-08
3,ecf_reprseq00025_1,1,156,156,ecf_reprseq00025_1,1,156,156,70,420.2,2.500000e-69,100.0,8.400000e-73
4,ecf_reprseq00025_1,37,155,156,ecf_reprseq24083_2,3,116,114,6,187.2,1.300000e-29,99.9,4.500000e-33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1103877,ecf_reprseq34445_1,11,34,74,ecf_reprseq16773_0,16,39,24,4,49.6,1.700000e-04,94.7,5.600000e-08
1103878,ecf_reprseq34445_1,3,62,74,ecf_reprseq17440_0,51,128,78,0,41.8,7.200000e-04,93.4,2.500000e-07
1103879,ecf_reprseq34447_0,1,71,71,ecf_reprseq34447_0,1,71,71,89,189.3,8.300000e-37,100.0,2.800000e-40
1103880,ecf_reprseq34447_0,1,71,71,ecf_reprseq19584_0,5,75,71,24,174.5,2.800000e-33,99.9,9.700000e-37


In [4]:
### derive per-hit coverage metrics in a single assign
# qcov      - fraction of the query covered by the aligned region (inclusive coordinates)
# probxqcov - prob (stored as a percentage, hence the 0.01 factor) weighted by qcov
hhr_table = hhr_table.assign(
    qcov=lambda df: (df.qend - df.qstart + 1) / df.qlength,
    probxqcov=lambda df: df.prob * df.qcov * 0.01,
)

In [5]:
# DEBUG: list every hit recorded for one query ECF
# (to inspect self-hits only, filter on hhr_table['sname'] == hhr_table['qname'] instead)
hhr_table.loc[hhr_table['qname'].eq('ecf_reprseq20966_0')]

Unnamed: 0,qname,qstart,qend,qlength,sname,sstart,send,slength,pident,bitscore,eval,prob,pval,qcov,probxqcov
644743,ecf_reprseq20966_0,1,22,22,ecf_reprseq20966_0,1,22,22,100,63.3,1.3e-15,99.1,4.4e-19,1.0,0.991


In [7]:
### select interactions: prob of at least 95 AND at least half of the query covered
# (a p-value cut-off, hhr_table.pval < 1e-03, was tried as an alternative criterion)
hhr_table = hhr_table[(hhr_table.prob >= 95.0) & (hhr_table.qcov >= 0.5)]

# self-hits are kept: removing rows where sname == qname would discard
# singletons that only hit themselves
# hhr_table = hhr_table[hhr_table['sname'] != hhr_table['qname']]
hhr_table

Unnamed: 0,qname,qstart,qend,qlength,sname,sstart,send,slength,pident,bitscore,eval,prob,pval,qcov,probxqcov
0,ecf_reprseq00025_0,1,277,277,ecf_reprseq00025_0,1,277,277,45,958.0,6.000000e-146,100.0,2.000000e-149,1.000000,1.000000
1,ecf_reprseq00025_0,2,188,277,ecf_reprseq00025_1,5,142,138,69,67.9,9.800000e-07,96.7,3.300000e-10,0.675090,0.652812
3,ecf_reprseq00025_1,1,156,156,ecf_reprseq00025_1,1,156,156,70,420.2,2.500000e-69,100.0,8.400000e-73,1.000000,1.000000
4,ecf_reprseq00025_1,37,155,156,ecf_reprseq24083_2,3,116,114,6,187.2,1.300000e-29,99.9,4.500000e-33,0.762821,0.762058
5,ecf_reprseq00025_1,37,155,156,ecf_reprseq10300_0,1,117,117,12,172.3,7.500000e-27,99.8,2.600000e-30,0.762821,0.761295
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1103874,ecf_reprseq34445_1,2,73,74,ecf_reprseq11064_2,31,120,90,8,59.0,3.000000e-07,97.0,1.000000e-10,0.972973,0.943784
1103875,ecf_reprseq34445_1,3,59,74,ecf_reprseq23952_0,4,71,68,9,51.9,2.500000e-05,95.7,8.400000e-09,0.770270,0.737149
1103879,ecf_reprseq34447_0,1,71,71,ecf_reprseq34447_0,1,71,71,89,189.3,8.300000e-37,100.0,2.800000e-40,1.000000,1.000000
1103880,ecf_reprseq34447_0,1,71,71,ecf_reprseq19584_0,5,75,71,24,174.5,2.800000e-33,99.9,9.700000e-37,1.000000,0.999000


In [111]:
### save to SIF file for Cytoscape
# Every hit left in the table becomes an edge (all edges are equal in Cytoscape).
# One line per query ECF: "<query> pp <subject1> <subject2> ..."
output_dirpath = work_dir + 'output/prot-families/all-by-all/'

# `with` closes the file even if writing fails part-way through.
with open(output_dirpath + 'ecf_network.sif', 'w') as fsif:
    for eid, ecf in hhr_table.groupby('qname'):
        # join the subject names directly instead of growing a string row by row
        fsif.write(' '.join([eid, 'pp', *ecf['sname']]) + '\n')

In [9]:
### save annotations file - to be loaded as additional data by Cytoscape
### based on Bogna's annotation file

# load
annot = pd.read_csv(annot_filepath, sep='\t')

# process: every query ECF inherits the annotation row(s) of its parent protein.
# ECF ids have the form 'ecf_<reprseq>_<n>'; the protein id is the middle token.
# (Slicing with [4:-2] would silently produce a wrong id once <n> has two digits.)
annot_frames = [
    annot[annot.qname == eid.split('_')[1]].assign(qname=eid)
    for eid, _ in hhr_table.groupby('qname')
]

# Concatenate once: DataFrame.append in a loop copies the whole frame on every
# iteration and no longer exists in pandas >= 2.0.
if annot_frames:
    annot_ecfs = pd.concat(annot_frames, ignore_index=True)
else:
    annot_ecfs = annot.iloc[0:0]  # no ECFs: keep an empty frame with the annotation columns

# save
annot_ecfs = annot_ecfs.set_index('qname')
annot_ecfs.to_csv(output_dirpath + 'ecf_annot.txt')
annot_ecfs

Unnamed: 0_level_0,seq.length,BP.level.1,BP.level.2,BP.level.3,BP.level.4,BP.level.5,CC.level.1,CC.level.2,CC.level.3,MF.level.1,MF.level.2,phrog.class,phrog.annot
qname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
ecf_reprseq00025_0,288,,,,,,,,,,,,
ecf_reprseq00025_1,288,,,,,,,,,,,,
ecf_reprseq00027_0,82,,,,,,,,,,,transcription regulation| unknown,transcriptional regulator| transcriptional rep...
ecf_reprseq00027_1,82,,,,,,,,,,,transcription regulation| unknown,transcriptional regulator| transcriptional rep...
ecf_reprseq00027_2,82,,,,,,,,,,,transcription regulation| unknown,transcriptional regulator| transcriptional rep...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
ecf_reprseq34436_4,79,,,,,,,,,,,integration and excision,cox-like excisionase and repressor
ecf_reprseq34445_0,163,,,,,,,,,,,other| unknown,mazg-like pyrophosphatase| unknown function
ecf_reprseq34445_1,163,,,,,,,,,,,other| unknown,mazg-like pyrophosphatase| unknown function
ecf_reprseq34447_0,133,,,,,,,,,,,"moron, auxiliary metabolic gene and host takeover",hicb-like antitoxin| toxin-antitoxin system hi...


In [202]:
# save ECF interactions to csv readable to MCL
# interactions are weighted by prob x qcov parameter
# One "<query> <subject> <weight> " line per hit.  The space before the newline
# comes from joining '\n' as a fourth element and is kept so the file stays
# byte-identical to earlier runs.

# `with` closes the file even if a row fails to serialise; iterating the three
# columns directly avoids building a Series per row with iterrows().
with open(output_dirpath + 'ecf_mcl_in.csv', 'w') as fcsv:
    for qname, sname, weight in zip(hhr_table.qname, hhr_table.sname, hhr_table.probxqcov):
        fcsv.write(' '.join([qname, sname, str(weight), '\n']))

In [10]:
### EXTERNAL: run MCL [externally, it was done on LBS logan]

In [11]:
# get clusters from MCL results
# Each line of the dump is one cluster given as tab-separated ECF ids; the
# cluster id is the 0-based line number.  Blank lines are skipped (they would
# otherwise yield a bogus [''] cluster) without shifting the ids of other lines.
with open(output_dirpath + 'dump.data.mci.I14', 'r') as fmcl:
    ecf_clusters = {
        cid: clust.strip().split('\t')
        for cid, clust in enumerate(fmcl)
        if clust.strip()
    }

In [13]:
# DEBUG: show the member ECF ids of one example cluster
# (the key is the 0-based line number of that cluster in the MCL dump)
ecf_clusters[34]

['ecf_reprseq06659_0',
 'ecf_reprseq07217_3',
 'ecf_reprseq08117_0',
 'ecf_reprseq06762_1',
 'ecf_reprseq07894_0',
 'ecf_reprseq07442_0',
 'ecf_reprseq26546_1',
 'ecf_reprseq06762_0',
 'ecf_reprseq08238_1',
 'ecf_reprseq08279_0',
 'ecf_reprseq21396_0',
 'ecf_reprseq10157_0',
 'ecf_reprseq07395_1',
 'ecf_reprseq16762_0',
 'ecf_reprseq21055_0',
 'ecf_reprseq21055_1',
 'ecf_reprseq06935_0',
 'ecf_reprseq20932_0',
 'ecf_reprseq28708_0',
 'ecf_reprseq06708_0',
 'ecf_reprseq06921_0',
 'ecf_reprseq07259_2',
 'ecf_reprseq08818_0',
 'ecf_reprseq17280_1',
 'ecf_reprseq07682_1',
 'ecf_reprseq12325_0',
 'ecf_reprseq23066_0',
 'ecf_reprseq26947_0',
 'ecf_reprseq23666_1',
 'ecf_reprseq33960_0',
 'ecf_reprseq10690_1',
 'ecf_reprseq07217_1',
 'ecf_reprseq26473_0',
 'ecf_reprseq07682_0',
 'ecf_reprseq10190_0',
 'ecf_reprseq21972_0',
 'ecf_reprseq28250_1',
 'ecf_reprseq25923_0',
 'ecf_reprseq09669_1']

In [205]:
### table for Bogna (ECF clusters and their composition)
### one line per cluster: "<cluster id> <reprseq1>,<reprseq2>,..."
# Only the middle token of each ECF id ('ecf_<reprseq>_<n>') is written, so a
# reprseq appears once for every one of its ECFs in the cluster.

# `with` closes the file even if writing fails part-way through.
with open(output_dirpath + 'ecf_clusts_to_ecfs.txt', 'w') as fclust_prot:
    for cid, clust in ecf_clusters.items():
        reprseqs = [x.split('_')[1] for x in clust]
        fclust_prot.write(str(cid) + ' ' + ','.join(reprseqs) + '\n')

In [27]:
# select MCL clusters with ECFs from proteins annotated with GO CC terms

# prepare reprseq annot: keep proteins that have a value at any CC level
cc_columns      = ['CC.level.1', 'CC.level.2', 'CC.level.3']
annot_CC        = annot[annot[cc_columns].notnull().any(axis=1)]
annot_CC_qnames = annot_CC['qname'].to_list()
# set copy for O(1) membership tests (the list made every lookup a linear scan)
annot_CC_qname_set = set(annot_CC_qnames)

# keep a cluster as soon as any of its ECFs comes from a CC-annotated protein;
# any() stops at the first match, like the explicit break it replaces
protein_clusters_CC = {
    cid: clust
    for cid, clust in ecf_clusters.items()
    if any(ecf.split('_')[1] in annot_CC_qname_set for ecf in clust)
}

print(len(ecf_clusters)) # total clusters
print(len(protein_clusters_CC)) # clusters with ECF from CC annotated proteins
annot_CC

410
64


Unnamed: 0,qname,seq.length,BP.level.1,BP.level.2,BP.level.3,BP.level.4,BP.level.5,CC.level.1,CC.level.2,CC.level.3,MF.level.1,MF.level.2,phrog.class,phrog.annot
58,reprseq00059,360,viral process,viral life cycle,virion assembly,"virion assembly, unknown",,virion part,viral capsid,"viral capsid, unknown",molecular_function,catalytic activity| structural molecule activity,head and packaging,major head protein
59,reprseq00060,140,,,,,,virion part,viral capsid,"viral capsid, decoration",,,,
61,reprseq00062,552,viral process,viral life cycle,viral entry into host cell| virion assembly,"viral entry into host cell, unknown| virion as...",,virion part,viral capsid,viral portal complex,molecular_function,binding| structural molecule activity,head and packaging,portal protein
69,reprseq00070,748,viral process,viral life cycle,viral genome replication,,,virion part,"virion part, unknown",,molecular_function,binding| catalytic activity,,
123,reprseq00124,131,,,,,,virion part,viral capsid,"viral capsid, decoration",,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34339,reprseq34340,523,,,,,,virion part,viral capsid,"viral capsid, unknown",molecular_function,binding| catalytic activity,head and packaging,major head protein
34379,reprseq34380,321,,,,,,virion part,"virion part, unknown",,molecular_function,binding| catalytic activity,head and packaging,major head protein
34448,reprseq34449,345,viral process,viral life cycle,viral entry into host cell,"viral entry into host cell, unknown",,virion part,viral capsid,"viral capsid, unknown",,,head and packaging,portal protein
34451,reprseq34452,336,,,,,,virion part,viral capsid,"viral capsid, unknown",,,head and packaging,virion structural protein


In [25]:
# collect, once each, every ECF id that sits in a CC-associated cluster
lst_lst          = list(map(list, protein_clusters_CC.values()))
CC_reprseq_list  = list(set().union(*lst_lst)) # flatten and eliminate redundancy in one step
# "id1|id2|..." alternation pattern built from those ids
allowed_reprseqs = '|'.join(CC_reprseq_list)

In [26]:
# apply filter: keep records where either qname or sname is an ECF found in a
# cluster with at least one CC-annotated protein.
# Exact matching via isin() replaces str.contains() on a giant "a|b|c" regex:
# the regex matched substrings ('..._1' also hit '..._12'), matched every row
# when the id list was empty, and was slow on ~1M rows.
hhr_table_CC = hhr_table[
    hhr_table['qname'].isin(CC_reprseq_list) | hhr_table['sname'].isin(CC_reprseq_list)
]
hhr_table_CC

Unnamed: 0,qname,qstart,qend,qlength,sname,sstart,send,slength,pident,bitscore,eval,prob,pval,qcov,probxqcov
7574,ecf_reprseq00048_1,1,417,417,ecf_reprseq00048_1,1,417,417,57,727.2,1.800000e-97,100.0,7.000000e-101,1.000000,1.000000
7575,ecf_reprseq00048_1,1,415,417,ecf_reprseq00178_2,36,430,395,21,579.8,2.000000e-76,100.0,7.100000e-80,0.995204,0.995204
7576,ecf_reprseq00048_1,1,415,417,ecf_reprseq00536_2,1,409,409,20,538.6,5.300000e-71,100.0,1.900000e-74,0.995204,0.995204
7577,ecf_reprseq00048_1,2,415,417,ecf_reprseq27494_1,2,384,383,17,438.4,2.700000e-57,100.0,9.500000e-61,0.992806,0.992806
7578,ecf_reprseq00048_1,4,415,417,ecf_reprseq21183_0,1,372,372,23,436.6,3.100000e-57,100.0,1.100000e-60,0.988010,0.988010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1092457,ecf_reprseq34338_0,2,313,313,ecf_reprseq29920_0,1,279,279,12,263.0,1.300000e-35,100.0,4.400000e-39,0.996805,0.996805
1092462,ecf_reprseq34338_0,90,284,313,ecf_reprseq16822_0,49,176,128,6,140.6,3.200000e-17,99.3,1.100000e-20,0.623003,0.618642
1092477,ecf_reprseq34340_0,1,331,331,ecf_reprseq34340_0,1,331,331,74,805.8,1.000000e-114,100.0,4.000000e-118,1.000000,1.000000
1092478,ecf_reprseq34340_0,2,330,331,ecf_reprseq27098_1,11,288,278,17,428.6,9.300000e-60,100.0,3.100000e-63,0.993958,0.993958


In [28]:
### save to SIF file for Cytoscape (ONLY CC term)
# One line per query ECF: "<query> pp <subject1> <subject2> ..."
output_dirpath = work_dir + 'output/prot-families/all-by-all/'

# `with` closes the file even if writing fails part-way through.
with open(output_dirpath + 'ecf_network_CC.sif', 'w') as fsif:
    for eid, ecf in hhr_table_CC.groupby('qname'):
        # join the subject names directly instead of growing a string row by row
        fsif.write(' '.join([eid, 'pp', *ecf['sname']]) + '\n')

In [29]:
### save annotations file - to be loaded as additional data by Cytoscape (ONLY CC term)
### based on Bogna's annotation file

# Every query ECF inherits the annotation row(s) of its parent protein.
# ECF ids have the form 'ecf_<reprseq>_<n>'; the protein id is the middle token.
# (Slicing with [4:-2] would silently produce a wrong id once <n> has two digits.)
annot_frames_CC = [
    annot[annot.qname == eid.split('_')[1]].assign(qname=eid)
    for eid, _ in hhr_table_CC.groupby('qname')
]

# Concatenate once: DataFrame.append in a loop copies the whole frame on every
# iteration and no longer exists in pandas >= 2.0.
if annot_frames_CC:
    annot_ecfs_CC = pd.concat(annot_frames_CC, ignore_index=True)
else:
    annot_ecfs_CC = annot.iloc[0:0]  # no ECFs: keep an empty frame with the annotation columns

# save
annot_ecfs_CC = annot_ecfs_CC.set_index('qname')
annot_ecfs_CC.to_csv(output_dirpath + 'ecf_annot_CC.txt')
annot_ecfs_CC

Unnamed: 0_level_0,seq.length,BP.level.1,BP.level.2,BP.level.3,BP.level.4,BP.level.5,CC.level.1,CC.level.2,CC.level.3,MF.level.1,MF.level.2,phrog.class,phrog.annot
qname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
ecf_reprseq00048_1,961,,,,,,,,,,,tail,tail protein
ecf_reprseq00070_1,748,viral process,viral life cycle,viral genome replication,,,virion part,"virion part, unknown",,molecular_function,binding| catalytic activity,,
ecf_reprseq00178_0,1472,,,,,,,,,,,tail,tail protein
ecf_reprseq00178_1,1472,,,,,,,,,,,tail,tail protein
ecf_reprseq00178_2,1472,,,,,,,,,,,tail,tail protein
...,...,...,...,...,...,...,...,...,...,...,...,...,...
ecf_reprseq34223_0,742,,,,,,,,,,,,
ecf_reprseq34223_1,742,,,,,,,,,,,,
ecf_reprseq34290_0,522,,,,,,,,,,,,
ecf_reprseq34338_0,427,,,,,,,,,,,"dna, rna and nucleotide metabolism",hnh endonuclease


In [212]:
# DEBUG: look up the annotation of a single ECF by its id
# (annot_ecfs_CC is indexed by qname, so .loc takes the ECF id directly)
annot_ecfs_CC.loc['ecf_reprseq16264_0']

seq.length                          309
BP.level.1                          NaN
BP.level.2                          NaN
BP.level.3                          NaN
BP.level.4                          NaN
BP.level.5                          NaN
CC.level.1                  virion part
CC.level.2                 viral capsid
CC.level.3     viral capsid, decoration
MF.level.2                          NaN
phrog.class          head and packaging
phrog.annot    hoc-like head decoration
Name: ecf_reprseq16264_0, dtype: object

In [125]:
### DEPRECATED: initial table for Bogna (misunderstanding of required data)
# one line per cluster: "<cluster id> prot1,prot2,...,protx"

# load table with ECF data (protein composition)
ecf80_path = '/Users/kszczepaniak/Data/Phage/preliminary-results/table-ecf-aho-80.csv'
ecf_80_table = pd.read_csv(ecf80_path)

# for each cluster: get all ECFs and retrieve the proteins they comprise

protein_clusters = {}

for cid, clust in ecf_clusters.items():
    seqs_list = []
    for ecf in clust:
        seqid = ecf.replace('ecf_', '')
        # 'hits' holds the ';'-separated proteins of the first row matching this ECF id
        prots = ecf_80_table[ecf_80_table.ecf_id == seqid].hits.to_list()[0].split(';')
        seqs_list += prots
    protein_clusters[cid] = set(seqs_list)

# store the table; `with` closes the file even if writing fails, and the
# proteins are sorted so the output no longer depends on set iteration order
with open(output_dirpath + 'ecf_clusters_proteins.txt', 'w') as fclust_prot:
    for cid, prots in protein_clusters.items():
        fclust_prot.write(str(cid) + ' ' + ','.join(sorted(prots)) + '\n')