In [2]:
import os
import csv
import pandas as pd
from collections import Counter, OrderedDict
from Bio import SeqIO
from itertools import combinations, product

# Counts

In [9]:
# Load main_co_occur.csv into two lookups:
#   clusters:     value of column 0 -> list of domain ids, one entry per data row
#   descriptions: domain id -> description text (the last row seen for an id wins)
clusters = {}
descriptions = {}

# Column positions read for the domain id and its description.
DOMAIN_COL = 8
DESCRIPTION_COL = 9

# newline='' is what the csv module asks for, so that quoted fields containing
# line breaks are parsed correctly.
with open('lpp/blast_rodeo/blast_rodeo_out/peptidase_RODEO/main_co_occur.csv', 'r', newline='') as handle:
    next(handle, None)  # skip the header line; an empty file simply yields no rows
    for row in csv.reader(handle):
        if len(row) <= 1:
            continue  # blank or single-field line carries no record

        # Index the domain columns only when both are present; rows that are
        # too short fall back to 'no_match' instead of raising IndexError.
        if len(row) > DESCRIPTION_COL:
            dom = row[DOMAIN_COL]
            description = row[DESCRIPTION_COL]
        else:
            dom = 'no_match'
            description = 'No matching domains found'
        descriptions[dom] = description

        clusters.setdefault(row[0], []).append(dom)

In [13]:
def count(clusters):
    """Tally how often each item occurs across all value lists of ``clusters``.

    clusters: mapping of key -> iterable of hashable items.
    Returns a Counter keyed by item; repeated items within one list are all counted.
    """
    return Counter(item for members in clusters.values() for item in members)

In [15]:
def cocount(clusters):
    """Count, for each unordered pair of distinct items, how many value lists contain both.

    clusters: mapping of key -> iterable of hashable, mutually orderable items.
    Returns a Counter keyed by 2-tuples whose elements are in sorted order.
    Duplicates inside one list are collapsed first, so a list contributes at
    most 1 to any pair.
    """
    tally = Counter()
    for members in clusters.values():
        unique = set(members)
        tally.update(tuple(sorted(pair)) for pair in combinations(unique, 2))
    return tally

In [17]:
# Domain tallies over every entry loaded into `clusters`:
# `counter` holds per-domain occurrence counts, `cocounter` per-pair co-occurrence counts.
counter = count(clusters)
cocounter = cocount(clusters)

In [19]:
# The same tallies computed again from `clusters` and bound to lpp_-prefixed names
# (fresh Counter objects, not aliases of counter/cocounter).
lpp_counter = count(clusters)
lpp_cocounter = cocount(clusters)

In [21]:
# Per-domain count table, highest count first, with the description attached.
# Building from (domain, count) rows with explicit column names keeps this cell
# working when `counter` is empty: the result is an empty frame that still has
# the expected columns, rather than a length-mismatch error on column renaming.
df = pd.DataFrame(list(counter.items()), columns=['Domain', 'Count'])
df = df.sort_values(by='Count', ascending=False)
# Domains missing from `descriptions` get NaN here (Series.map with a dict).
df['Description'] = df['Domain'].map(descriptions)

In [23]:
# Display the per-domain count table.
df

Unnamed: 0,Domain,Count,Description
6,no_match,661,No matching domains found
9,TIGR02228,192,sigpep_I_arch: signal peptidase I
8,PqqD,186,Coenzyme PQQ synthesis protein D (PqqD)
11,NTP_transf_5,156,Uncharacterised nucleotidyltransferase
10,TIGR02203,104,MsbA_lipidA: lipid A export permease/ATP-bindi...
...,...,...,...
416,TIGR00280,1,eL43_euk_arch: ribosomal protein eL43
414,TIGR01750,1,fabZ: beta-hydroxyacyl-(acyl-carrier-protein) ...
412,TIGR02488,1,flgG_G_neg: flagellar basal-body rod protein FlgG
411,TIGR02490,1,flgF: flagellar basal-body rod protein FlgF


In [25]:
# Pairwise co-occurrence table, most frequent pair first.
# Column names are passed to the constructor so that an empty `cocounter`
# produces an empty frame with the right columns instead of failing on the
# column assignment.
codf = pd.DataFrame(
    [(pair[0], pair[1], n_shared) for pair, n_shared in cocounter.items()],
    columns=['Domain 1', 'Domain 2', 'Co-occurrence'],
)
codf = codf.sort_values(by='Co-occurrence', ascending=False)

In [27]:
codf

Unnamed: 0,Domain 1,Domain 2,Co-occurrence
88,TIGR02228,no_match,183
78,PqqD,no_match,143
70,PqqD,TIGR02228,142
84,NTP_transf_5,TIGR02228,120
118,NTP_transf_5,no_match,119
...,...,...,...
5342,Asparaginase_II,PqqD,1
5341,Asparaginase_II,YbjN,1
5340,Asparaginase_II,SNARE_assoc,1
5339,Asparaginase_II,BPD_transp_1,1


In [33]:
# Write both tables into a single workbook, one sheet per table, without the index column.
sheets = {'Domain count': df, 'Domain co-occurrence': codf}
with pd.ExcelWriter('counts.xlsx', engine='xlsxwriter') as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

# Domain clusters

In [36]:
# Load the ';'-separated domain clustering table: the first field of each row
# becomes the key, and the non-empty values among the first 8 fields (the key
# itself included) become its member list.
dom_cl = {}
# newline='' is what the csv module asks for when handing it a file object.
with open('dom_cluster.csv', 'r', newline='') as handle:
    for row in csv.reader(handle, delimiter=';'):
        if not row:
            continue  # csv.reader yields [] for a blank line; row[0] would raise IndexError
        dom_cl[row[0]] = [field for field in row[:8] if field != '']

In [40]:
# Every domain id mentioned in the clustering table: all keys plus all members.
dom_cl_flat = set(dom_cl).union(*dom_cl.values())

In [54]:
# Reverse lookup: member domain -> key it is listed under in dom_cl.
# If a member is listed under several keys, the last key in dom_cl's iteration order wins.
dom_cl_rev = {
    member: key
    for key, members in dom_cl.items()
    for member in members
}

In [56]:
# Same structure as `clusters`, with each domain replaced by its dom_cl key
# when it has one; domains without an entry in dom_cl_rev are kept unchanged.
clusters_domcl = {
    cluster_id: [dom_cl_rev.get(domain, domain) for domain in domains]
    for cluster_id, domains in clusters.items()
}

In [58]:
# Occurrence and co-occurrence tallies after mapping domains onto their dom_cl keys.
counter_domcl = count(clusters_domcl)
cocounter_domcl = cocount(clusters_domcl)

In [60]:
# The same tallies computed again from `clusters_domcl` and bound to lpp_-prefixed names
# (fresh Counter objects, not aliases of counter_domcl/cocounter_domcl).
lpp_counter_domcl = count(clusters_domcl)
lpp_cocounter_domcl = cocount(clusters_domcl)

In [62]:
# Count table for the dom_cl-mapped domains, highest count first.
# Explicit column names in the constructor keep this working for an empty
# `counter_domcl` (empty frame with the expected columns instead of a
# length-mismatch error on column renaming).
df_domcl = pd.DataFrame(list(counter_domcl.items()), columns=['Domain', 'Count'])
df_domcl = df_domcl.sort_values(by='Count', ascending=False)
# Names that have no entry in `descriptions` get NaN here (Series.map with a dict).
df_domcl['Description'] = df_domcl['Domain'].map(descriptions)

In [64]:
# Pairwise co-occurrence table for the dom_cl-mapped domains, most frequent pair first.
# Column names are passed to the constructor so that an empty `cocounter_domcl`
# gives an empty frame with the right columns instead of failing on the
# column assignment.
codf_domcl = pd.DataFrame(
    [(pair[0], pair[1], n_shared) for pair, n_shared in cocounter_domcl.items()],
    columns=['Domain 1', 'Domain 2', 'Co-occurrence'],
)
codf_domcl = codf_domcl.sort_values(by='Co-occurrence', ascending=False)

In [66]:
# Write both dom_cl-mapped tables into a single workbook, one sheet per table, without the index column.
sheets_domcl = {'Domain count': df_domcl, 'Domain co-occurrence': codf_domcl}
with pd.ExcelWriter('counts_domcl.xlsx', engine='xlsxwriter') as writer:
    for sheet_name, frame in sheets_domcl.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

# Per connected component

In [65]:
# Node table read from CSV; its 'id' and '__ccCluster' columns are used to build cc_rev.
net = pd.read_csv('peptidase sequence pruned default node.csv')

In [69]:
# Map each node 'id' to the value in its '__ccCluster' column.
cc_rev = net.set_index('id')['__ccCluster'].to_dict()

In [73]:
# Invert cc_rev: '__ccCluster' value -> list of node ids carrying that value.
cc = {}
for node_id, component in cc_rev.items():
    cc.setdefault(component, []).append(node_id)

In [199]:
def tigr_name(x, descriptions):
    """Return a display name for domain id ``x``.

    For ids starting with 'TIGR' this is the part of ``descriptions[x]`` before
    the first ': ' (the whole description if there is no ': '); a missing id
    raises KeyError. Any other id is returned unchanged.
    """
    if not x.startswith('TIGR'):
        return x
    return descriptions[x].partition(': ')[0]

In [None]:
def count_filter(filter, clusters):
    """Tally items across the value lists of ``clusters`` whose key is in ``filter``.

    filter:   collection of keys to keep. The name shadows the builtin but is
              kept so existing keyword callers continue to work.
    clusters: mapping of key -> iterable of hashable items.
    Returns a Counter keyed by item; keys of ``clusters`` that are not in
    ``filter`` contribute nothing.
    """
    # Hash the wanted keys once so each membership test is a constant-time
    # lookup rather than a scan of the whole collection per key of ``clusters``.
    try:
        wanted = frozenset(filter)
    except TypeError:
        wanted = filter  # unhashable members: fall back to the plain ``in`` test
    return Counter(
        item
        for key, members in clusters.items()
        if key in wanted
        for item in members
    )

In [None]:
def cocount_filter(filter, clusters):
    """Count co-occurring item pairs over the value lists of ``clusters`` whose key is in ``filter``.

    filter:   collection of keys to keep. The name shadows the builtin but is
              kept so existing keyword callers continue to work.
    clusters: mapping of key -> iterable of hashable, mutually orderable items.
    Returns a Counter keyed by 2-tuples in sorted order. Duplicates inside one
    list are collapsed first, so a list contributes at most 1 to any pair.
    """
    # Hash the wanted keys once so each membership test is a constant-time
    # lookup rather than a scan of the whole collection per key of ``clusters``.
    try:
        wanted = frozenset(filter)
    except TypeError:
        wanted = filter  # unhashable members: fall back to the plain ``in`` test

    tally = Counter()
    for key, members in clusters.items():
        if key not in wanted:
            continue
        unique = set(members)
        tally.update(tuple(sorted(pair)) for pair in combinations(unique, 2))
    return tally

In [201]:
# Per-component domain statistics.
#   count_per_cc:   'cc<component>_<n>' -> [domain, tigr_name(domain), description, component,
#                   count within component, that count / len(component members),
#                   overall count from counter_domcl]
#   cocount_per_cc: (renamed id, renamed id) -> [co-occurrence within component,
#                   that co-occurrence / len(component members)]
count_per_cc = {}
cocount_per_cc = {}
for cc_id, members in cc.items():
    # Loop-local names on purpose: binding `counter`/`cocounter` here would
    # overwrite the notebook-wide tallies computed from `clusters`, so the
    # earlier table cells would give different results if re-run afterwards.
    cc_counter = count_filter(members, clusters_domcl)
    cc_cocounter = cocount_filter(members, clusters_domcl)
    cc_size = len(members)

    # Give every domain seen in this component a component-scoped id, numbered
    # from 1 in the Counter's iteration order.
    rename = {}
    for n, domain in enumerate(cc_counter, start=1):
        rename[domain] = 'cc' + str(cc_id) + '_' + str(n)
        count_per_cc[rename[domain]] = [
            domain,
            tigr_name(domain, descriptions),
            descriptions[domain],
            cc_id,
            cc_counter[domain],
            cc_counter[domain] / cc_size,
            counter_domcl[domain],
        ]

    # Key the co-occurrence rows by the component-scoped ids assigned above.
    for pair in cc_cocounter:
        tuple_renamed = tuple(rename[dom] for dom in pair)
        cocount_per_cc[tuple_renamed] = [cc_cocounter[pair], cc_cocounter[pair] / cc_size]

In [207]:
# Per-component domain table, ordered by overall count (highest first).
# NOTE: this rebinds `df`, which earlier held the overall per-domain count table.
# Rows are built with explicit column names so an empty `count_per_cc` yields an
# empty frame with the expected columns instead of a length-mismatch error.
df = pd.DataFrame(
    [[cc_domain_id, *fields] for cc_domain_id, fields in count_per_cc.items()],
    columns=['cc_id', 'domain', 'domain_name', 'description', 'cc', 'cc_count', 'norm_cc_count', 'total_count'],
)
df = df.sort_values(by='total_count', ascending=False)

In [209]:
# Display the per-component domain table.
df

Unnamed: 0,cc_id,domain,domain_name,description,cc,cc_count,norm_cc_count,total_count
433,cc36_2,no_match,no_match,No matching domains found,36,3,3.000000,661
1035,cc8_1,no_match,no_match,No matching domains found,8,8,4.000000,661
889,cc21_2,no_match,no_match,No matching domains found,21,10,10.000000,661
975,cc5_7,no_match,no_match,No matching domains found,5,31,6.200000,661
1066,cc16_9,no_match,no_match,No matching domains found,16,1,1.000000,661
...,...,...,...,...,...,...,...,...
490,cc4_26,HTH_38,HTH_38,Helix-turn-helix domain,4,1,0.166667,1
506,cc4_42,TIGR03990,Arch_GlmM,Arch_GlmM: phosphoglucosamine mutase,4,1,0.166667,1
507,cc4_43,End_N_terminal,End_N_terminal,N terminal extension of bacteriophage endosial...,4,1,0.166667,1
508,cc4_44,polyprenyl_synt,polyprenyl_synt,Polyprenyl synthetase,4,1,0.166667,1


In [211]:
# Per-component co-occurrence table, most frequent pair first.
# NOTE: this rebinds `codf`, which earlier held the overall co-occurrence table.
# Column names are passed to the constructor so an empty `cocount_per_cc` yields
# an empty frame with the expected columns instead of failing on the column assignment.
codf = pd.DataFrame(
    [(pair[0], pair[1], stats[0], stats[1]) for pair, stats in cocount_per_cc.items()],
    columns=['cc_id 1', 'cc_id 2', 'co-occurrence', 'norm_co-occurrence'],
)
codf = codf.sort_values(by='co-occurrence', ascending=False)

In [213]:
# Display the per-component co-occurrence table.
codf

Unnamed: 0,cc_id 1,cc_id 2,co-occurrence,norm_co-occurrence
92,cc1_10,cc1_7,120,0.944882
8,cc1_10,cc1_8,115,0.905512
6,cc1_8,cc1_7,113,0.889764
100,cc1_9,cc1_10,111,0.874016
7,cc1_9,cc1_8,110,0.866142
...,...,...,...,...
5133,cc1_348,cc1_10,1,0.007874
5132,cc1_340,cc1_345,1,0.007874
5131,cc1_348,cc1_340,1,0.007874
5130,cc1_340,cc1_13,1,0.007874


In [215]:
# Write both per-component tables into a single workbook, one sheet per table, without the index column.
sheets_cc = {'Domain count': df, 'Domain co-occurrence': codf}
with pd.ExcelWriter('cc_counts.xlsx', engine='xlsxwriter') as writer:
    for sheet_name, frame in sheets_cc.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

In [216]:
# Also export the current `df` and `codf` frames as plain CSV files, without the index column.
df.to_csv('cc_counts.csv', index=False)
codf.to_csv('cc_cocounts.csv', index=False)