In [1]:
import gzip
import subprocess
from collections import namedtuple
from pathlib import Path

# external modules 
import requests
import pandas as pd
from Bio import SeqIO

In [87]:
# Input locations (relative paths, resolved against the notebook's working directory).
# Protein multi-FASTA whose headers are parsed by cassettes_headers_to_df.
multifasta_proteins = '../../results/cassettes/clusters/merged_cassette_proteins.faa.gz'
# Tab-separated cluster table; read below with columns named representatives / members.
clusters = '../../results/cassettes/clusters/mmseqs2_cluster.tsv'
# Directory searched recursively for '*.gbk' annotation files.
annotation_dir = '../../results/Prokka/'
# Directory searched for '*merged_defense_systems_prediction.csv' files.
defense_system_prediction_dir = '../../results/cassettes/extracted/'

In [3]:
def cassettes_headers_to_df(multifasta_proteins):
    """Parse the FASTA headers of a (gzipped or plain) multi-FASTA into a DataFrame.

    Every header line must have the form ``<contig>:<cassette>:<locus_tag>``.

    Parameters
    ----------
    multifasta_proteins : str or path-like
        Path to the multi-FASTA file. Gzip compression is detected from the
        file's magic bytes, so plain-text files are accepted as well.

    Returns
    -------
    pandas.DataFrame
        One row per header with the columns ``contigs``, ``cassettes``,
        ``locus_tags``, ``headers`` (the full header without the leading
        ``>``) and ``genome_ids`` (the contig name with its first and last
        ``_``-separated fields removed). A file without headers gives an
        empty frame with the same columns.

    Raises
    ------
    AssertionError
        If a header does not split into exactly three ``:``-separated fields.
    """
    columns = ['contigs', 'cassettes', 'locus_tags', 'headers']

    # Sniff the gzip magic number instead of trusting the file extension.
    with open(multifasta_proteins, 'rb') as probe:
        is_gzipped = probe.read(2) == b'\x1f\x8b'
    opener = gzip.open if is_gzipped else open

    data = []
    # Stream the file line by line; only header lines are kept in memory.
    with opener(multifasta_proteins, 'rt', encoding='UTF-8') as handle:
        for line in handle:
            if not line.startswith('>'):
                continue
            header = line[1:].rstrip('\r\n')
            if not header:
                # A bare '>' carries no identifier; ignore it.
                continue
            rows = header.split(':')
            assert len(rows) == 3, 'Header {} is invalid !'.format(header)
            rows.append(header)
            data.append(rows)

    df = pd.DataFrame(data=data, columns=columns)
    # 'Contig_936157.3_6' -> '936157.3': drop the 'Contig' prefix and the contig index.
    genome_ids = df['contigs'].map(lambda contig: '_'.join(contig.split('_')[1:-1]))
    return df.assign(genome_ids=genome_ids)

In [4]:
# One row per protein header found in the multi-FASTA.
headers_df = cassettes_headers_to_df(multifasta_proteins)

# Header-less cluster table: first column -> representatives, second -> members.
clusters_raw = pd.read_csv(clusters, sep='\t', header=None)
clusters_df = clusters_raw.rename(columns={0: 'representatives', 1: 'members'})

In [5]:
# Count mmseqs clusters with more than one sequence
# (the "Length" value at the bottom of the output is the number of such clusters).
cluster_sizes = clusters_df.groupby('representatives')['representatives'].count()
cluster_sizes.sort_values(ascending=False).where(lambda sizes: sizes > 1).dropna()

representatives
Contig_1395103.4_47:Cassette_1:KDHGNBDG_03652    14.0
Contig_936157.3_23:Cassette_1:IOMDNFNB_01075     12.0
Contig_936157.3_23:Cassette_1:IOMDNFNB_01077     12.0
Contig_936157.3_62:Cassette_1:IOMDNFNB_04497     12.0
Contig_936157.3_62:Cassette_1:IOMDNFNB_04496     12.0
                                                 ... 
Contig_1395103.4_5:Cassette_1:KDHGNBDG_00493      2.0
Contig_28901.3694_7:Cassette_5:HDGJAOOK_01579     2.0
Contig_28901.4009_1:Cassette_2:BKDLJELO_00369     2.0
Contig_28901.3694_7:Cassette_5:HDGJAOOK_01568     2.0
Contig_28901.4009_1:Cassette_2:BKDLJELO_00371     2.0
Name: representatives, Length: 237, dtype: float64

In [14]:
# Flag every sequence whose header is the representative of an mmseqs cluster.
# Series.isin hashes the representatives once, instead of scanning the whole
# representatives array for every single header.
headers_df = headers_df.assign(is_rep=lambda df: df.headers.isin(clusters_df.representatives))
headers_df[headers_df.is_rep].head()

Unnamed: 0,contigs,cassettes,locus_tags,headers,genome_ids,is_rep
0,Contig_936157.3_6,Cassette_1,IOMDNFNB_00323,Contig_936157.3_6:Cassette_1:IOMDNFNB_00323,936157.3,True
1,Contig_936157.3_6,Cassette_1,IOMDNFNB_00324,Contig_936157.3_6:Cassette_1:IOMDNFNB_00324,936157.3,True
2,Contig_936157.3_6,Cassette_1,IOMDNFNB_00325,Contig_936157.3_6:Cassette_1:IOMDNFNB_00325,936157.3,True
3,Contig_936157.3_6,Cassette_1,IOMDNFNB_00326,Contig_936157.3_6:Cassette_1:IOMDNFNB_00326,936157.3,True
5,Contig_936157.3_6,Cassette_1,IOMDNFNB_00328,Contig_936157.3_6:Cassette_1:IOMDNFNB_00328,936157.3,True


In [7]:
def list_annotations_files(annotation_dir: str) -> dict:
    """Map genome ids to the absolute paths of their GenBank annotation files.

    Runs ``find`` under ``annotation_dir`` for regular files named ``*.gbk``.
    The genome id is the file stem with every ``_out`` occurrence removed.

    Raises ``subprocess.CalledProcessError`` if ``find`` exits non-zero and
    ``AssertionError`` if two files resolve to the same genome id.
    """
    find_cmd = ['find', annotation_dir, '-type', 'f', '-name', '*.gbk']
    completed = subprocess.run(find_cmd, check=True, stdout=subprocess.PIPE)
    listed_paths = completed.stdout.decode('UTF-8').splitlines()

    genome_to_path = {}
    for listed in listed_paths:
        gbk_file = Path(listed)
        genome_id = gbk_file.stem.replace('_out', '')
        absolute_path = str(gbk_file.absolute())
        # Two annotation files for the same genome would silently shadow each other.
        assert not genome_to_path.get(genome_id), 'Repeated ids: {}'.format(genome_id)
        genome_to_path[genome_id] = absolute_path

    return genome_to_path

def retrive_features_from_gbk(df, gbk_path):
    """Attach GenBank CDS annotations to the rows of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``locus_tags`` column.
    gbk_path : str or path-like
        GenBank file to read.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with an extra ``features`` column. Each value is a
        ``Feature(locus_tag, gene, product, translation)`` namedtuple, or
        ``None`` when the locus tag has no CDS in the file. A missing gene
        name and the product ``'hypothetical protein'`` are both stored as
        ``'Unk'``; a missing product or translation stays ``None``.
    """
    my_feature = namedtuple('Feature', 'locus_tag gene product translation')
    wanted_locus_tags = set(df.locus_tags.values)

    def first_qualifier(feature, key):
        # Qualifier values are lists; take the first entry, or None if absent.
        return feature.qualifiers.get(key, [None])[0]

    CDSs = {}
    # Hand the already-open handle to the parser so the file is opened once
    # and closed deterministically when the block exits.
    with open(gbk_path) as handle:
        for record in SeqIO.parse(handle, 'genbank'):
            for feature in record.features:
                if feature.type != 'CDS':
                    continue
                locus_tag = first_qualifier(feature, 'locus_tag')
                if locus_tag not in wanted_locus_tags:
                    continue
                gene = first_qualifier(feature, 'gene') or 'Unk'
                product = first_qualifier(feature, 'product')
                if product == 'hypothetical protein':
                    product = 'Unk'
                # A later CDS with the same locus tag overrides an earlier one.
                CDSs[locus_tag] = my_feature(
                    locus_tag,
                    gene,
                    product,
                    first_qualifier(feature, 'translation'),
                )

    # Add features to dataframe
    return df.assign(features=lambda frame: frame.locus_tags.apply(lambda locus_tag: CDSs.get(locus_tag, None)))
                
                
def locus_tag_to_edges(topology: list, drop_duplicates=False):
    """Turn an ordered sequence of node ids into neighbour edges.

    Each element is linked to its predecessor and its successor in
    ``topology``, so an interior element yields two ``(element, neighbour)``
    tuples and the first/last element yields one.

    Parameters
    ----------
    topology : sequence
        Ordered node ids. Ids must be hashable when ``drop_duplicates`` is set.
    drop_duplicates : bool, default False
        When true, keep only the first occurrence of every undirected edge,
        i.e. ``(a, b)`` and ``(b, a)`` are treated as the same edge.

    Returns
    -------
    list of tuple
        The edges in traversal order. Empty for topologies with fewer than
        two elements, because a lone node has no neighbour.
    """
    last_index = len(topology) - 1
    edges = []
    for indx, cluster_id in enumerate(topology):
        # Bounds are checked independently, so a single-element topology
        # produces no edge instead of indexing past the end.
        if indx > 0:
            edges.append((cluster_id, topology[indx - 1]))
        if indx < last_index:
            edges.append((cluster_id, topology[indx + 1]))
    if not drop_duplicates:
        return edges

    seen = set()
    strict_edges = []
    for edge in edges:
        if edge in seen or edge[::-1] in seen:
            continue
        seen.add(edge)
        strict_edges.append(edge)
    return strict_edges
        

In [8]:
# Retrieve feature information (gene name, translation, locus_tag) of the cluster representatives
annotation_files = list_annotations_files(annotation_dir)

# Genomes without a known .gbk file are skipped.
rep_with_features = [
    retrive_features_from_gbk(genome_df, annotation_files.get(genome_id))
    for genome_id, genome_df in headers_df[headers_df.is_rep].groupby('genome_ids')
    if annotation_files.get(genome_id)
]


def get_clusters_ids(df):
    """Build one 'ID_<row index>_<gene>' label per row of ``df``."""
    row_labels = [str(row_index) for row_index in df.index.values]
    gene_names = [fet.gene for fet in df.features]
    return ['ID_{}_{}'.format(label, gene) for label, gene in zip(row_labels, gene_names)]


rep_with_features_df = (
    pd.concat(rep_with_features)
    .reset_index(drop=True)
    .assign(cluster_ids=get_clusters_ids)
)
rep_with_features_df[~rep_with_features_df.cluster_ids.str.contains('Unk')].head()

Unnamed: 0,contigs,cassettes,locus_tags,headers,genome_ids,is_rep,features,cluster_ids
9,Contig_1395103.4_5,Cassette_1,KDHGNBDG_00483,Contig_1395103.4_5:Cassette_1:KDHGNBDG_00483,1395103.4,True,"(KDHGNBDG_00483, rep_2, ATP-dependent DNA heli...",ID_9_rep_2
11,Contig_1395103.4_5,Cassette_1,KDHGNBDG_00487,Contig_1395103.4_5:Cassette_1:KDHGNBDG_00487,1395103.4,True,"(KDHGNBDG_00487, proA_1, Gamma-glutamyl phosph...",ID_11_proA_1
12,Contig_1395103.4_5,Cassette_1,KDHGNBDG_00488,Contig_1395103.4_5:Cassette_1:KDHGNBDG_00488,1395103.4,True,"(KDHGNBDG_00488, proB, Glutamate 5-kinase, MSD...",ID_12_proB
13,Contig_1395103.4_5,Cassette_1,KDHGNBDG_00490,Contig_1395103.4_5:Cassette_1:KDHGNBDG_00490,1395103.4,True,"(KDHGNBDG_00490, crl, Sigma factor-binding pro...",ID_13_crl
15,Contig_1395103.4_5,Cassette_1,KDHGNBDG_00492,Contig_1395103.4_5:Cassette_1:KDHGNBDG_00492,1395103.4,True,"(KDHGNBDG_00492, gpt, Xanthine phosphoribosylt...",ID_15_gpt


In [59]:
# Get nodes and edges
neighborhood_gdf = clusters_df \
        .merge(right=rep_with_features_df.loc[:,['headers', 'cluster_ids']], left_on='representatives', right_on='headers') \
        .drop(['representatives', 'headers'], axis=1) \
        .assign(cassettes = lambda df: tuple(map(lambda member: ':'.join(member.split(':')[:2]), df.members))) \
        .assign(locus_tags = lambda df: tuple(map(lambda member: member.split(':')[2], df.members))) \
        .groupby('cassettes')

In [76]:
edges = []
for cassette, df in neighborhood_gdf:
    # A cassette with a single member has no neighbours, hence no edges.
    if len(df) < 2:
        continue

    # Make sure the locus tags are ordered by their numeric suffix, otherwise
    # we lose the topology (gene neighbourhood) relationships.
    ordered = df.sort_values(
        'locus_tags',
        key=lambda locus_tags: locus_tags.str.split('_').str[1].astype(int),
    )
    cassette_edges = pd.DataFrame(
        data=locus_tag_to_edges(topology=ordered.cluster_ids.values, drop_duplicates=True),
        columns=['source', 'target'],
    )
    edges.append(cassette_edges)

# pd.concat raises on an empty list, so fall back to an empty edge table.
if edges:
    edges_df = pd.concat(edges).drop_duplicates().reset_index(drop=True)
else:
    edges_df = pd.DataFrame(columns=['source', 'target'])

In [66]:
# Bootstrap the Cytoscape connection used by the cells below.
import requests
# SECURITY NOTE(review): this downloads a script from the `master` branch and
# executes it in the notebook namespace; the content is neither pinned nor
# verified. Consider pinning a commit hash or vendoring the file.
exec(requests.get("https://raw.githubusercontent.com/cytoscape/jupyter-bridge/master/client/p4c_init.py").text)
# NOTE(review): `IPython` and `_PY4CYTOSCAPE_BROWSER_CLIENT_JS` are not defined
# in this notebook; presumably the exec'd script provides them - confirm.
# NOTE(review): this bare expression is not the last statement of the cell, so
# Jupyter will not render its value; wrap it in display(...) if the browser
# client really has to be started from here - confirm.
IPython.display.Javascript(_PY4CYTOSCAPE_BROWSER_CLIENT_JS) # Start browser client
import py4cytoscape as p4c
# The ping result is discarded; only the version info (last expression) is displayed.
p4c.cytoscape_ping()
p4c.cytoscape_version_info()



Loading Javascript client ... 855b6e9c-dc35-451e-842a-e055ba8ff0df on https://jupyter-bridge.cytoscape.org
You are connected to Cytoscape!


{'apiVersion': 'v1',
 'cytoscapeVersion': '3.9.1',
 'automationAPIVersion': '1.6.0',
 'py4cytoscapeVersion': '1.5.0'}

In [83]:
# nodes_df = edges_df.loc[:,['source']].rename(columns={'source':'id'})
# p4c.create_network_from_data_frames(nodes_df, edges_df, title="Salmonella Defense Systems", collection="DataFrame Example")

In [82]:
# NOTE(review): presumably removes every network from the connected Cytoscape
# session (destructive) - confirm before using "Run All".
p4c.delete_all_networks()

''

In [100]:
# Collect every merged defense-system prediction table below the prediction directory.
cmd = [
    'find',
    defense_system_prediction_dir,
    '-name', '*merged_defense_systems_prediction.csv',
]
find_result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE)
file_paths = find_result.stdout.decode('UTF-8').splitlines()

# One frame per file, stacked row-wise (each file keeps its own row index).
prediction_tables = [pd.read_csv(path, sep=',') for path in file_paths]
defense_genes_prediction_df = pd.concat(prediction_tables)
defense_genes_prediction_df.head()

Unnamed: 0,locus_tags,names,systems,tool
0,BKDLJELO_00330,MTase_I,DMS_other,padloc
1,BKDLJELO_00330,MTase_I,RM_type_I,padloc
2,BKDLJELO_00331,Specificity_I,DMS_other,padloc
3,BKDLJELO_00331,Specificity_I,RM_type_I,padloc
4,BKDLJELO_00332,REase_I,DMS_other,padloc


In [208]:
format_values = lambda x: '='.join(x)

# Collapse all predictions of a locus tag into one string:
# 'col=value' pairs joined by ',', one group per prediction row joined by ';'.
# NOTE(review): only the first 100 prediction rows are processed (head(100)) -
# confirm whether this limit is intentional.
map_defence_prediction = []
for locus_tag, group in defense_genes_prediction_df.head(100).groupby('locus_tags'):
    labelled = (
        group.fillna('None')
        .drop('locus_tags', axis=1)
        .apply(lambda column: column.name + '=' + column)
    )
    # Columns are emitted in reverse order of the table.
    row_strings = [','.join(row) for row in labelled.iloc[:, ::-1].values]
    map_defence_prediction.append((locus_tag, ';'.join(row_strings)))

map_defence_prediction_df = pd.DataFrame(
    map_defence_prediction,
    columns=['locus_tags', 'predictions'],
).drop_duplicates()
map_defence_prediction_df.tail()

Unnamed: 0,locus_tags,predictions
44,IOMDNFNB_04284,"tool=padloc,systems=druantia_type_II,names=DruK"
45,IOMDNFNB_04490,"tool=padloc,systems=DMS_other,names=Specificit..."
46,IOMDNFNB_04491,"tool=padloc,systems=DMS_other,names=MTase_I;to..."
47,IOMDNFNB_04492,"tool=padloc,systems=DMS_other,names=REase_I;to..."
48,IOMDNFNB_04493,"tool=padloc,systems=DMS_other,names=mREase_IV;..."


In [214]:
# Attach the full FASTA header(s) to every predicted locus tag, then group by
# locus tag; tags without a header are dropped by the inner join.
predictions_with_headers = map_defence_prediction_df.merge(
    headers_df.loc[:, ['locus_tags', 'headers']],
    on='locus_tags',
    how='inner',
)
oi = predictions_with_headers.groupby('locus_tags')

In [220]:
# Show the locus tags that map to more than one header.
for locus_tag, matches in oi:
    if len(matches) <= 1:
        continue
    print(matches)

        locus_tags                                        predictions  \
26  IOMDNFNB_01071  tool=padloc,systems=DMS_other,names=oMTase;too...   
27  IOMDNFNB_01071  tool=padloc,systems=DMS_other,names=oMTase;too...   

                                         headers  
26  Contig_936157.3_23:Cassette_1:IOMDNFNB_01071  
27  Contig_936157.3_23:Cassette_2:IOMDNFNB_01071  
        locus_tags                                        predictions  \
28  IOMDNFNB_01072  tool=padloc,systems=DMS_other,names=REase_II;t...   
29  IOMDNFNB_01072  tool=padloc,systems=DMS_other,names=REase_II;t...   

                                         headers  
28  Contig_936157.3_23:Cassette_1:IOMDNFNB_01072  
29  Contig_936157.3_23:Cassette_2:IOMDNFNB_01072  
        locus_tags                                        predictions  \
30  IOMDNFNB_02069  tool=padloc,systems=druantia_type_III,names=DruH3   
31  IOMDNFNB_02069  tool=padloc,systems=druantia_type_III,names=DruH3   

                                  