In [2]:
from pathlib import Path
import subprocess
from collections import namedtuple

# external modules 
import requests
import pandas as pd
from Bio import SeqIO

In [215]:
def cassettes_headers_to_df(multifasta_proteins):
    """Parse the FASTA headers of a (gzipped) protein multi-FASTA into a DataFrame.

    Headers are extracted with ``zgrep`` and must have exactly three
    ':'-separated fields, e.g. ``Contig_936157.3_6:Cassette_1:IOMDNFNB_00323``.

    Parameters
    ----------
    multifasta_proteins : str
        Path handed to ``zgrep``.

    Returns
    -------
    pandas.DataFrame
        Columns ``contigs``, ``cassettes``, ``locus_tags``, ``headers`` (the
        full header) and ``genome_ids`` (the contig name without its first and
        last '_'-separated tokens).

    Raises
    ------
    subprocess.CalledProcessError
        If ``zgrep`` exits with a non-zero status (``check=True``).
    AssertionError
        If a header does not have exactly three fields.
    """
    # Raw string: '\>' is not a valid Python escape sequence, so the non-raw
    # literal triggers a warning on recent interpreters. The bytes passed to
    # zgrep are unchanged.
    cmd = ['zgrep', '-Po', r'(?<=^\>).+', multifasta_proteins]
    columns = ['contigs', 'cassettes', 'locus_tags', 'headers']

    stdout = subprocess.run(cmd, check=True, stdout=subprocess.PIPE).stdout.decode('UTF-8')

    data = []
    for header in stdout.splitlines():
        fields = header.split(':')
        assert len(fields) == 3, 'Header {} is invalid !'.format(header)
        data.append([*fields, header])

    # 'contigs' can no longer contain ':' (the header was split on it), so the
    # genome id is derived straight from the contig name.
    return pd.DataFrame(data=data, columns=columns).assign(
        genome_ids=lambda df: df['contigs'].apply(lambda contig: '_'.join(contig.split('_')[1:-1]))
    )


                
def get_edges(topology: list, drop_duplicates=True):
    """Build neighbour edges between consecutive entries of an ordered topology.

    Every entry is linked to its predecessor and to its successor, so each
    adjacency is first produced in both directions.

    Parameters
    ----------
    topology : sequence
        Ordered cluster ids (list or array). With ``drop_duplicates=True`` the
        ids must be hashable.
    drop_duplicates : bool, default True
        When True, keep only the first occurrence of each undirected edge
        (``(a, b)`` and ``(b, a)`` count as the same edge).

    Returns
    -------
    list of tuple
        ``(source, target)`` pairs. Empty when the topology has fewer than two
        entries (previously a single entry raised IndexError).
    """
    nodes = list(topology)
    last = len(nodes) - 1

    edges = []
    for indx, cluster_id in enumerate(nodes):
        if indx > 0:
            edges.append((cluster_id, nodes[indx - 1]))
        if indx < last:
            edges.append((cluster_id, nodes[indx + 1]))

    if not drop_duplicates:
        return edges

    # frozenset makes (a, b) and (b, a) compare equal; first-seen order is kept.
    seen = set()
    strict_edges = []
    for edge in edges:
        key = frozenset(edge)
        if key not in seen:
            seen.add(key)
            strict_edges.append(edge)
    return strict_edges

def get_wanted_locus_tags_from_gbk(gbk_path: str, wanted_locus_tags: list):
    """Collect the GenBank features whose ``locus_tag`` is in ``wanted_locus_tags``.

    Parameters
    ----------
    gbk_path : str
        Path of the GenBank file to parse.
    wanted_locus_tags : iterable of str
        Locus tags to keep (a list or a set both work).

    Returns
    -------
    pandas.DataFrame
        Columns ``locus_tags`` and ``features`` (the matching feature objects),
        in file order. Empty, with the same columns, when nothing matches.
    """
    wanted = set(wanted_locus_tags)
    features = []
    # Parse from the handle that is already open, instead of opening the path a
    # second time and leaving this handle unused.
    with open(gbk_path) as handle:
        for record in SeqIO.parse(handle, 'genbank'):
            for feature in record.features:
                # Features without a locus_tag qualifier yield None here.
                locus_tag = feature.qualifiers.get('locus_tag', [None])[0]
                if locus_tag in wanted:
                    features.append((locus_tag, feature))
    return pd.DataFrame(data=features, columns=['locus_tags', 'features'])
    
def list_gbk_files(annotation_dir: str) -> dict:
    """Index every ``*.gbk`` file found under ``annotation_dir`` by genome id.

    The search is delegated to the ``find`` command. The genome id is the file
    stem with every ``_out`` occurrence removed.

    Returns
    -------
    dict
        ``{genome_id: absolute_path}``.

    Raises
    ------
    subprocess.CalledProcessError
        If ``find`` exits with a non-zero status.
    AssertionError
        If two files resolve to the same genome id.
    """
    find_cmd = ['find', annotation_dir, '-type', 'f', '-name', '*.gbk']
    completed = subprocess.run(find_cmd, check=True, stdout=subprocess.PIPE)

    gbk_by_genome = {}
    for line in completed.stdout.decode('UTF-8').splitlines():
        gbk = Path(line)
        genome_id = gbk.stem.replace('_out', '')
        assert not gbk_by_genome.get(genome_id), 'Repeated ids: {}'.format(genome_id)
        gbk_by_genome[genome_id] = str(gbk.absolute())

    return gbk_by_genome

def get_feature_df(genome_filters: dict, annotation_dir: str):
    """Gather the wanted GenBank features of several genomes into one DataFrame.

    Parameters
    ----------
    genome_filters : dict
        ``{genome_id: locus_tags}`` — the locus tags to extract per genome.
    annotation_dir : str
        Directory searched for ``*.gbk`` files by ``list_gbk_files``.

    Returns
    -------
    pandas.DataFrame
        Columns ``locus_tags`` and ``features`` with a fresh 0..n-1 index.
        Empty, with the same columns, when nothing was requested or found.

    Raises
    ------
    KeyError
        If a genome id in ``genome_filters`` has no GenBank file.
    """
    genome_paths: dict = list_gbk_files(annotation_dir)

    # Collect per-genome frames and concatenate once; concatenating inside the
    # loop copies the accumulated rows on every iteration.
    frames = []
    for genome_id, locus_tags in genome_filters.items():
        df = get_wanted_locus_tags_from_gbk(gbk_path=genome_paths[genome_id], wanted_locus_tags=locus_tags)
        if not df.empty:
            frames.append(df)

    if not frames:
        # Keep the expected schema so callers can still access `.features`.
        return pd.DataFrame(columns=['locus_tags', 'features'])

    return pd.concat(frames).reset_index(drop=True)
        
            
            
def get_feature_string(feature):
    """Serialise selected qualifiers of a feature as ``key=value`` pairs joined by ';'.

    The qualifiers ``gene``, ``locus_tag``, ``product`` and ``translation`` are
    emitted in that order; the first value of each is used and a missing
    qualifier is written as ``NA``.
    """
    parts = []
    for field in ('gene', 'locus_tag', 'product', 'translation'):
        value = feature.qualifiers.get(field, ['NA'])[0]
        parts.append('='.join((field, value)))
    return ';'.join(parts)

    
    
    


In [3]:
# Input locations, relative to the notebook's working directory.
# Gzipped protein multi-FASTA whose headers are parsed by cassettes_headers_to_df.
multifasta_proteins = '../../results/cassettes/clusters/merged_cassette_proteins.faa.gz'
# Tab-separated, header-less cluster table; column 0 is renamed to 'representatives', column 1 to 'members'.
clusters = '../../results/cassettes/clusters/mmseqs2_cluster.tsv'
# Directory searched recursively for *.gbk annotation files.
annotation_dir = '../../results/Prokka/'
# Directory searched for *merged_defense_systems_prediction.csv files.
defense_system_prediction_dir = '../../results/cassettes/extracted/'

In [4]:
# Parse the protein FASTA headers and load the representative/member cluster table.
headers_df = cassettes_headers_to_df(multifasta_proteins=multifasta_proteins)
clusters_df = (
    pd.read_csv(clusters, sep='\t', header=None)
    .rename(columns={0: 'representatives', 1: 'members'})
)

In [5]:
# Member count of every mmseqs cluster that holds more than one sequence, largest first.
# (The "Length" shown at the bottom of the output is the number of such clusters.)
(
    clusters_df
    .groupby('representatives')['representatives']
    .count()
    .sort_values(ascending=False)
    .where(lambda sizes: sizes > 1)
    .dropna()
)

representatives
Contig_1395103.4_47:Cassette_1:KDHGNBDG_03652    14.0
Contig_936157.3_23:Cassette_1:IOMDNFNB_01075     12.0
Contig_936157.3_23:Cassette_1:IOMDNFNB_01077     12.0
Contig_936157.3_62:Cassette_1:IOMDNFNB_04497     12.0
Contig_936157.3_62:Cassette_1:IOMDNFNB_04496     12.0
                                                 ... 
Contig_1395103.4_5:Cassette_1:KDHGNBDG_00493      2.0
Contig_28901.3694_7:Cassette_5:HDGJAOOK_01579     2.0
Contig_28901.4009_1:Cassette_2:BKDLJELO_00369     2.0
Contig_28901.3694_7:Cassette_5:HDGJAOOK_01568     2.0
Contig_28901.4009_1:Cassette_2:BKDLJELO_00371     2.0
Name: representatives, Length: 237, dtype: float64

In [14]:
# Flag every header that is an mmseqs cluster representative.
# Series.isin does one hashed lookup instead of scanning the whole
# representatives array once per header.
headers_df = headers_df.assign(is_rep=lambda df: df.headers.isin(clusters_df.representatives))
headers_df[headers_df.is_rep].head()

Unnamed: 0,contigs,cassettes,locus_tags,headers,genome_ids,is_rep
0,Contig_936157.3_6,Cassette_1,IOMDNFNB_00323,Contig_936157.3_6:Cassette_1:IOMDNFNB_00323,936157.3,True
1,Contig_936157.3_6,Cassette_1,IOMDNFNB_00324,Contig_936157.3_6:Cassette_1:IOMDNFNB_00324,936157.3,True
2,Contig_936157.3_6,Cassette_1,IOMDNFNB_00325,Contig_936157.3_6:Cassette_1:IOMDNFNB_00325,936157.3,True
3,Contig_936157.3_6,Cassette_1,IOMDNFNB_00326,Contig_936157.3_6:Cassette_1:IOMDNFNB_00326,936157.3,True
5,Contig_936157.3_6,Cassette_1,IOMDNFNB_00328,Contig_936157.3_6:Cassette_1:IOMDNFNB_00328,936157.3,True


In [8]:
# Retrieve feature information (gene name, translation, locus_tag) for the cluster representatives.
# NOTE(review): `list_annotations_files` and `retrive_features_from_gbk` are not defined anywhere
# in the visible notebook, so this cell fails on a fresh kernel unless they come from elsewhere —
# TODO confirm whether `list_gbk_files` was meant, or remove this cell.
annotation_files = list_annotations_files(annotation_dir)

rep_with_features = []
for genome, df in headers_df[headers_df.is_rep].groupby('genome_ids'):
    gbk_path = annotation_files.get(genome)
    # Genomes without an annotation file are skipped silently.
    if gbk_path:
        rep_with_features.append(retrive_features_from_gbk(df, gbk_path))
    
# Cluster id = 'ID_<row index>_<gene attribute of the feature>'.
get_clusters_ids = lambda df: list(map(lambda x: 'ID_{}_{}'.format(*x),zip(list(map(str, df.index.values)), list(map(lambda fet: fet.gene, df.features)))))
rep_with_features_df = pd.concat(rep_with_features).reset_index(drop=True).assign(cluster_ids = lambda df: get_clusters_ids(df))
# Preview only the rows whose cluster id does not contain 'Unk'.
rep_with_features_df[~rep_with_features_df.cluster_ids.str.contains('Unk')].head()

Unnamed: 0,contigs,cassettes,locus_tags,headers,genome_ids,is_rep,features,cluster_ids
9,Contig_1395103.4_5,Cassette_1,KDHGNBDG_00483,Contig_1395103.4_5:Cassette_1:KDHGNBDG_00483,1395103.4,True,"(KDHGNBDG_00483, rep_2, ATP-dependent DNA heli...",ID_9_rep_2
11,Contig_1395103.4_5,Cassette_1,KDHGNBDG_00487,Contig_1395103.4_5:Cassette_1:KDHGNBDG_00487,1395103.4,True,"(KDHGNBDG_00487, proA_1, Gamma-glutamyl phosph...",ID_11_proA_1
12,Contig_1395103.4_5,Cassette_1,KDHGNBDG_00488,Contig_1395103.4_5:Cassette_1:KDHGNBDG_00488,1395103.4,True,"(KDHGNBDG_00488, proB, Glutamate 5-kinase, MSD...",ID_12_proB
13,Contig_1395103.4_5,Cassette_1,KDHGNBDG_00490,Contig_1395103.4_5:Cassette_1:KDHGNBDG_00490,1395103.4,True,"(KDHGNBDG_00490, crl, Sigma factor-binding pro...",ID_13_crl
15,Contig_1395103.4_5,Cassette_1,KDHGNBDG_00492,Contig_1395103.4_5:Cassette_1:KDHGNBDG_00492,1395103.4,True,"(KDHGNBDG_00492, gpt, Xanthine phosphoribosylt...",ID_15_gpt


In [59]:
# Get nodes and edges: attach each member to its representative's cluster id,
# derive the cassette ('<contig>:<cassette>') and locus tag from the member
# header, then group the members by cassette.
neighborhood_gdf = (
    clusters_df
    .merge(
        right=rep_with_features_df[['headers', 'cluster_ids']],
        left_on='representatives',
        right_on='headers',
    )
    .drop(columns=['representatives', 'headers'])
    .assign(
        cassettes=lambda df: tuple(':'.join(member.split(':')[:2]) for member in df.members),
        locus_tags=lambda df: tuple(member.split(':')[2] for member in df.members),
    )
    .groupby('cassettes')
)

In [76]:
# Build one edge table per cassette, then merge them into `edges_df`.
# NOTE(review): `locus_tag_to_edges` is not defined in the visible notebook —
# presumably an earlier name of `get_edges`; TODO confirm.
edges = []
for cassette, df in neighborhood_gdf:
    # Make sure the locus tags are ordered (by the integer after the first '_'),
    # otherwise the topology relationships between neighbouring genes are lost.
    
    df = df.sort_values(['locus_tags'], key=lambda locus_tags: list(map(lambda x: int(x[1]), locus_tags.str.split('_').values)))
    cassette_edges = pd.DataFrame(data=locus_tag_to_edges(topology=df.loc[:,['cluster_ids']].cluster_ids.values, drop_duplicates=True), columns=['source', 'target'])
    
    # edges.append(pd.merge(
    #     left=cassette_edges,
    #     right=rep_with_features_df.loc[:,['cluster_ids', 'features']].assign(features = lambda df: list(map(lambda fet: str(fet).replace('Feature(', '').replace(')', '').replace(',',';').replace(' ',''), df.features))),
    #     left_on='sources',
    #     right_on='cluster_ids',
    #     how='inner'
    # ).drop('cluster_ids', axis=1).rename(columns={'features':'source_features'}))
    
    edges.append(cassette_edges)
    

# Identical (source, target) rows coming from different cassettes are collapsed.
edges_df = pd.concat(edges).drop_duplicates().reset_index(drop=True)

In [66]:
# Connect this notebook to a running Cytoscape instance through Jupyter-Bridge.
# SECURITY NOTE(review): the next statement downloads a script from the `master`
# branch of a GitHub repository and executes it unverified with exec(); consider
# pinning a commit or vendoring the script. The request also has no timeout.
import requests
exec(requests.get("https://raw.githubusercontent.com/cytoscape/jupyter-bridge/master/client/p4c_init.py").text)
# `IPython` and `_PY4CYTOSCAPE_BROWSER_CLIENT_JS` are not defined in this notebook —
# presumably they are created by the script executed above; TODO confirm.
IPython.display.Javascript(_PY4CYTOSCAPE_BROWSER_CLIENT_JS) # Start browser client
import py4cytoscape as p4c
p4c.cytoscape_ping()
p4c.cytoscape_version_info()



Loading Javascript client ... 855b6e9c-dc35-451e-842a-e055ba8ff0df on https://jupyter-bridge.cytoscape.org
You are connected to Cytoscape!


{'apiVersion': 'v1',
 'cytoscapeVersion': '3.9.1',
 'automationAPIVersion': '1.6.0',
 'py4cytoscapeVersion': '1.5.0'}

In [83]:
# nodes_df = edges_df.loc[:,['source']].rename(columns={'source':'id'})
# p4c.create_network_from_data_frames(nodes_df, edges_df, title="Salmonella Defense Systems", collection="DataFrame Example")

In [82]:
# Remove every network from the connected Cytoscape session before building a new one.
# NOTE(review): presumably destructive for unrelated networks open in that session — run deliberately.
p4c.delete_all_networks()

''

In [100]:
# Locate every merged defense-system prediction table and stack them into one frame.
cmd = [
    'find', defense_system_prediction_dir,
    '-name', '*merged_defense_systems_prediction.csv',
]
file_paths = (
    subprocess.run(cmd, check=True, stdout=subprocess.PIPE)
    .stdout.decode('UTF-8')
    .splitlines()
)
defense_genes_prediction_df = pd.concat([pd.read_csv(path, sep=',') for path in file_paths])
defense_genes_prediction_df.head()

Unnamed: 0,locus_tags,names,systems,tool
0,BKDLJELO_00330,MTase_I,DMS_other,padloc
1,BKDLJELO_00330,MTase_I,RM_type_I,padloc
2,BKDLJELO_00331,Specificity_I,DMS_other,padloc
3,BKDLJELO_00331,Specificity_I,RM_type_I,padloc
4,BKDLJELO_00332,REase_I,DMS_other,padloc


In [208]:
# Collapse the prediction rows of each locus tag into a single string:
# rows are joined by ';', and within a row the '<column>=<value>' pairs are
# joined by ',' with the column order reversed.
# NOTE(review): `format_values` is not used anywhere in the visible notebook.
format_values = lambda x: '='.join(x)

map_defence_prediction = []
# NOTE(review): `.head(100)` restricts the mapping to the first 100 prediction rows —
# looks like a debugging leftover; TODO confirm before relying on the result.
for locus_tag, df in defense_genes_prediction_df.head(100).groupby('locus_tags'):
    # Missing values become the literal string 'None' so string concatenation below works.
    df = df.fillna('None')
    prediction_string = ';'.join(list(map(lambda x: ','.join(x),df.drop('locus_tags', axis=1).apply(lambda x: x .name + '=' + x).iloc[:,::-1].values)))
    map_defence_prediction.append((locus_tag, prediction_string))

map_defence_prediction_df = pd.DataFrame(map_defence_prediction, columns=['locus_tags', 'predictions']).drop_duplicates()
map_defence_prediction_df.tail()

Unnamed: 0,locus_tags,predictions
44,IOMDNFNB_04284,"tool=padloc,systems=druantia_type_II,names=DruK"
45,IOMDNFNB_04490,"tool=padloc,systems=DMS_other,names=Specificit..."
46,IOMDNFNB_04491,"tool=padloc,systems=DMS_other,names=MTase_I;to..."
47,IOMDNFNB_04492,"tool=padloc,systems=DMS_other,names=REase_I;to..."
48,IOMDNFNB_04493,"tool=padloc,systems=DMS_other,names=mREase_IV;..."


In [214]:
# Attach the full FASTA header to every predicted defense gene, grouped per locus tag.
oi = pd.merge(
    map_defence_prediction_df,
    headers_df[['locus_tags', 'headers']],
    on='locus_tags',
    how='inner',
).groupby('locus_tags')

## NEW

In [53]:
# Collapse the cluster member table into one row per cluster representative.
cluster_members = pd.read_csv(clusters, sep='\t', header=None).rename(columns={0: 'representatives', 1: 'members'})

cluster_records = []
for rep, df in cluster_members.groupby('representatives'):
    members = df.members.values
    # A cluster holds a gene of interest when any member header carries the 'GOI_True' tag.
    is_GOI = any('GOI_True' in header for header in members)
    cluster_records.append((rep, members, len(members), is_GOI))

# Build the frame once from the collected records; concatenating inside the
# loop copies all previous rows on every iteration.
clusters_df = pd.DataFrame(
    cluster_records,
    columns=['representatives', 'members', 'members_count', 'is_GOI'],
).set_index('representatives')

clusters_df.sort_values(['members_count'], ascending=False).query('is_GOI').head()

Unnamed: 0_level_0,members,members_count,is_GOI
representatives,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Contig_1395103.4_47:Cassette_1:GOI_True:KDHGNBDG_03652,[Contig_1395103.4_47:Cassette_1:GOI_True:KDHGN...,12,True
Contig_936157.3_6:Cassette_1:GOI_True:IOMDNFNB_00334,[Contig_936157.3_6:Cassette_1:GOI_True:IOMDNFN...,10,True
Contig_936157.3_23:Cassette_1:GOI_True:IOMDNFNB_01072,[Contig_936157.3_23:Cassette_1:GOI_True:IOMDNF...,10,True
Contig_936157.3_6:Cassette_1:GOI_True:IOMDNFNB_00333,[Contig_936157.3_6:Cassette_1:GOI_True:IOMDNFN...,10,True
Contig_590.17131_2:Cassette_2:GOI_True:HOHECEAN_01191,[Contig_590.17131_2:Cassette_2:GOI_True:HOHECE...,8,True


In [81]:
# Map each genome id to the set of representative locus tags to fetch from its GenBank file.
# The representative header is expected to be '<contig>:<cassette>:<GOI flag>:<locus_tag>';
# the genome id is the contig name without the 'Contig_' prefix and without its last '_' token.
# Aggregating the locus_tags column directly avoids reading the grouping column
# positionally inside groupby.apply, which recent pandas versions deprecate.
genome_filters: dict = (
    clusters_df.index.to_series()
    .str.split(':', expand=True)
    .iloc[:, [0, -1]]
    .rename(columns={0: 'genome_id', 3: 'locus_tags'})
    .assign(
        genome_id=lambda df: df.genome_id.str.replace('Contig_', '').apply(
            lambda g_id: '_'.join(g_id.split('_')[:-1])
        )
    )
    .groupby('genome_id')['locus_tags']
    .apply(set)
    .to_dict()
)

In [123]:
# Fetch the wanted GenBank features and keep them as {locus_tag: 'key=value;...' string}.
features = (
    get_feature_df(genome_filters=genome_filters, annotation_dir=annotation_dir)
    .assign(features=lambda df: df.features.apply(get_feature_string))
    .set_index('locus_tags', drop=True)
    ['features']
    .to_dict()
)

In [135]:
# Annotate every cluster with its representative's feature string ('NA' when the
# locus tag was not found) and derive a cluster id '<locus_tag>_<gene name>'.
clusters_df = clusters_df.assign(
    features=lambda df: (
        df.index.to_series()
        .str.split(':', expand=True)
        .iloc[:, -1]
        .apply(lambda locus_tag: features.get(locus_tag, 'NA'))
    ),
    cluster_ids=lambda df: (
        df.index.to_series().str.split(':', expand=True).iloc[:, -1]
        + '_'
        + df.features.str.split(';', expand=True).iloc[:, 0].str.replace('gene=', '')
    ),
)

In [142]:
# Preview the cluster table after the 'features' and 'cluster_ids' columns were added.
clusters_df.head()

Unnamed: 0_level_0,members,members_count,is_GOI,features,cluster_ids
representatives,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Contig_1395103.4_28:Cassette_1:GOI_False:KDHGNBDG_02934,[Contig_1395103.4_28:Cassette_1:GOI_False:KDHG...,1,False,gene=prpD;locus_tag=KDHGNBDG_02934;product=2-m...,KDHGNBDG_02934_prpD
Contig_1395103.4_28:Cassette_1:GOI_False:KDHGNBDG_02935,[Contig_1395103.4_28:Cassette_1:GOI_False:KDHG...,1,False,gene=prpC;locus_tag=KDHGNBDG_02935;product=2-m...,KDHGNBDG_02935_prpC
Contig_1395103.4_28:Cassette_1:GOI_False:KDHGNBDG_02947,[Contig_1395103.4_28:Cassette_1:GOI_False:KDHG...,1,False,gene=NA;locus_tag=KDHGNBDG_02947;product=hypot...,KDHGNBDG_02947_NA
Contig_1395103.4_28:Cassette_1:GOI_False:KDHGNBDG_02948,[Contig_1395103.4_28:Cassette_1:GOI_False:KDHG...,1,False,gene=NA;locus_tag=KDHGNBDG_02948;product=hypot...,KDHGNBDG_02948_NA
Contig_1395103.4_28:Cassette_1:GOI_False:KDHGNBDG_02949,[Contig_1395103.4_28:Cassette_1:GOI_False:KDHG...,1,False,gene=NA;locus_tag=KDHGNBDG_02949;product=Elect...,KDHGNBDG_02949_NA


In [158]:
# Node table: one row per cluster, keyed by 'id', with a 'group' label
# ('GOI' for gene-of-interest clusters, 'NA' otherwise).
nodes_df = (
    clusters_df[['cluster_ids', 'members_count', 'is_GOI', 'features']]
    .rename(columns={'cluster_ids': 'id'})
    .reset_index(drop=True)
    .assign(group=lambda df: ['GOI' if flag else 'NA' for flag in df.is_GOI])
)

nodes_df.head()

Unnamed: 0,id,members_count,is_GOI,features,group
0,KDHGNBDG_02934_prpD,1,False,gene=prpD;locus_tag=KDHGNBDG_02934;product=2-m...,
1,KDHGNBDG_02935_prpC,1,False,gene=prpC;locus_tag=KDHGNBDG_02935;product=2-m...,
2,KDHGNBDG_02947_NA,1,False,gene=NA;locus_tag=KDHGNBDG_02947;product=hypot...,
3,KDHGNBDG_02948_NA,1,False,gene=NA;locus_tag=KDHGNBDG_02948;product=hypot...,
4,KDHGNBDG_02949_NA,1,False,gene=NA;locus_tag=KDHGNBDG_02949;product=Elect...,


In [195]:
import itertools

# Map every member locus tag to the id of the cluster it belongs to.
ids_df = (
    clusters_df[['cluster_ids', 'members']]
    .reset_index(drop=True)
    .explode('members')
    .assign(locus_tags=lambda df: df.members.str.split(':', expand=True).iloc[:, [-1]].values)
    .drop(columns=['members'])
)

# One row per member header, split on ':' into contig / cassette / GOI flag /
# locus tag, joined with its cluster id and grouped per (contig, cassette).
cassettes_gdf = (
    pd.Series(data=list(itertools.chain.from_iterable(clusters_df.members)), name='members')
    .str.split(':', expand=True)
    .rename(columns={0: 'contigs', 1: 'cassettes', 2: 'GOI', 3: 'locus_tags'})
    .merge(ids_df, on='locus_tags', how='left')
    .groupby(['contigs', 'cassettes'])
)

In [219]:

# Build the edge table: (source, target) cluster-id pairs between neighbouring
# genes of every cassette. Per-cassette frames are collected and concatenated
# once instead of growing a DataFrame inside the loop.
cassette_edge_frames = []

for cassette, df in cassettes_gdf:
    # A cassette with a single gene has no neighbours, hence no edges.
    if len(df) < 2:
        continue
    # Make sure the locus tags are sorted by their numeric suffix, so that
    # consecutive rows are neighbouring genes.
    df = df.sort_values('locus_tags', key = lambda locus_tags: locus_tags.apply(lambda locus_tag: int(locus_tag.split('_')[-1]) ))
    cassette_edges = pd.DataFrame(data=get_edges(df.cluster_ids.values), columns=['source', 'target'])
    if not cassette_edges.empty:
        cassette_edge_frames.append(cassette_edges)

if cassette_edge_frames:
    edges = pd.concat(cassette_edge_frames).reset_index(drop=True)
else:
    # Keep the expected columns even when no cassette produced an edge.
    edges = pd.DataFrame(columns=['source', 'target'])
edges.head()

Unnamed: 0,source,target
0,KDHGNBDG_02934_prpD,KDHGNBDG_02935_prpC
1,KDHGNBDG_02935_prpC,IOMDNFNB_00344_prpB
2,IOMDNFNB_00344_prpB,IOMDNFNB_00343_prpR
3,IOMDNFNB_00343_prpR,IOMDNFNB_00342_NA
4,IOMDNFNB_00342_NA,IOMDNFNB_00341_rhtC_1


In [229]:
# Exploratory: adding the two object arrays concatenates the id strings element-wise,
# so each 'count' cell holds [source+target, target+source] — a pair of direction-specific
# string keys, not a numeric count. The result is only displayed, not stored.
# NOTE(review): presumably a first step towards counting how often each undirected edge occurs — TODO confirm.
edges.assign(count = lambda df: list(df[['source', 'target']].values + df[['target', 'source']].values) ).head()

Unnamed: 0,source,target,count
0,KDHGNBDG_02934_prpD,KDHGNBDG_02935_prpC,"[KDHGNBDG_02934_prpDKDHGNBDG_02935_prpC, KDHGN..."
1,KDHGNBDG_02935_prpC,IOMDNFNB_00344_prpB,"[KDHGNBDG_02935_prpCIOMDNFNB_00344_prpB, IOMDN..."
2,IOMDNFNB_00344_prpB,IOMDNFNB_00343_prpR,"[IOMDNFNB_00344_prpBIOMDNFNB_00343_prpR, IOMDN..."
3,IOMDNFNB_00343_prpR,IOMDNFNB_00342_NA,"[IOMDNFNB_00343_prpRIOMDNFNB_00342_NA, IOMDNFN..."
4,IOMDNFNB_00342_NA,IOMDNFNB_00341_rhtC_1,"[IOMDNFNB_00342_NAIOMDNFNB_00341_rhtC_1, IOMDN..."
