In [2]:
def pickle_to_dict(fn: str) -> dict:
    """Deserialize the pickle stored at ``fn`` and return the loaded object.

    The file is opened in binary mode and closed again before returning.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    from a trusted source.
    """
    with open(fn, mode='rb') as handle:
        loaded = pickle.load(handle)
    return loaded

def get_protein_id_from_gff_info(info: str) -> str:
    """Return the ``locus_tag`` value from a GFF attributes string.

    ``info`` is expected to look like ``key1=value1;key2=value2;...``.
    Despite the function name, the value returned is the one stored under
    the ``locus_tag`` key.

    Parameters
    ----------
    info:
        The ';'-separated ``key=value`` attribute string of one GFF record.

    Returns
    -------
    The ``locus_tag`` value, or ``None`` when the key is absent. If the key
    occurs more than once the last occurrence wins.
    """
    attributes = {}
    for field in info.split(';'):
        # Split on the first '=' only so values that themselves contain '='
        # stay intact, and skip fragments without any '=' (e.g. the empty
        # string produced by a trailing ';') instead of crashing.
        key, separator, value = field.partition('=')
        if not separator:
            continue
        attributes[key] = value
    return attributes.get('locus_tag')

def parse_merged_gff(fn: str) -> Generator[tuple[str, str, str], None, None]:
    """Lazily parse a merged GFF file into ``(assembly, contig, protein_id)`` tuples.

    Each non-comment line is split on tabs. The first field must have the
    form ``<assembly>_<contig>`` (exactly one underscore) and the last field
    is the attribute string handed to ``get_protein_id_from_gff_info``.

    Parameters
    ----------
    fn:
        Path of the merged GFF file.

    Yields
    ------
    ``(assembly, contig, protein_id)`` for every record, in file order.
    ``protein_id`` is whatever ``get_protein_id_from_gff_info`` returns and
    may therefore be ``None``.

    Raises
    ------
    ValueError
        If a record has fewer than two tab-separated fields or its first
        field does not contain exactly one underscore.
    """
    with open(fn) as f:
        for raw_line in f:
            line = raw_line.strip()
            # Skip blank lines and '#' header/comment lines. Iterating the
            # file directly (instead of looping while the line is truthy)
            # means a blank line no longer silently ends the parse.
            if not line or line.startswith('#'):
                continue

            seqid, *_, info = line.split('\t')
            protein_id = get_protein_id_from_gff_info(info)
            assembly, contig = seqid.split('_')
            yield (assembly, contig, protein_id)





In [22]:
def mmseqs_to_dict(fn: str) -> dict:
    """Load a two-column, tab-separated file into a dict (column 1 -> column 2).

    Every line is stripped of surrounding whitespace and split on tabs.
    Blank lines (for example a trailing empty line) are skipped. When a key
    occurs more than once the last line wins.

    Raises
    ------
    ValueError
        If a non-blank line does not contain exactly two tab-separated fields.
    """
    with open(fn, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        return dict(line.split('\t') for line in stripped_lines if line)

# Earlier variant that read the assignments from a pickle; disabled, kept for reference.
# cluster_assignments = pickle_to_dict('/home/hugo/projects/icetea/results/mmseqs/CLUSTERS_REPS_unique.pickle')
# Load the two-column TSV into a dict (first column -> second column).
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
cluster_assignments = mmseqs_to_dict('/home/hugo/projects/icetea/results/mmseqs/CLUSTERS_REPS_unique.tsv')

In [104]:
from itertools import islice
from collections import namedtuple
import pandas as pd


# Lazy generator of (assembly, contig, protein_id) tuples; nothing is read until it is consumed.
all_cds = parse_merged_gff('/home/hugo/projects/icetea/results/gff/all.cds.gff.sample.out')

# Replace the last element of every tuple with its lookup in cluster_assignments
# (None when the id is not a key). This is a one-shot iterator: once consumed,
# this cell must be re-run before it can be used again.
add_cluster_assignments = map(lambda x: (*x[:-1], cluster_assignments.get(x[-1])), all_cds)

def get_edges(locus_tags: tuple) -> tuple:
    """Turn an ordered sequence of ids into numbered edges between neighbours.

    Consecutive elements are paired up. A pair is dropped when both members
    are equal or when either member is falsy (e.g. ``None``). The surviving
    pairs are numbered 1, 2, 3, ... in order, so the number is the rank among
    the *kept* pairs, not the index of the pair in the input.

    Parameters
    ----------
    locus_tags:
        Sliceable sequence (tuple or list) of ids in on-contig order.

    Returns
    -------
    A tuple of ``(source, target, position)`` triples; empty when no pair
    survives.
    """
    numbered_edges = []
    for source, target in zip(locus_tags, locus_tags[1:]):
        if source == target or not (source and target):
            continue
        numbered_edges.append((source, target, len(numbered_edges) + 1))
    return tuple(numbered_edges)



    # return series.apply(lambda x:  )

# Build one row per edge (pair of neighbouring cluster ids) on every contig.
cds_data = (
    pd.DataFrame(
        # islice(add_cluster_assignments, 1000),
        add_cluster_assignments,
        columns=['assembly', 'contig', 'cluster_id'],
    )
    # Collect the cluster ids of each contig into one tuple, in input order.
    .groupby(['assembly', 'contig'])
    .agg(tuple)
    # A contig needs at least two entries to form an edge.
    .loc[lambda df_: df_.cluster_id.apply(len).gt(1)]
    .rename(columns={'cluster_id': 'edge'})
    .assign(
        edge = lambda x: x.edge.apply(get_edges),
    )
    .explode('edge')
    # explode() turns an empty tuple into NaN; without this drop the
    # subscripting below raises TypeError for contigs whose pairs were all
    # filtered out by get_edges.
    .dropna(subset=['edge'])
    # Unpack the (source, target, position) triple into separate columns.
    .assign(
        source = lambda x: x.edge.apply(lambda x: x[0]),
        target = lambda x: x.edge.apply(lambda x: x[1]),
        contig_edge_position = lambda x: x.edge.apply(lambda x: x[2]),
    )
    .drop(columns='edge')
    .reset_index()
    .loc[:, ['source', 'target', 'assembly', 'contig', 'contig_edge_position']]
    # .set_index(['source', 'target'])
    # .agg(tuple, axis=1)
    # .groupby(['source', 'target'])
    # .agg(tuple)
)

cds_data

Unnamed: 0,source,target,assembly,contig,contig_edge_position
0,CKDALPML_03566,LHMLBOEB_02289,1243621.3,1,1
1,LHMLBOEB_02289,PEEGAIPP_04538,1243621.3,1,2
2,PEEGAIPP_04538,LGJOCDDL_01219,1243621.3,1,3
3,LGJOCDDL_01219,OGKIMEBI_00005,1243621.3,1,4
4,OGKIMEBI_00005,GCDOEBIL_03948,1243621.3,1,5
...,...,...,...,...,...
505883,PFKGKOIO_03960,JOKGDMAP_04363,997338.3,99,47
505884,JOKGDMAP_04363,LELPPPGI_04228,997338.3,99,48
505885,LELPPPGI_04228,BHIIONFH_05865,997338.3,99,49
505886,BHIIONFH_05865,GEOMEPHB_03958,997338.3,99,50


In [3]:
def mmseqs_to_dict(fn: str) -> dict:
    """Load a two-column, tab-separated file into a dict (column 1 -> column 2).

    Every line is stripped of surrounding whitespace and split on tabs.
    Blank lines (for example a trailing empty line) are skipped. When a key
    occurs more than once the last line wins.

    Raises
    ------
    ValueError
        If a non-blank line does not contain exactly two tab-separated fields.
    """
    with open(fn, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        return dict(line.split('\t') for line in stripped_lines if line)
    
def get_edges(locus_tags: tuple) -> tuple:
    """Turn an ordered sequence of ids into numbered edges between neighbours.

    Consecutive elements are paired up. A pair is dropped when both members
    are equal or when either member is falsy (e.g. ``None``). The surviving
    pairs are numbered 1, 2, 3, ... in order, so the number is the rank among
    the *kept* pairs, not the index of the pair in the input.

    Parameters
    ----------
    locus_tags:
        Sliceable sequence (tuple or list) of ids in on-contig order.

    Returns
    -------
    A tuple of ``(source, target, position)`` triples; empty when no pair
    survives.
    """
    numbered_edges = []
    for source, target in zip(locus_tags, locus_tags[1:]):
        if source == target or not (source and target):
            continue
        numbered_edges.append((source, target, len(numbered_edges) + 1))
    return tuple(numbered_edges)


# Load the two-column TSV into a dict (first column -> second column).
cluster_assignments = mmseqs_to_dict('/home/hugo/projects/icetea/results/mmseqs/CLUSTERS_REPS_unique.tsv')

# Lazy generator of (assembly, contig, protein_id) tuples.
all_cds = parse_merged_gff('/home/hugo/projects/icetea/results/gff/all.cds.gff.sample.out')

# Swap the last element of every tuple for its cluster_assignments lookup (None when missing).
add_cluster_assignments = map(lambda x: (*x[:-1], cluster_assignments.get(x[-1])), all_cds)

# First pass: (assembly, contig) -> list of cluster ids in file order.
get_contigs = defaultdict(list)

for i in add_cluster_assignments:
    get_contigs[i[:-1]].append(i[-1])

# Second pass: the same name is rebound to (assembly, contig) -> tuple of
# (source, target, position) triples produced by get_edges.
get_contigs = {k: get_edges(v) for k, v in get_contigs.items()}

# Invert the mapping: (source, target) -> list of (assembly, contig, position)
# occurrences of that edge.
edges =  defaultdict(list)

for k, v in get_contigs.items():
    for edge in v:
        edges[edge[:-1]].append((*k, edge[-1]))

In [4]:
# Undirected graph whose edges are the (source, target) keys of `edges`.
G = nx.Graph()

# Add edges and attributes
# Each edge carries an 'info' attribute: the tuple of (assembly, contig, position)
# occurrences collected for that (source, target) key.
G.add_edges_from(map(lambda x: (*x[0], {'info' : tuple(x[1])}), edges.items()))



In [1]:
# native
from typing import Generator
import pickle
from collections import defaultdict
from functools import partial
from itertools import chain

# third party
import networkx as nx
import pandas as pd

# Load a previously pickled object into G; later cells use it as a networkx graph.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
with open('/home/hugo/projects/icetea/results/salmonella.pickle', 'rb') as f:
    G = pickle.load(f)


In [2]:
    
def read_mmseqs_search_output(
    fn: str, usecols: str = "query,target,pident,qcov,tcov"
) -> pd.DataFrame:
    """Read a header-less, tab-separated search table into a DataFrame.

    The file is assumed to have the fifteen columns named below, in that
    order. Only the columns listed in ``usecols`` are kept.

    Parameters
    ----------
    fn:
        Path of the TSV file.
    usecols:
        Comma-separated names of the columns to load.
    """
    all_columns = "query,target,pident,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,qcov,tcov"
    column_names = all_columns.split(",")
    wanted_columns = usecols.split(",")
    table = pd.read_csv(
        fn,
        sep="\t",
        header=None,
        names=column_names,
        usecols=wanted_columns,
    )
    return table

def query_network(G: nx.Graph, query_node: str, radius: int) -> set:
    """Return the set of nodes in the ego graph of ``query_node``.

    The ego graph is built with ``nx.ego_graph`` using the given ``radius``.
    If networkx reports that the node is not found, an empty set is returned
    instead of raising.
    """
    try:
        neighbourhood = nx.ego_graph(G, query_node, radius=radius)
        members = set(neighbourhood.nodes())
    except nx.exception.NodeNotFound:
        members = set()
    return members
    
# Radius passed to query_network for every lookup below.
query_radius = 5
# query_network with the graph and radius pre-bound, so it takes only a node.
query_network_preloaded = partial(query_network, G, radius=query_radius)


# One row per (system, component): the matched cluster ids and the union of
# their graph neighbourhoods.
systems_matches = (
    read_mmseqs_search_output('/home/hugo/projects/icetea/results/mmseqs/New-DS-V2_X_SALMONELLA_REPS.search.tsv')
    # Keep only queries whose name starts with 'DS1' or 'DS4'.
    .loc[lambda df_: df_['query'].str.startswith(('DS1', 'DS4')), ['query', 'target']]
    # Split the query name on '-' into the two index levels and keep the target
    # as the 'cluster' column. A query with more or fewer than one '-' will not
    # match the two level names.
    # NOTE(review): 'componet' looks like a typo for 'component'; it is a
    # runtime label that appears in the output, so it is left unchanged here.
    .pipe(
        lambda df_: pd.DataFrame(
            data=df_.target.values,
            columns=['cluster'],
            index=pd.MultiIndex.from_arrays(df_['query'].str.split('-', expand=True).values.T, names=['system', 'componet']),
        )
    )
    # Neighbourhood node set for every matched cluster (empty set when the
    # cluster is not a node of G).
    .assign(
        closest_nodes = lambda df_: df_.cluster.apply(query_network_preloaded)
    )
    .groupby(['system', 'componet'])
    .agg(
        {
            "cluster": set,
            "closest_nodes": lambda s_: set(chain.from_iterable(s_)),
        }
    )
    # Drop rows whose neighbourhood set is empty.
    .loc[lambda df_: df_.closest_nodes.apply(bool)]
)




systems_matches

Unnamed: 0_level_0,Unnamed: 1_level_0,cluster,closest_nodes
system,componet,Unnamed: 2_level_1,Unnamed: 3_level_1
DS1,1,{HCKDJKIF_02548},"{IDNFEDEF_04437, EEPIDGAD_04500, DHBDDKCN_0024..."
DS1,2,{OECGJKMM_02137},"{EEPIDGAD_04500, JADHFFOP_00313, AHGJPDMF_0452..."
DS1,3,{HAOCBECO_02921},"{HIFGEHPB_00061, ADHBLBMG_00779, OBBFEPPO_0473..."
DS4,1,{PLOGOCIG_03756},"{EEPIDGAD_04500, CBHEPBDH_04236, IKHNHLMK_0037..."
DS4,2,{PGPIBDBP_02126},"{IDNFEDEF_04437, EEPIDGAD_04500, DHBDDKCN_0024..."


In [3]:
# Grab the first group only: after the break, `i` holds its key and
# `system_df` its rows. Used below as a single example to experiment with.
for i, system_df in systems_matches.groupby("system"):
    break


In [56]:
from typing import Tuple, List, Generator
from collections import defaultdict

def extract_edge_info(G: nx.Graph) -> Generator[Tuple[str, str, Tuple[Tuple[str, str, int], ...]], None, None]:
    """Regroup the per-edge ``'info'`` attribute of ``G`` by contig.

    Every edge of ``G`` must carry an ``'info'`` attribute holding an
    iterable of ``(assembly, contig, position)`` triples. The whole graph is
    scanned before the first item is yielded.

    Yields
    ------
    ``(assembly, contig, edges)`` where ``edges`` is a tuple of
    ``(source, target, position)`` triples sorted by ``position``.

    Raises
    ------
    KeyError
        If an edge has no ``'info'`` attribute.
    """
    contigs = defaultdict(list)

    for source, target, attributes in G.edges(data=True):
        # Distinct loop names: the original reused `i` for both loops.
        for assembly, contig, oncontig_pos in attributes['info']:
            contigs[(assembly, contig)].append((source, target, oncontig_pos))

    for contig_key, records in contigs.items():
        yield (*contig_key, tuple(sorted(records, key=lambda record: record[-1])))

def get_windows(t):
    """Label runs of consecutive values in ``t`` with increasing window numbers.

    The first element gets window 1. Each later element stays in the current
    window when it is exactly 1 greater than its predecessor and otherwise
    opens the next window. This is the plain-Python counterpart of the pandas
    expression ``series.diff().ne(1).cumsum()``.

    Returns a tuple with one window number per element (empty for empty input).
    """
    if len(t) == 0:
        return ()

    labels = [1]
    for previous, current in zip(t, t[1:]):
        step = 0 if current - previous == 1 else 1
        labels.append(labels[-1] + step)
    return tuple(labels)

def edge_info_to_df(t):
    """Summarise the edges of one contig per window of consecutive positions.

    ``t`` is an ``(assembly, contig, edge_info)`` triple where ``edge_info``
    is a sequence of ``(source, target, oncontig_pos)`` records. A new window
    starts at every record whose position is not exactly 1 greater than the
    previous record's position.

    Returns a DataFrame indexed by ``(assembly, contig, window)`` with the
    columns ``source`` and ``target``, each holding the set of values seen in
    that window.
    """
    assembly, contig, edge_info = t

    edges_df = pd.DataFrame(
        data=edge_info,
        columns=['source', 'target', 'oncontig_pos'],
    )
    # True wherever a record does not directly follow its predecessor;
    # the running sum of these flags numbers the windows.
    starts_new_window = edges_df.oncontig_pos.diff().ne(1)
    labelled = edges_df.assign(
        window=starts_new_window.cumsum(),
        assembly=assembly,
        contig=contig,
    )
    per_window = labelled.groupby(['assembly', 'contig', 'window'])
    # join the source and target nodes into a set
    return per_window.agg({'source': set, 'target': set})
    
    

def get_freq_from_sytem(system_df: pd.DataFrame, main_graph: nx.Graph,) -> pd.DataFrame:
    """Print the per-window edge summary of the first contig touching the system.

    The nodes in ``system_df.closest_nodes`` select a subgraph of
    ``main_graph``. Its edge info is regrouped per contig, contigs with at
    least one edge endpoint in ``system_df.cluster`` are kept, and the first
    of them is converted with ``edge_info_to_df`` and printed.

    NOTE: work in progress — only the first matching contig is handled, and
    the function returns ``None`` despite the annotated return type.
    """
    neighbourhood_nodes = set(chain.from_iterable(system_df.closest_nodes))
    system_G = main_graph.subgraph(neighbourhood_nodes)

    system_clusters = set(chain.from_iterable(system_df.cluster))

    def mentions_system_cluster(contig_record) -> bool:
        # The last element holds the (source, target, position) records;
        # drop the position and test both endpoints.
        endpoints = chain.from_iterable(edge[:-1] for edge in contig_record[-1])
        return any(node in system_clusters for node in endpoints)

    matching_frames = (
        edge_info_to_df(contig_record)
        for contig_record in extract_edge_info(system_G)
        if mentions_system_cluster(contig_record)
    )

    first_frame = next(matching_frames, None)
    if first_frame is not None:
        print(first_frame.sort_index())
    return None


# Try the function on the single example group taken from systems_matches above.
get_freq_from_sytem(system_df, G)


In [19]:
# Scratch cell: inspect the structure of a single edge of G.
genomes = defaultdict(list)

# Both loops break after their first iteration, so only the first 'info'
# entry of the first edge is printed and recorded.
for i in G.edges(data=True):
    source, target, attributes = i
    print(source, target)
    # NOTE: the inner loop reuses the name `i`, shadowing the outer loop variable.
    for i in attributes['info']:
        print(i)
        assembly, contig, oncontig_pos = i
        genomes[(assembly, contig)].append((source, target, oncontig_pos))
        break
    break

def test(d):
    """Yield ``(*key, records)`` for every item of ``d``.

    ``records`` is the item's value as a tuple sorted by the last element of
    each record.
    """
    for key, records in d.items():
        ordered = sorted(records, key=lambda record: record[-1])
        yield (*key, tuple(ordered))


# Print only the first regrouped item to check its shape.
for i in test(genomes):
    print(i)
    break


JFGHGAKM_04203 LBGIAHIP_00357
('59201.294', '77', 204)
('59201.294', '77', (('JFGHGAKM_04203', 'LBGIAHIP_00357', 204),))


In [39]:


# Goal: reproduce the pandas expression
#   windows=lambda df_: df_.on_contig_position.diff().ne(1).cumsum()
# in plain Python (see get_windows).
# input : t = (1,2,3,5,6,10)
# output: (1,1,1,2,2,3)




# Test inputs: gaps, one unbroken run, empty input, several single-element windows.
t = [
    (1,2,3,5,6,10),
    (1,2,3,4,5,6,7,8,9,10),
    (),
    (1,3,5,7,8,9,10),
]


# Print the window labels for each test input.
for i in t:
    print(get_windows(i))

(1, 1, 1, 2, 2, 3)
(1, 1, 1, 1, 1, 1, 1, 1, 1, 1)
()
(1, 2, 3, 4, 4, 4, 4)


In [50]:
# Size of each row's neighbourhood node set.
systems_matches.closest_nodes.apply(len)

system  componet
DS1     1           56743
        2           13324
        3            1705
DS4     1           40026
        2           61023
Name: closest_nodes, dtype: int64

In [1]:
def parse_merged_gff(fn: str):
    """Parse a merged GFF file and yield one ``(assembly, contigs)`` pair per genome.

    The file is read line by line:

    * ``#!genome-build-accession NCBI_Assembly:<id>`` starts a new genome;
      the text after the last ':' becomes the assembly id.
    * ``###`` closes the current genome and yields ``(assembly, contigs)``,
      where ``contigs`` maps each first-column value to a tuple of the
      protein ids seen for it, in file order.
    * Any other line starting with ``#`` and any blank line is ignored.
    * Every remaining line is a tab-separated record; its last field is
      passed to ``get_protein_id_from_gff_info`` and records without an id
      are skipped.

    Records that are not followed by a ``###`` line are never yielded.
    """
    with open(fn) as f:
        # Defined up front so a record or '###' that appears before any
        # assembly header does not hit an unbound name.
        assembly = None
        contigs = defaultdict(list)

        for raw_line in f:
            line = raw_line.strip()

            # Skip blank lines explicitly; looping while the line is truthy
            # would stop the parse at the first empty line.
            if not line:
                continue

            if line.startswith('#!genome-build-accession NCBI_Assembly:'):
                assembly = line.split(':')[-1]
                contigs = defaultdict(list)
                continue

            if line.startswith('###'):
                # Freeze the per-contig lists into tuples before handing them out.
                contigs = dict(map(lambda x: (x[0], tuple(x[1])), contigs.items()))
                yield (assembly, contigs)
                continue

            if line.startswith('#'):
                continue

            contig, *_, info = line.split('\t')

            if (protein_id := get_protein_id_from_gff_info(info)): #### <-- Walrus operator
                contigs[contig].append(protein_id)
