rework BGC storage
solve issue with distance calculations
adraismawur committed Jan 25, 2022
1 parent b1a1b38 commit 0781e72
Showing 13 changed files with 479 additions and 467 deletions.
93 changes: 51 additions & 42 deletions bigscape.py
@@ -88,25 +88,23 @@
# 2. genbank
# 3. query BGC
# all of these are generalized into this method, which returns the BGC info and GenBank info
# TODO: generalize into object
BGC_INFO, GEN_BANK_DICT, MIBIG_SET = gbk.import_gbks(RUN)

# CLUSTERS and SAMPLE_DICT contain the necessary structure for all-vs-all and sample analysis
CLUSTERS = list(GEN_BANK_DICT.keys())

print("\nTrying threading on {} cores".format(str(RUN.options.cores)))
BGC_INFO_DICT, GBK_FILE_DICT, MIBIG_SET = gbk.import_gbks(RUN)

# base names as a list and as a set
# also used in all-vs-all analysis
CLUSTER_NAME_LIST = list(GBK_FILE_DICT.keys())
CLUSTER_NAME_SET = set(CLUSTER_NAME_LIST)


### Step 2: Run hmmscan
print("\nTrying threading on {} cores".format(str(RUN.options.cores)))
print("\nPredicting domains using hmmscan")

CLUSTER_BASE_NAMES = set(CLUSTERS)

# get all fasta files in cache directory
CACHED_FASTA_FILES = hmm.get_cached_fasta_files(RUN)

# verify that all clusters have a corresponding fasta file in cache
hmm.check_fasta_files(RUN, CLUSTER_BASE_NAMES, CACHED_FASTA_FILES)
hmm.check_fasta_files(RUN, CLUSTER_NAME_SET, CACHED_FASTA_FILES)

# Make a list of all fasta files that need to be processed by hmmscan & BiG-SCAPE
# (i.e., they don't yet have a corresponding .domtable)
@@ -130,23 +128,23 @@

# verify that domtable files were generated successfully. each cluster should have a domtable
# file.
hmm.check_domtable_files(RUN, CLUSTER_BASE_NAMES, CACHED_DOMTABLE_FILES)
hmm.check_domtable_files(RUN, CLUSTER_NAME_SET, CACHED_DOMTABLE_FILES)

# find unprocessed files (assuming that if the pfd file exists, the pfs should too)
# this will just return all domtable files if force_hmmscan is set
DOMTABLE_FILES_TO_PROCESS = hmm.get_domtable_files_to_process(RUN, CACHED_DOMTABLE_FILES)

for domtableFile in DOMTABLE_FILES_TO_PROCESS:
hmm.parse_hmmscan(domtableFile, RUN.directories.pfd, RUN.directories.pfs,
RUN.options.domain_overlap_cutoff, RUN.options.verbose, GEN_BANK_DICT,
CLUSTERS, CLUSTER_BASE_NAMES, MIBIG_SET)
RUN.options.domain_overlap_cutoff, RUN.options.verbose, GBK_FILE_DICT,
CLUSTER_NAME_LIST, CLUSTER_NAME_SET, MIBIG_SET)

# If the number of pfd files did not change, no new sequences were added to the
# domain fastas and we could try to resume the multiple alignment phase
# baseNames have been pruned of BGCs with no domains that might've been added temporarily
TRY_RESUME_MULTIPLE_ALIGNMENT = False
ALREADY_DONE = hmm.get_processed_domtable_files(RUN, CACHED_DOMTABLE_FILES)
if len(CLUSTER_BASE_NAMES - set(pfd.split(os.sep)[-1][:-9] for pfd in ALREADY_DONE)) == 0:
if len(CLUSTER_NAME_SET - set(pfd.split(os.sep)[-1][:-9] for pfd in ALREADY_DONE)) == 0:
TRY_RESUME_MULTIPLE_ALIGNMENT = True
else:
# new sequences will be added to the domain fasta files. Clean domains folder
@@ -165,39 +163,59 @@
print("\nProcessing domains sequence files")

# do one more check of pfd files to see if they are all there
hmm.check_pfd_files(RUN, CLUSTER_BASE_NAMES)
hmm.check_pfd_files(RUN, CLUSTER_NAME_SET)

# BGCs --
BGCS = big_scape.BgcInfo() #will contain the BGCs
# this collection will contain all bgc objects
BGC_COLLECTION = big_scape.BgcCollection()

# init the collection with the acquired names from importing GBK files
BGC_COLLECTION.initialize(CLUSTER_NAME_LIST)

# the BGCs need to know which domains belong where
# this is done in this step
BGC_COLLECTION.init_ordered_domain_list(RUN)

# add info and source gbk files
BGC_COLLECTION.add_bgc_info(BGC_INFO_DICT)
BGC_COLLECTION.add_source_gbk_files(GBK_FILE_DICT)

if RUN.options.skip_ma:
print(" Running with skip_ma parameter: Assuming that the domains folder has all the \
fasta files")
BGCS.load_from_file(RUN)
BGC_COLLECTION.load_domain_names_from_dict(RUN)
else:
print(" Adding sequences to corresponding domains file")
BGCS.load_pfds(RUN, CLUSTER_BASE_NAMES, TRY_RESUME_MULTIPLE_ALIGNMENT)
BGC_COLLECTION.load_domain_names_from_pfd(RUN, TRY_RESUME_MULTIPLE_ALIGNMENT)

BGCS.save_to_file(RUN)
BGC_COLLECTION.save_domain_names_to_dict(RUN)

# Key: BGC. Item: ordered list of simple integers with the number of domains
# in each gene
# Instead of `DomainCountGene = defaultdict(list)`, let's try arrays of
# unsigned ints
GENE_DOMAIN_COUNT = {}
# GENE_DOMAIN_COUNT = {}
# list of gene-numbers that have a hit in the anchor domain list. Zero based
COREBIOSYNTHETIC_POS = {}
# COREBIOSYNTHETIC_POS = {}
# list of +/- orientation
BGC_GENE_ORIENTATION = {}
# BGC_GENE_ORIENTATION = {}

# TODO: remove this comment? not sure what it relates to
# if it's a re-run, the pfd/pfs files were not changed, so the skip_ma flag
# is activated. We have to open the pfd files to get the gene labels for
# each domain
# We now always have to have this data so the alignments are produced
big_scape.parse_pfd(RUN, CLUSTER_BASE_NAMES, GENE_DOMAIN_COUNT, COREBIOSYNTHETIC_POS,
BGC_GENE_ORIENTATION, BGC_INFO)
PARSE_PFD_RESULTS = big_scape.parse_pfd(RUN, BGC_COLLECTION)
GENE_DOMAIN_COUNT, COREBIOSYNTHETIC_POS, BGC_GENE_ORIENTATION = PARSE_PFD_RESULTS

BGC_COLLECTION.add_gene_domain_counts(GENE_DOMAIN_COUNT)
BGC_COLLECTION.add_bio_synth_core_pos(COREBIOSYNTHETIC_POS)
BGC_COLLECTION.add_gene_orientations(BGC_GENE_ORIENTATION)

# at this point we can assemble a gene string for bgc info
BGC_COLLECTION.init_gene_strings()



### Step 5: Create SVG figures
print(" Creating arrower-like figures for each BGC")
@@ -209,7 +227,7 @@

# verify if there are figures already generated

big_scape.generate_images(RUN, CLUSTER_BASE_NAMES, GEN_BANK_DICT, PFAM_INFO, BGC_INFO)
big_scape.generate_images(RUN, CLUSTER_NAME_SET, GBK_FILE_DICT, PFAM_INFO, BGC_INFO_DICT)
print(" Finished creating figures")
print("\n\n - - Calculating distance matrix - -")

@@ -221,7 +239,6 @@
print(" Trying to read domain alignments (*.algn files)")
ALIGNED_DOMAIN_SEQS = hmm.read_aligned_files(RUN)

CLUSTER_NAMES = tuple(sorted(CLUSTERS))
# copy html templates
HTML_TEMPLATE_PATH = os.path.join(ROOT_PATH, "html_template", "output")
dir_util.copy_tree(HTML_TEMPLATE_PATH, RUN.directories.output)
@@ -239,52 +256,44 @@

# This version contains info on all bgcs with valid classes
print(" Writing the complete Annotations file for the complete set")
big_scape.write_network_annotation_file(RUN, CLUSTER_NAMES, BGC_INFO)
big_scape.write_network_annotation_file(RUN, BGC_COLLECTION)

# Find index of all MIBiG BGCs if necessary
if RUN.mibig.use_mibig:
NAME_TO_IDX = {}
for clusterIdx, clusterName in enumerate(CLUSTER_NAMES):
for clusterIdx, clusterName in enumerate(BGC_COLLECTION.bgc_name_tuple):
NAME_TO_IDX[clusterName] = clusterIdx

MIBIG_SET_INDICES = set()
for bgc in MIBIG_SET:
MIBIG_SET_INDICES.add(NAME_TO_IDX[bgc])


# Get the ordered list of domains
DOMAIN_LIST = big_scape.get_ordered_domain_list(RUN, CLUSTER_BASE_NAMES)

# Making network files mixing all classes
if RUN.options.mix:
big_scape.generate_network(RUN, CLUSTER_NAMES, DOMAIN_LIST, BGC_INFO,
GENE_DOMAIN_COUNT, COREBIOSYNTHETIC_POS,
BGC_GENE_ORIENTATION, BGCS, ALIGNED_DOMAIN_SEQS,
big_scape.generate_network(RUN, BGC_COLLECTION, ALIGNED_DOMAIN_SEQS,
MIBIG_SET_INDICES, MIBIG_SET, RUNDATA_NETWORKS_PER_RUN,
HTML_SUBS_PER_RUN, True)

# Making network files separating by BGC class
if not RUN.options.no_classify:
big_scape.generate_network(RUN, CLUSTER_NAMES, DOMAIN_LIST, BGC_INFO,
GENE_DOMAIN_COUNT, COREBIOSYNTHETIC_POS,
BGC_GENE_ORIENTATION, BGCS, ALIGNED_DOMAIN_SEQS,
big_scape.generate_network(RUN, BGC_COLLECTION, ALIGNED_DOMAIN_SEQS,
MIBIG_SET_INDICES, MIBIG_SET, RUNDATA_NETWORKS_PER_RUN,
HTML_SUBS_PER_RUN, False)

# fetch genome list for overview.js
INPUT_CLUSTERS_IDX = []
big_scape.fetch_genome_list(RUN, INPUT_CLUSTERS_IDX, CLUSTER_NAMES, MIBIG_SET, BGC_INFO,
GEN_BANK_DICT)
big_scape.fetch_genome_list(RUN, INPUT_CLUSTERS_IDX, BGC_COLLECTION.bgc_name_tuple, MIBIG_SET, BGC_INFO_DICT,
GBK_FILE_DICT)

# update family data (convert global bgc indexes into input-only indexes)
big_scape.update_family_data(RUNDATA_NETWORKS_PER_RUN, INPUT_CLUSTERS_IDX, CLUSTER_NAMES,
big_scape.update_family_data(RUNDATA_NETWORKS_PER_RUN, INPUT_CLUSTERS_IDX, BGC_COLLECTION.bgc_name_tuple,
MIBIG_SET)

# generate overview data
RUN.end()
big_scape.generate_results_per_cutoff_value(RUN, RUNDATA_NETWORKS_PER_RUN, HTML_SUBS_PER_RUN)
# dump bgc info
pickle.dump(BGC_INFO, open(os.path.join(RUN.directories.cache, 'bgc_info.dict'), 'wb'))
pickle.dump(BGC_INFO_DICT, open(os.path.join(RUN.directories.cache, 'bgc_info.dict'), 'wb'))

# done
RUN.report_runtime()
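
Taken together, the bigscape.py changes above swap the loose, parallel dictionaries keyed by cluster name (BGC_INFO, GENE_DOMAIN_COUNT, COREBIOSYNTHETIC_POS, BGC_GENE_ORIENTATION) for a single BgcCollection that owns one BgcInfo per cluster. A minimal sketch of the new population pattern, with a made-up cluster name and illustrative values:

# hedged sketch of the reworked storage; the cluster name and all values are
# illustrative, not taken from a real run
import src.big_scape as big_scape

collection = big_scape.BgcCollection()
collection.initialize(["BGC0000001.region001"])

# per-cluster data that previously lived in separate module-level dicts
collection.add_gene_domain_counts({"BGC0000001.region001": [2, 1, 3]})
collection.add_bio_synth_core_pos({"BGC0000001.region001": [0]})
collection.add_gene_orientations({"BGC0000001.region001": [1, -1, 1]})

# downstream consumers now take the collection itself, e.g.
# big_scape.generate_network(RUN, collection, ALIGNED_DOMAIN_SEQS, ...)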
3 changes: 2 additions & 1 deletion src/big_scape/__init__.py
@@ -3,7 +3,8 @@
from .clustering import clusterJsonBatch
from .scores import calc_jaccard, calc_distance_lcs
from .run.base import Run, ClusterParam, DirParam, DistParam, GbkParam, MibigParam, NetworkParam, PfamParam
from .bgc_info import BgcInfo
from .pfd import parse_pfd
from .svg import generate_images
from .util import get_ordered_domain_list, fetch_genome_list, update_family_data, generate_results_per_cutoff_value, copy_template_per_cutoff, prepare_cutoff_rundata_networks, prepare_html_subs_per_run, write_network_annotation_file
from .bgc_collection import BgcCollection
from .bgc_info import BgcInfo
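
The net effect of this export change is that downstream code can pull the new collection type from the package root alongside the existing helpers; a one-line sketch, assuming the package layout shown in the imports above:

# assumed import surface after this change
from src.big_scape import BgcCollection, BgcInfo, parse_pfd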
148 changes: 148 additions & 0 deletions src/big_scape/bgc_collection.py
@@ -0,0 +1,148 @@
import os
import sys

from typing import Dict

from src.big_scape.util import get_ordered_domain_list
from src.big_scape.bgc_info import BgcInfo
from src.bgctools import generate_domain_name_info_dict
from src.utility import fasta_parser, save_domain_seqs

import pickle


class BgcCollection:
# a list of all bgcs recorded in this collection
bgc_name_list: list
# the same list as a set
bgc_name_set: set
# a tuple needed later
# TODO: refactor out
bgc_name_tuple: tuple

# dictionary of domains per bgc
bgc_ordered_domain_list: dict

# the dictionary of actual bgc info objects
bgc_collection_dict: Dict[str, BgcInfo]

def initialize(self, cluster_name_list):
self.bgc_name_list = cluster_name_list
self.bgc_name_set = set(cluster_name_list)
self.bgc_name_tuple = tuple(sorted(cluster_name_list))

self.bgc_collection_dict = {}
for cluster_name in cluster_name_list:
self.bgc_collection_dict[cluster_name] = BgcInfo(cluster_name)

def init_ordered_domain_list(self, run):
# Get the ordered list of domains
self.bgc_ordered_domain_list = get_ordered_domain_list(run, self.bgc_name_set)
for cluster_name in self.bgc_ordered_domain_list:
domain_list = self.bgc_ordered_domain_list[cluster_name]
self.bgc_collection_dict[cluster_name].ordered_domain_list = domain_list

# we will also want a set later on for various uses
self.bgc_collection_dict[cluster_name].ordered_domain_set = set(domain_list)

def add_bgc_info(self, bgc_info_dict):
for cluster_name in self.bgc_collection_dict:
if cluster_name in bgc_info_dict:
bgc_info = bgc_info_dict[cluster_name]
self.bgc_collection_dict[cluster_name].bgc_info = bgc_info
else:
# TODO: replace with logger
print("Warning: BGC info for {} was not found".format(cluster_name))

def add_source_gbk_files(self, source_gbk_file_dict):
for cluster_name in self.bgc_collection_dict:
if cluster_name in source_gbk_file_dict:
source_gbk_file = source_gbk_file_dict[cluster_name]
self.bgc_collection_dict[cluster_name].src_gbk_file = source_gbk_file
else:
# TODO: replace with logger
print("Warning: BGC info for {} was not found".format(cluster_name))

def add_gene_domain_counts(self, gene_domain_count_dict):
for cluster_name in self.bgc_collection_dict:
if cluster_name in gene_domain_count_dict:
gene_domain_counts = gene_domain_count_dict[cluster_name]
self.bgc_collection_dict[cluster_name].num_genes = len(gene_domain_counts)
self.bgc_collection_dict[cluster_name].gene_domain_counts = gene_domain_counts
else:
self.bgc_collection_dict[cluster_name].num_genes = 0
# TODO: replace with logger
print("Warning: Domain count info for {} was not found".format(cluster_name))

def add_gene_orientations(self, gene_orientation_dict):
for cluster_name in self.bgc_collection_dict:
if cluster_name in gene_orientation_dict:
gene_orientations = gene_orientation_dict[cluster_name]
self.bgc_collection_dict[cluster_name].gene_orientations = gene_orientations
else:
# TODO: replace with logger
print("Warning: Gene orientation info for {} was not found".format(cluster_name))

def add_bio_synth_core_pos(self, bio_synth_core_positions):
for cluster_name in self.bgc_collection_dict:
if cluster_name in bio_synth_core_positions:
cluster_bio_synth_core_positions = bio_synth_core_positions[cluster_name]
self.bgc_collection_dict[cluster_name].bio_synth_core_positions = cluster_bio_synth_core_positions
else:
# TODO: replace with logger
print("Warning: Biosynthetic gene core position info for {} was not found".format(cluster_name))

def load_domain_names_from_dict(self, run):
try:
            # read in binary mode: the dict was pickled in binary mode by save_domain_names_to_dict
            with open(os.path.join(run.directories.cache, "BGCs.dict"), "rb") as bgc_file:
                domain_name_info_dict = pickle.load(bgc_file)
                for cluster_name in domain_name_info_dict:
                    domain_name_info = domain_name_info_dict[cluster_name]
                    self.bgc_collection_dict[cluster_name].domain_name_info = domain_name_info
except IOError:
sys.exit("BGCs file not found...")

def load_domain_names_from_pfd(self, run, try_resume_multiple_alignment):
for outputbase in self.bgc_name_list:
if run.options.verbose:
print(" Processing: " + outputbase)

pfd_file = os.path.join(run.directories.pfd, outputbase + ".pfd")
            with open(pfd_file) as pfd_handle:
                filtered_matrix = [[part.strip() for part in line.split('\t')] for line in pfd_handle]

# save each domain sequence from a single BGC in its corresponding file
fasta_file = os.path.join(run.directories.bgc_fasta, outputbase + ".fasta")

# only create domain fasta if the pfd content is different from original and
# domains folder has been emptied. Else, if trying to resume alignment phase,
# domain fasta files will contain duplicate sequence labels
if not try_resume_multiple_alignment:
with open(fasta_file, "r") as fasta_file_handle:
# all fasta info from a BGC
fasta_dict = fasta_parser(fasta_file_handle)
save_domain_seqs(filtered_matrix, fasta_dict,
run.directories.domains, outputbase)

domain_name_info = generate_domain_name_info_dict(filtered_matrix)
self.bgc_collection_dict[outputbase].domain_name_info = domain_name_info

del filtered_matrix[:]

def save_domain_names_to_dict(self, run):
# store processed BGCs dictionary for future re-runs
with open(os.path.join(run.directories.cache, "BGCs.dict"), "wb") as bgc_file:
domain_name_info = {}
for cluster_name in self.bgc_collection_dict:
domain_name_info[cluster_name] = self.bgc_collection_dict[cluster_name].domain_name_info
            pickle.dump(domain_name_info, bgc_file)

def init_gene_strings(self):
for cluster_name in self.bgc_collection_dict:
self.bgc_collection_dict[cluster_name].init_gene_string()

def is_ready(self):
return
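
A hedged usage sketch of the persistence round-trip implemented above; the Run object is stubbed with the single attribute these two methods touch, and the domain_name_info payload is purely illustrative:

import os
from src.big_scape import BgcCollection

class _FakeDirs:
    # stand-in for run.directories; only .cache is used by the two methods
    cache = "/tmp/bigscape_cache"

class _FakeRun:
    directories = _FakeDirs()

os.makedirs(_FakeDirs.cache, exist_ok=True)

collection = BgcCollection()
collection.initialize(["clusterA"])
collection.bgc_collection_dict["clusterA"].domain_name_info = {"PF00001": ["clusterA_ORF1"]}

# store processed BGC domain info for future re-runs...
collection.save_domain_names_to_dict(_FakeRun())

# ...and read it back, as happens when skip_ma is set
resumed = BgcCollection()
resumed.initialize(["clusterA"])
resumed.load_domain_names_from_dict(_FakeRun())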
