Permalink
Browse files

Clade finding now uses ANI95 clades

- Removes `G` headers from ANI95 and AAI90 clade files
- New `:medoids_ani95` file with ANI95 clade medoids
- `:clade_finding` only runs on ANI95 medoid datasets
- ANI is now calculated for all pairs with ANI ≥ 85% (it was 90% before)
- Gem v0.3.6.0 pushed
  • Loading branch information...
lmrodriguezr committed Nov 28, 2018
1 parent 3580547 commit 5cd7901f3aad40634e3d021980aa96297f3e2095
@@ -1,8 +1,8 @@
# @package MiGA
# @license Artistic-2.0

require "miga/result"
require "miga/project/base"
require 'miga/result'
require 'miga/project/base'

##
# Helper module including specific functions to add project results.
@@ -31,7 +31,7 @@ def results
# Supported values include:
# - +force+: A Boolean indicating if the result must be re-indexed. If true,
# it implies save=true.
def add_result(name, save=true, opts={})
def add_result(name, save = true, opts = {})
return nil if @@RESULT_DIRS[name].nil?
base = "#{path}/data/#{@@RESULT_DIRS[name]}/miga-project"
if opts[:force]
@@ -49,21 +49,21 @@ def add_result(name, save=true, opts={})
##
# Get the next distances task, saving intermediate results if +save+. Returns
# a Symbol.
def next_distances(save=true) ; next_task(@@DISTANCE_TASKS, save) ; end
def next_distances(save = true) ; next_task(@@DISTANCE_TASKS, save) ; end

##
# Get the next inclade task, saving intermediate results if +save+. Returns a
# Symbol.
def next_inclade(save=true) ; next_task(@@INCLADE_TASKS, save) ; end
def next_inclade(save = true) ; next_task(@@INCLADE_TASKS, save) ; end

##
# Get the next task from +tasks+, saving intermediate results if +save+.
# Returns a Symbol.
def next_task(tasks=@@DISTANCE_TASKS+@@INCLADE_TASKS, save=true)
def next_task(tasks = @@DISTANCE_TASKS+@@INCLADE_TASKS, save = true)
tasks.find do |t|
if metadata["run_#{t}"]==false or
if metadata["run_#{t}"] == false or
(!is_clade? and @@INCLADE_TASKS.include?(t) and
metadata["run_#{t}"]!=true)
metadata["run_#{t}"] != true)
false
else
add_result(t, save).nil?
@@ -79,10 +79,10 @@ def next_task(tasks=@@DISTANCE_TASKS+@@INCLADE_TASKS, save=true)
def add_result_distances(base)
return nil unless result_files_exist?(base, %w[.Rdata .log .txt])
r = MiGA::Result.new("#{base}.json")
r.add_file(:rdata, "miga-project.Rdata")
r.add_file(:matrix, "miga-project.txt")
r.add_file(:log, "miga-project.log")
r.add_file(:hist, "miga-project.hist")
r.add_file(:rdata, 'miga-project.Rdata')
r.add_file(:matrix, 'miga-project.txt')
r.add_file(:log, 'miga-project.log')
r.add_file(:hist, 'miga-project.hist')
r
end

@@ -91,48 +91,49 @@ def add_result_clade_finding(base)
return nil unless is_clade? or result_files_exist?(base,
%w[.pdf .classif .medoids .class.tsv .class.nwk])
r = add_result_iter_clades(base)
r.add_file(:aai_tree, "miga-project.aai.nwk")
r.add_file(:proposal, "miga-project.proposed-clades")
r.add_file(:clades_aai90, "miga-project.aai90-clades")
r.add_file(:clades_ani95, "miga-project.ani95-clades")
r.add_file(:aai_tree, 'miga-project.aai.nwk')
r.add_file(:proposal, 'miga-project.proposed-clades')
r.add_file(:clades_aai90, 'miga-project.aai90-clades')
r.add_file(:clades_ani95, 'miga-project.ani95-clades')
r.add_file(:medoids_ani95, 'miga-project.ani95-medoids')
r
end

def add_result_subclades(base)
return nil unless result_files_exist?(base,
%w[.pdf .classif .medoids .class.tsv .class.nwk])
r = add_result_iter_clades(base)
r.add_file(:ani_tree, "miga-project.ani.nwk")
r.add_file(:ani_tree, 'miga-project.ani.nwk')
r
end

def add_result_iter_clades(base)
r = MiGA::Result.new("#{base}.json")
r.add_file(:report, "miga-project.pdf")
r.add_file(:class_table, "miga-project.class.tsv")
r.add_file(:class_tree, "miga-project.class.nwk")
r.add_file(:classif, "miga-project.classif")
r.add_file(:medoids, "miga-project.medoids")
r.add_file(:report, 'miga-project.pdf')
r.add_file(:class_table, 'miga-project.class.tsv')
r.add_file(:class_tree, 'miga-project.class.nwk')
r.add_file(:classif, 'miga-project.classif')
r.add_file(:medoids, 'miga-project.medoids')
r
end

def add_result_ogs(base)
return nil unless result_files_exist?(base, %w[.ogs .stats])
r = MiGA::Result.new("#{base}.json")
r.add_file(:ogs, "miga-project.ogs")
r.add_file(:abc, "miga-project.abc")
r.add_file(:stats, "miga-project.stats")
r.add_file(:core_pan, "miga-project.core-pan.tsv")
r.add_file(:core_pan_plot, "miga-project.core-pan.pdf")
r.add_file(:ogs, 'miga-project.ogs')
r.add_file(:abc, 'miga-project.abc')
r.add_file(:stats, 'miga-project.stats')
r.add_file(:core_pan, 'miga-project.core-pan.tsv')
r.add_file(:core_pan_plot, 'miga-project.core-pan.pdf')
r
end

def add_result_project_stats(base)
return nil unless
result_files_exist?(base, %w[.taxonomy.json .metadata.db])
r = MiGA::Result.new("#{base}.json")
r.add_file(:taxonomy_index, "miga-project.taxonomy.json")
r.add_file(:metadata_index, "miga-project.metadata.db")
r.add_file(:taxonomy_index, 'miga-project.taxonomy.json')
r.add_file(:metadata_index, 'miga-project.metadata.db')
r
end

@@ -10,7 +10,7 @@ module MiGA
# - Float representing the major.minor version.
# - Integer representing gem releases of the current version.
# - Integer representing minor changes that require new version number.
VERSION = [0.3, 5, 2]
VERSION = [0.3, 6, 0]

##
# Nickname for the current major.minor version.
@@ -417,10 +417,15 @@ Supported file keys:
the AAI against the corresponding medoid.
* `medoids` (*req* for `genomes`): List of medoids per cluster.
* `aai_tree`: Bio-NJ tree based on AAI distances in Newick format.
* `proposal` (*req*): Proposed species-level clades in the project. One line per
proposed clade, with comma-delimited dataset names.
* `clades_aai90`: Clades formed at AAI > 90% in the same format as `proposal`.
* `clades_ani95`: Same as `proposal`.
* `proposal` (*req*): Proposed species-level clades in the project, based on
`clades_ani95`. One line per proposed clade, with tab-delimited dataset names.
Only clades with 5 or more members are included.
* `clades_aai90`: Clades formed at AAI > 90%. One clade per line, with
comma-delimited dataset names.
* `clades_ani95`: Clades formed at ANI > 95%. One clade per line, with
comma-delimited dataset names.
* `medoids_ani95`: List of `clades_ani95` datasets with the smallest distance to
all members of its own ANI95 clade. The list is in the same order.

MiGA symbol: `clade_finding`.

@@ -1,41 +1,41 @@
$:.unshift File.expand_path("../lib", __FILE__)
$:.unshift File.expand_path('../lib', __FILE__)

require "miga/version"
require 'miga/version'

Gem::Specification.new do |s|
# Basic information
s.name = "miga-base"
s.name = 'miga-base'
s.version = MiGA::MiGA.FULL_VERSION
s.date = MiGA::MiGA.VERSION_DATE.to_s

# Metadata
s.license = "Artistic-2.0"
s.summary = "MiGA"
s.description = "Microbial Genomes Atlas"
s.authors = ["Luis M. Rodriguez-R"]
s.email = "lmrodriguezr@gmail.com"
s.homepage = "http://enve-omics.ce.gatech.edu/miga"
s.license = 'Artistic-2.0'
s.summary = 'MiGA'
s.description = 'Microbial Genomes Atlas'
s.authors = ['Luis M. Rodriguez-R']
s.email = 'lmrodriguezr@gmail.com'
s.homepage = 'http://enve-omics.ce.gatech.edu/miga'

# Files
s.files = Dir[
"lib/**/*.rb", "test/**/*.rb", "lib/miga/_data/**/*",
"scripts/*.bash", "utils/**/*", "bin/*", "actions/*",
"Gemfile", "Rakefile", "README.md", "LICENSE"
'lib/**/*.rb', 'test/**/*.rb', 'lib/miga/_data/**/*',
'scripts/*.bash', 'utils/**/*', 'bin/*', 'actions/*',
'Gemfile', 'Rakefile', 'README.md', 'LICENSE'
]
s.executables << "miga"
s.executables << 'miga'

# Dependencies
s.add_runtime_dependency "daemons", "~> 1.2", ">= 1.2.4"
s.add_runtime_dependency 'daemons', '~> 1.2', '>= 1.2.4'
# Testing requires "shush" from daemons, first introduced in v1.2.4
s.add_runtime_dependency "json", "> 1.8", "< 3"
s.add_runtime_dependency "sqlite3", "~> 1.3"
s.required_ruby_version = ">= 1.9"
s.add_runtime_dependency 'json', '> 1.8', '< 3'
s.add_runtime_dependency 'sqlite3', '~> 1.3'
s.required_ruby_version = '>= 1.9'

# Docs + tests
s.extra_rdoc_files << "README.md"
s.extra_rdoc_files << 'README.md'
s.rdoc_options = %w(lib README.md --main README.md)
s.rdoc_options << "--title" << s.summary
s.add_development_dependency "rake", "~> 11"
s.add_development_dependency "test-unit", "~> 3"
s.rdoc_options << '--title' << s.summary
s.add_development_dependency 'rake', '~> 11'
s.add_development_dependency 'test-unit', '~> 3'

end
@@ -18,6 +18,7 @@ def aai(target)
dataset.name, target.name, tmp_dbs[:aai]).tap{ checkpoint :aai }
end

##
# Estimates AAI against +target+ using hAAI
def haai(target)
haai = aai_cmd(tmp_file("ess_genes.fa"),
@@ -34,6 +35,7 @@ def haai(target)
aai
end

##
# Calculates ANI against +target+
def ani(target)
# Check if the request makes sense
@@ -49,6 +51,15 @@ def ani(target)
dataset.name, target.name, tmp_dbs[:ani]).tap{ checkpoint :ani }
end

##
# Calculates and returns ANI against +target+ if AAI >= 85%. Returns
# +nil+ otherwise
def ani_after_aai(target)
aai = aai(target)
ani(target) unless aai.nil? or aai < 85.0
end

##
# Execute an AAI command
def aai_cmd(f1, f2, n1, n2, db, o={})
o = opts.merge(o)
@@ -59,6 +70,7 @@ def aai_cmd(f1, f2, n1, n2, db, o={})
(v.nil? or v.empty?) ? 0 : v.to_f
end

##
# Execute an ANI command
def ani_cmd(f1, f2, n1, n2, db, o={})
o = opts.merge(o)
@@ -30,7 +30,9 @@ def initialize(project_path, dataset_name, opts_hash={})
if opts[:run_taxonomy] && project.metadata[:ref_project]
@home = File.expand_path('05.taxonomy', @home)
@ref_project = MiGA::Project.load(project.metadata[:ref_project])
raise "Cannot load reference project: #{project.metadata[:ref_project]}" if @ref_project.nil?
if @ref_project.nil?
raise "Cannot load reference project: #{project.metadata[:ref_project]}"
end
else
@ref_project = project
end
@@ -55,47 +57,61 @@ def go!
def go_ref!
# Initialize databases
initialize_dbs! true

# first-come-first-serve traverse
ref_project.each_dataset do |ds|
next if !ds.is_ref? or ds.is_multi? or ds.result(:essential_genes).nil?
puts "[ #{Time.now} ] #{ds.name}"
aai = aai(ds)
ani(ds) unless aai.nil? or aai < 90.0
ani_after_aai(ds)
end

# Finalize
[:haai, :aai, :ani].each{ |m| checkpoint! m if db_counts[m] > 0 }
end

##
# Launch analysis for query datasets
def go_query!
# Check if project is ready
v = ref_project.is_clade? ? [:subclades, :ani] : [:clade_finding, :aai]
res = ref_project.result(v[0])
tsk = ref_project.is_clade? ? [:subclades, :ani] : [:clade_finding, :aai]
res = ref_project.result(tsk[0])
return if res.nil?

# Initialize the databases
initialize_dbs! false
# Calculate the classification-informed AAI/ANI traverse
results = File.expand_path("#{dataset.name}.#{v[1]}-medoids.tsv", home)
fh = File.open(results, "w")
classif, val_cls = *classify(res.dir, ".", v[1], fh)
results = File.expand_path("#{dataset.name}.#{tsk[1]}-medoids.tsv", home)
fh = File.open(results, 'w')
classif, val_cls = *classify(res.dir, '.', tsk[1], fh)
fh.close

# Calculate all the AAIs/ANIs against the lowest subclade (if classified)
par_dir = File.dirname(File.expand_path(classif, res.dir))
par = File.expand_path("miga-project.classif", par_dir)
par = File.expand_path('miga-project.classif', par_dir)
closest = {dataset: nil, ani: 0.0}
if File.size? par
File.open(par, "r") do |fh|
File.open(par, 'r') do |fh|
fh.each_line do |ln|
r = ln.chomp.split("\t")
next unless r[1].to_i==val_cls
target = ref_project.dataset(r[0])
aai = (v[1]==:aai) ? aai(target) : 100.0
ani(target) if aai >= 90.0
next unless r[1].to_i == val_cls
ani = ani_after_aai(ref_project.dataset(r[0]))
closest = {ds: r[0], ani: ani} unless ani.nil? or ani < closest[:ani]
end
end
end

# Calculate all the AAIs/ANIs against the closest ANI95-clade (if ANI > 95%)
cl_path = File.expand_path('miga-project.ani95-clades', home)
if File.size? cl_path and tsk[0] == :clade_finding and closest[:ani] >= 95.0
File.foreach(cl_path).
map { |i| i.chomp.split(',') }.
find { |i| i.include? closest[:ds] }.
each { |i| ani_after_aai(ref_project.dataset(i)) }
end

# Finalize
[:haai, :aai, :ani].each{ |m| checkpoint! m if db_counts[m] > 0 }
build_medoids_tree(v[1])
build_medoids_tree(tsk[1])
transfer_taxonomy(tax_test)
end

@@ -0,0 +1,31 @@
#!/usr/bin/env Rscript
#
# @package MiGA
# @license Artistic-2.0
#

#= Load stuff
argv <- commandArgs(trailingOnly=T)
suppressPackageStartupMessages(library(ape))

find_medoids <- function(dist, out, clades) {
dist <- as.matrix(dist)
cl <- read.table(clades, header = FALSE, sep = '\t', as.is = TRUE)[,1]
medoids <- c()
for(i in cl){
lab <- strsplit(i, ',')[[1]]
cat('Clade of:', lab[1], '\n')
if(length(lab) == 1) {
med <- lab
} else {
med <- lab[which.min(colSums(dist[lab, lab], na.rm = TRUE))]
}
medoids <- c(medoids, med)
}
write.table(medoids, out, quote = FALSE, row.names = FALSE, col.names = FALSE)
}

#= Main
load(argv[1])
find_medoids(dist = ani.d, out = argv[2], clades = argv[3])

Oops, something went wrong.

0 comments on commit 5cd7901

Please sign in to comment.