scripts/beta_significance.py

#!/usr/bin/env python
# File created on 4 June 2010
from __future__ import division

__author__ = "Justin Kuczynski"
__copyright__ = "Copyright 2011, The QIIME Project"
__credits__ = ["Justin Kuczynski", "Jose Antonio Navas", "Greg Caporaso"]
__license__ = "GPL"
__version__ = "1.8.0"
__maintainer__ = "Justin Kuczynski"
__email__ = "justinak@gmail.com"


from qiime.util import make_option
import os
from numpy import array

import warnings
warnings.filterwarnings('ignore', 'Not using MPI as mpi4py not found')
from cogent.maths.unifrac.fast_unifrac import fast_unifrac_permutations_file, TEST_ON_PAIRWISE, TEST_ON_TREE, TEST_ON_ENVS
from cogent.maths.unifrac.fast_unifrac import fast_p_test_file
from qiime.util import parse_command_line_parameters
from qiime.format import format_unifrac_sample_mapping
from biom.parse import parse_biom_table


script_info = {}
script_info['brief_description'] = "This script runs any of a set of common tests to determine if a sample is statistically significantly different from another sample"
script_info['script_description'] = "The tests are conducted on each pair of samples present in the input otu table. See the unifrac tutorial online for more details (http://bmf2.colorado.edu/unifrac/tutorial.psp)"

script_info['script_usage'] = []

script_info['script_usage'].append(("Example:","Perform 100 randomizations of sample/sequence assignments, and record the probability that sample 1 is phylogenetically different from sample 2, using the unifrac monte carlo significance test. The test is run for all pairs of samples.","%prog -i otu_table.biom -t rep_set.tre -s unweighted_unifrac -o unw_sig.txt"))

script_info['output_description']= "The script outputs a tab delimited text file with each pair of samples and a p value representing the probability that a random sample/sequence assignment will result in more dissimilar samples than the actual pair of samples."

script_info['required_options'] = [\
 make_option('-i', '--input_path',type='existing_filepath',
     help='input otu table in biom format'), 
 make_option('-o', '--output_path',type='new_filepath',
     help='output results path'),
 make_option('-s', '--significance_test',type='choice',
     choices=['unweighted_unifrac', 'weighted_unifrac', 'weighted_normalized_unifrac', 'p-test'],
     help="significance test to use, options are 'unweighted_unifrac', 'weighted_unifrac', 'weighted_normalized_unifrac', or 'p-test'"), 
 make_option('-t', '--tree_path',type='existing_filepath',help='path to newick tree file'),

]
script_info['optional_options'] = [
 make_option('-n', '--num_iters', default=100, type="int",
     help='number of monte carlo randomizations [default: %default]'),
 make_option('-k', '--type_of_test', type='choice',
     choices=['all_together', 'each_pair', 'each_sample'], default='each_pair',
     help="type of significance test to perform, options are 'all_together', 'each_pair' or 'each_sample'. [default: %default]"),
]
script_info['version'] = __version__

def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    otu_table_fp = opts.input_path
    otu_table = parse_biom_table(open(otu_table_fp, 'U'))
    sample_ids = otu_table.SampleIds
    otu_ids = otu_table.ObservationIds
    ## This is not memory safe: need to be able to load the otu table as ints
    otu_table_array = array(list(otu_table.iterObservationData()),dtype='int')

    if opts.type_of_test == 'all_together':
        type_of_test = TEST_ON_TREE
        header_text = "sample\tp value\tp value (Bonferroni corrected)\n"
    elif opts.type_of_test == 'each_pair':
        type_of_test = TEST_ON_PAIRWISE
        header_text = "sample 1\tsample 2\tp value\tp value (Bonferroni corrected)\n"
    elif opts.type_of_test == 'each_sample':
        type_of_test = TEST_ON_ENVS
        header_text = "sample\tp value\tp value (Bonferroni corrected)\n"
        if opts.significance_test == 'p-test':
            raise RuntimeError('significance test type "each_sample" not allowed for p-test')
    else:
        raise RuntimeError('significance test type "%s" not found' %\
            opts.type_of_test)

    # note, uses ugly temp file
    if opts.significance_test == 'unweighted_unifrac':
        tree_in = open(opts.tree_path,'U')
        output_fp = opts.output_path + '_envs.tmp'
        
        result = format_unifrac_sample_mapping(
            sample_ids, otu_ids, otu_table_array)
        of = open(output_fp, 'w')
        of.write('\n'.join(result))
        of.close()
        envs_in = open(output_fp,'U')

        result = fast_unifrac_permutations_file(tree_in, envs_in,
            weighted=False, num_iters=opts.num_iters, verbose=opts.verbose, test_on=type_of_test)
        envs_in.close()
        os.remove(output_fp)
        
        of = open(opts.output_path,'w')
        of.write("#unweighted unifrac significance test\n")
        of.write(header_text)
        for line in result:
            of.write('\t'.join(map(str,line)) + '\n')
        of.close()

    elif opts.significance_test == 'p-test':
        tree_in = open(opts.tree_path,'U')
        output_fp = opts.output_path + '_envs.tmp'
    
        result = format_unifrac_sample_mapping(
            sample_ids, otu_ids, otu_table_array)
        of = open(output_fp, 'w')
        of.write('\n'.join(result))
        of.close()
        envs_in = open(output_fp,'U')

        result = fast_p_test_file(tree_in, envs_in,
            num_iters=opts.num_iters, verbose=opts.verbose, test_on=type_of_test)
        envs_in.close()
        os.remove(output_fp)
        of = open(opts.output_path,'w')
        of.write(\
            "#andy martin's p-test significance test\n")
        of.write(header_text)
        for line in result:
            of.write('\t'.join(map(str,line)) + '\n')
        of.close()

    elif opts.significance_test == 'weighted_unifrac':
        tree_in = open(opts.tree_path,'U')
        output_fp = opts.output_path + '_envs.tmp'
    
        result = format_unifrac_sample_mapping(
            sample_ids, otu_ids, otu_table_array)
        of = open(output_fp, 'w')
        of.write('\n'.join(result))
        of.close()
        envs_in = open(output_fp,'U')

        result = fast_unifrac_permutations_file(tree_in, envs_in,
            weighted=True, num_iters=opts.num_iters, verbose=opts.verbose, test_on=type_of_test)
        envs_in.close()
        os.remove(output_fp)
        of = open(opts.output_path,'w')
        of.write(\
            "#weighted unifrac significance test\n")
        of.write(header_text)
        for line in result:
            of.write('\t'.join(map(str,line)) + '\n')
        of.close()

    elif opts.significance_test == 'weighted_normalized_unifrac':
		tree_in = open(opts.tree_path, 'U')
		output_fp = opts.output_path + '_envs.tmp'

		result = format_unifrac_sample_mapping(
			sample_ids, otu_ids, otu_table_array)
		of = open(output_fp, 'w')
		of.write('\n'.join(result))
		of.close()
		envs_in = open(output_fp, 'U')

		result = fast_unifrac_permutations_file(tree_in, envs_in,
			weighted='correct', num_iters = opts.num_iters, verbose=opts.verbose, test_on=type_of_test)
		envs_in.close()
		os.remove(output_fp)
		of = open(opts.output_path, 'w')
		of.write(\
			"#weighted normalized unifrac significance test\n")
		of.write(\
			"sample 1\tsample 2\tp value\tp value (Bonferroni corrected)\n")
		for line in result:
			of.write('\t'.join(map(str,line)) + '\n')
		of.close()

    else:
        raise RuntimeError('significance test "%s" not found' %\
            opts.significance_test)
if __name__ == "__main__":
    main()