-
Notifications
You must be signed in to change notification settings - Fork 7
/
multiPhate.py
1212 lines (1045 loc) · 58.4 KB
/
multiPhate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python
################################################################
#
# Program Title: multiPhate.py (/MultiPhate/)
#
# Last Update: 09 April 2020
#
# Description: Script multiPhate.py runs the phate annotation pipeline over a set of input phage genomes. This code runs under
# Python 3.7, and requires dependent packages and databases as listed in the README file.
# multiPhate.py inputs a configuration file (see sample_multiPhate.config), and uses it to construct a set of
# configuration files, one for each genome. Then, multiPhate.py executes phate_runPipeline.py over all of the genomes in the set.
#
# Usage: python multiPhate.py myMultiPhate.config
# (see sample_multiPhate.config for how to create your configuration file)
#
################################################################
# This code was developed by Carol L. Ecale Zhou at Lawrence Livermore National Laboratory.
# THIS CODE IS COVERED BY THE BSD LICENSE. SEE INCLUDED FILE BSD.PDF FOR DETAILS.
# DO NOT MODIFY ANYTHING IN THIS FILE EXCEPT ITEMS LABELED AS "USER CONFIGURATION"
import sys, os, re, string, copy, time, datetime
import subprocess
# CONFIGURABLE (user configuration)
#
# 1) HPC -- set to True when running multiPhATE on a high-performance computing (HPC) system
#    (e.g., under SLURM). With HPC = True the multiPhate.log file is not written, which avoids
#    I/O contention from many processes all trying to write to that one file.
HPC = False

# 2) PHATE_OUT / PHATE_ERR -- string flags ('True' / 'False'), exported to the environment below.
#    On a linux system they may be set to 'True' to capture output to files; this is not
#    guaranteed to work under other operating systems.
#    NOTE(review): presumably PHATE_OUT governs standard output and PHATE_ERR standard error --
#    confirm against the pipeline code that reads these variables.
PHATE_ERR = 'True'
PHATE_OUT = 'False'

# Default verbosity (string flags). These are normally set in the config file; the values here
# take effect only when the config file does not specify them.
CLEAN_RAW_DATA_DEFAULT = PHATE_PROGRESS_DEFAULT = 'True'   # CLEAN_RAW_DATA 'False' => raw Blast and Hmm outputs are saved in the PipelineOutput folder
PHATE_WARNINGS_DEFAULT = PHATE_MESSAGES_DEFAULT = 'False'
CGC_WARNINGS_DEFAULT = CGC_MESSAGES_DEFAULT = CGC_PROGRESS_DEFAULT = 'False'

# Debug switch for this (local) code only; leave False unless debugging.
DEBUG = False
# Env: BLAST parameters. Normally leave these settings alone. These are minimum cutoffs. Configure stringency in config file.
# Kept as strings; they are the fallback values when the config file does not supply its own.
BLASTP_IDENTITY_DEFAULT = '60'
BLASTP_HIT_COUNT_DEFAULT = '3'
BLASTN_HIT_COUNT_DEFAULT = '3'

# Constants; defaults will apply if not specified in config file
# Leave all this stuff alone!

# Standard directories. BASE_DIR_DEFAULT is the current working directory with a trailing path
# separator, and every *_DIR_DEFAULT below is already a full path rooted at it.
BASE_DIR_DEFAULT = os.path.join(os.getcwd(),"") # Ex: /Home/MyName/MyCodeDirectory/multiPhATE/
DATABASE_DIR_DEFAULT = BASE_DIR_DEFAULT + "Databases/"
SOFTWARE_DIR_DEFAULT = BASE_DIR_DEFAULT + "ExternalCodes/"
PIPELINE_INPUT_DIR_DEFAULT = BASE_DIR_DEFAULT + "PipelineInput/"
PIPELINE_OUTPUT_DIR_DEFAULT = BASE_DIR_DEFAULT + "PipelineOutput/"

# Pipeline code and standard file names
PHATE_PIPELINE_CODE = 'phate_runPipeline.py'
CONSENSUS_CALLS_FILE = 'phanotate.cgc' #*** For now this is PHANOTATE calls, though may be consensus calls in future
GENE_FILE = 'gene.fnt' #
PROTEIN_FILE = 'protein.faa' #

# Per-genome defaults
GENETIC_CODE = '11' # default is bacterial (11)
GENE_CALLER = 'phanotate' # default is annotation of phage, so PHANOTATE is preferred gene caller; if bac, could be 'consensus', 'genemark', 'glimmer', or 'prodigal'
GENOME_TYPE = 'phage' # default is phage; could be 'bacterium'
NAME = 'unknown' # user provided
CONTIG_NAME = 'unknown' # user provided: temporary, finished genomes/single contig only for now
SPECIES = 'unknown' # user provided

# gene callers
GENEMARKS_CALLS_DEFAULT = False # Requires license
PRODIGAL_CALLS_DEFAULT = False
GLIMMER_CALLS_DEFAULT = False
PHANOTATE_CALLS_DEFAULT = False
GENEMARKS_PATH = '' # Available via license
GLIMMER_PATH = '' # Can install using Conda
PRODIGAL_PATH = '' # Can install using Conda
PHANOTATE_PATH = ''

#blast parameters
MAX_BLAST_HIT_COUNT = 100 # maximum number of hits to capture (user should specify far fewer than max)
MIN_BLASTP_IDENTITY = 5 # default; sets a lower limit based on value at which a structure model can provide information
MAX_BLASTP_HIT_COUNT = 100 # default; sets an upper limit; user's value should typically be well below this
MAX_BLASTN_HIT_COUNT = 100 # default; sets an upper limit

#blast databases to be used for search
NCBI_VIRUS_BLAST_DEFAULT = False
NCBI_VIRUS_PROTEIN_BLAST_DEFAULT = False
KEGG_VIRUS_BLAST_DEFAULT = False # Requires license
NR_BLAST_DEFAULT = False # Large data set; blast run takes time
REFSEQ_PROTEIN_BLAST_DEFAULT = False # Large data set; blast run takes time
PHANTOME_BLAST_DEFAULT = False
PVOGS_BLAST_DEFAULT = False
UNIPARC_BLAST_DEFAULT = False # Keep turned 'off' for now; not yet in service
REFSEQ_GENE_BLAST_DEFAULT = False
SWISSPROT_BLAST_DEFAULT = False
UNIPROT_BLAST_DEFAULT = False # not yet in service
PFAM_BLAST_DEFAULT = False # not yet in service

#hmm programs
HMM_PROGRAM_DEFAULT = 'jackhmmer' # This is the only hmm program currently supported

#hmm databases to be used for search
NCBI_VIRUS_HMM_DEFAULT = False # not yet in service
NCBI_VIRUS_PROTEIN_HMM_DEFAULT = False # not yet in service
KEGG_VIRUS_HMM_DEFAULT = False # Requires license
NR_HMM_DEFAULT = False # Large data set; hmm run takes time
REFSEQ_PROTEIN_HMM_DEFAULT = False # Large data set; hmm run takes time
PHANTOME_HMM_DEFAULT = False # not yet in service
PVOGS_HMM_DEFAULT = False #
UNIPARC_HMM_DEFAULT = False # not yet in service
REFSEQ_GENE_HMM_DEFAULT = False # not yet in service
SWISSPROT_HMM_DEFAULT = False # not yet in service
UNIPROT_HMM_DEFAULT = False # not yet in service
PFAM_HMM_DEFAULT = False # not yet in service
PFAM_HMM_DEAFULT = PFAM_HMM_DEFAULT # original (misspelled) name, retained so any existing reference keeps working

#other
PSAT_ANNOTATION_DEFAULT = False # Requires LLNL processing
PSAT = False
PSAT_FILE = ""

# ENVIRONMENT VARIABLES
# It is most convenient to locate the supporting software codes and databases in the above-indicated subdirectories.
# However, if any of your supporting databases or softwares reside elsewhere, then explicit locations will need to
# be filled in in the multiPhate.config file. This will likely be the case for large databases that you may already
# have on your compute cluster (e.g, NR), and for software packages, such as EMBOSS or gene finders that you may
# already have installed on your system. Parameters that differ from defaults will be re-assigned based on information
# provided in the users' multiPhate.config file.

# The *_DIR_DEFAULT values already begin with BASE_DIR_DEFAULT, so they are used as-is;
# prepending BASE_DIR_DEFAULT a second time would produce a doubled path.
PIPELINE_INPUT_DIR = PIPELINE_INPUT_DIR_DEFAULT # Default
PIPELINE_OUTPUT_DIR = PIPELINE_OUTPUT_DIR_DEFAULT # Default
PHATE_BASE_DIR = BASE_DIR_DEFAULT
EMBOSS_CODE = "" # Modify this for the version you have
EMBOSS_PHATE_HOME = SOFTWARE_DIR_DEFAULT + EMBOSS_CODE # if installed in SOFTWARE_DIR, else enter actual location

# Export the base locations to the environment
os.environ["BASE_DIR"] = BASE_DIR_DEFAULT
os.environ["DATABASE_DIR"] = DATABASE_DIR_DEFAULT
os.environ["SOFTWARE_DIR"] = SOFTWARE_DIR_DEFAULT
os.environ["PIPELINE_INPUT_DIR"] = PIPELINE_INPUT_DIR
os.environ["PIPELINE_OUTPUT_DIR"] = PIPELINE_OUTPUT_DIR
os.environ["PHATE_BASE_DIR"] = PHATE_BASE_DIR
os.environ["EMBOSS_PHATE_HOME"] = EMBOSS_PHATE_HOME
os.environ["PIPELINE_DIR"] = BASE_DIR_DEFAULT
os.environ["PSAT_OUT_DIR"] = BASE_DIR_DEFAULT
# Data sets: default database locations, all rooted at DATABASE_DIR_DEFAULT.
# Each *_BASE_DIR entry is a folder; the matching *_HOME entry is that folder plus the
# database file (or database root name) inside it.
os.environ.update({
    "KEGG_VIRUS_BASE_DIR":           DATABASE_DIR_DEFAULT + "KEGG/",
    "KEGG_VIRUS_BLAST_HOME":         DATABASE_DIR_DEFAULT + "KEGG/" + "T40000.pep",
    "NCBI_VIRUS_BASE_DIR":           DATABASE_DIR_DEFAULT + "NCBI/",
    "NCBI_VIRUS_BLAST_HOME":         DATABASE_DIR_DEFAULT + "NCBI/" + "Virus_Genome/" + "viral.1.1.genomic.fna",
    "NCBI_VIRUS_PROTEIN_BLAST_HOME": DATABASE_DIR_DEFAULT + "NCBI/" + "Virus_Protein/" + "viral.protein.faa",
    "NCBI_TAXON_DIR":                DATABASE_DIR_DEFAULT + "NCBI/" + "Virus_Genome/",
    "PHANTOME_BASE_DIR":             DATABASE_DIR_DEFAULT + "Phantome/",
    "PHANTOME_BLAST_HOME":           DATABASE_DIR_DEFAULT + "Phantome/" + "Phantome_Phage_genes.faa",
    "PVOGS_BASE_DIR":                DATABASE_DIR_DEFAULT + "pVOGs/",
    "PVOGS_BLAST_HOME":              DATABASE_DIR_DEFAULT + "pVOGs/" + "pVOGs.faa",
    "UNIPARC_BASE_DIR":              DATABASE_DIR_DEFAULT + "UniParc/",                          # Uniparc not yet in service
    "UNIPARC_VIRUS_BLAST_HOME":      DATABASE_DIR_DEFAULT + "UniParc/" + "uniparc_active.fasta", #*** ???
    "NR_BLAST_BASE_DIR":             DATABASE_DIR_DEFAULT + "NR/",
    "NR_BLAST_HOME":                 DATABASE_DIR_DEFAULT + "NR/" + "nr",
    "REFSEQ_PROTEIN_BASE_DIR":       DATABASE_DIR_DEFAULT + "Refseq/Protein/",
    "REFSEQ_PROTEIN_BLAST_HOME":     DATABASE_DIR_DEFAULT + "Refseq/Protein/" + "refseq_protein",
    "REFSEQ_GENE_BASE_DIR":          DATABASE_DIR_DEFAULT + "Refseq/Gene/",
    "REFSEQ_GENE_BLAST_HOME":        DATABASE_DIR_DEFAULT + "Refseq/Gene/" + "refseqgene",
    "SWISSPROT_BASE_DIR":            DATABASE_DIR_DEFAULT + "Swissprot/",
    "SWISSPROT_BLAST_HOME":          DATABASE_DIR_DEFAULT + "Swissprot/" + "swissprot",
    "UNIPROT_BASE_DIR":              DATABASE_DIR_DEFAULT + "Uniprot/",                          # not yet in service
    "UNIPROT_BLAST_HOME":            DATABASE_DIR_DEFAULT + "Uniprot/" + "uniprot",
    "PFAM_BASE_DIR":                 DATABASE_DIR_DEFAULT + "Pfam/",                             # not yet in service
    "PFAM_BLAST_HOME":               DATABASE_DIR_DEFAULT + "Pfam/" + "pfam",
})

# Gene calling and other external codes. An empty string means "global", i.e. installed via conda.
# Examples of explicit local installs:
#   PRODIGAL_PATH   = SOFTWARE_DIR_DEFAULT + "prodigal.v2_50/"
#   GLIMMER_PATH    = SOFTWARE_DIR_DEFAULT + "glimmer3.02/bin/"
#   tRNAscanSE_HOME = "/Users/myName/tRNAscanDir/trnascan-se-2.0/bin/tRNAscan-SE"
#   (PHANOTATE_PATH may instead be set to "" if PHANOTATE was installed via conda)
os.environ.update({
    "PRODIGAL_PATH":   "",
    "GLIMMER_PATH":    "",
    "GENEMARKS_PATH":  SOFTWARE_DIR_DEFAULT + "GeneMarkS/genemark_suite_linux_64/gmsuite/",
    "PHANOTATE_PATH":  SOFTWARE_DIR_DEFAULT + "PHANOTATE/PHANOTATE-master/",
    "CGC_PATH":        BASE_DIR_DEFAULT + "CompareCalls/",
    "tRNAscanSE_HOME": "",
})

# Blast. BLAST_HOME "" means global (conda); use blast+, not legacy blast.
# Example of an explicit local install: SOFTWARE_DIR_DEFAULT + "/ncbi-blast-2.7.1+/bin/"
# Numeric limits are converted to strings because environment values must be strings.
os.environ.update({
    "BLAST_HOME":               "",
    "MIN_BLASTP_IDENTITY":      str(MIN_BLASTP_IDENTITY),
    "MAX_BLASTP_HIT_COUNT":     str(MAX_BLASTP_HIT_COUNT),
    "MAX_BLASTN_HIT_COUNT":     str(MAX_BLASTN_HIT_COUNT),
    "BLASTP_IDENTITY_DEFAULT":  str(BLASTP_IDENTITY_DEFAULT),
    "BLASTP_HIT_COUNT_DEFAULT": str(BLASTP_HIT_COUNT_DEFAULT),
    "BLASTN_HIT_COUNT_DEFAULT": str(BLASTN_HIT_COUNT_DEFAULT),
})

# HMM
os.environ.update({"HMM_HOME": ""})

# Global control: verbosity and error capture
os.environ.update({
    "CLEAN_RAW_DATA": CLEAN_RAW_DATA_DEFAULT,
    "PHATE_WARNINGS": PHATE_WARNINGS_DEFAULT,   # Print warnings and errors to standard out
    "PHATE_MESSAGES": PHATE_MESSAGES_DEFAULT,   # Print helpful messages (may be verbose)
    "PHATE_PROGRESS": PHATE_PROGRESS_DEFAULT,   # Print each step in processing a genome
    "PHATE_ERR":      PHATE_ERR,                # Capture standard errors to files on linux/mac machine
    "PHATE_OUT":      PHATE_OUT,                # NOTE(review): presumably captures standard output to files -- confirm
    "CGC_WARNINGS":   CGC_WARNINGS_DEFAULT,
    "CGC_MESSAGES":   CGC_MESSAGES_DEFAULT,
    "CGC_PROGRESS":   CGC_PROGRESS_DEFAULT,
})
# Constants: names of this program and of its configuration files
CODE_BASE = "multiPhate"
CODE = CODE_BASE + ".py"
CONFIG_FILE = "multiPhate.config" # by default, but user should name their own, ending in ".config"
SAMPLE_CONFIG_FILE = "sample_" + CONFIG_FILE # Sample config file; user should copy, then modify.

# HELP STRINGS
# Printed in response to the command-line keywords help / input / usage / detail.
HELP_STRING = (
    f"This code, {CODE}, runs the phage annotation pipeline (phate_runPipeline.py) over multiple genomes. "
    "The configuration file input to this code specifies a list of genomes to be processed and the parameters for pipeline execution. "
    "The pipeline performs 1) gene calling by 4 gene callers (PHANOTATE, GeneMarkS, Glimmer3, and Prodigal), "
    "followed by identification of closest phage genome by means of blast against an NCBI-phage database, "
    "and sequence-based functional annotation by means of blastp against several peptide databases "
    "(NR, NCBI virus protein, KEGG-virus, Phantome, pVOGs, Swissprot, Refseq protein), "
    "and HMM search against the pVOG database. \n"
    f"Type: python {CODE} usage - for more information about constructing the command line.\n"
    f"Type: python {CODE} detail - for more information about how this code can be run.\n"
)
INPUT_STRING = (
    "The input files and other parameters for running this code are specified in a configuration file, "
    "which is provided as the only input parameter. "
    f"See sample configuration file ({SAMPLE_CONFIG_FILE}) for details on how to customize your configuration file.\n"
)
USAGE_STRING = f"Usage: python {CODE} {CONFIG_FILE}\n"
# The parameter is named translate_only here because that is the key the config parser
# recognizes (pattern "translate_only='(.*)'"); the text previously said "translation_only".
DETAIL_STRING = (
    "Currently the PSAT module is run separately as a web service. "
    "In order to incorporate PSAT output into your annotations, you should first run this pipeline "
    'specifying "translate_only" in the configuration file. '
    "Then, use the generated peptide/protein fasta file as input for PSAT processing. "
    "Once you have the PSAT output, save it to the pipeline input directory, and re-run this pipeline, "
    "specifying that translate_only is false.\n"
)
##### PATTERNS #####
# Compiled once here and applied to each config-file line (and to the command-line argument).
# All patterns are raw strings so that regex escapes such as \s, \d, \w and \. reach the re
# module verbatim instead of being treated as (invalid) string-literal escape sequences.
# Key/value patterns capture whatever lies between the single quotes in  key='value'.

# Locations
p_phateDir = re.compile(r"phate_dir='(.*)'")
p_databaseDir = re.compile(r"database_dir='(.*)'")
p_softwareDir = re.compile(r"software_dir='(.*)'")

# General
p_comment = re.compile(r"^#")   # line begins with '#'
p_blank = re.compile(r"^$")     # completely empty line
p_help = re.compile(r"help")
p_input = re.compile(r"input")
p_usage = re.compile(r"usage")
p_detail = re.compile(r"detail")
p_config = re.compile(r"config")
p_outputSubdir = re.compile(r"output_subdir='(.*)'")
p_genomeFile = re.compile(r"genome_file='(.*)'")
p_genomeType = re.compile(r"genome_type='(.*)'")
p_name = re.compile(r"name='(.*)'")
p_contig = re.compile(r"contig='(.*)'") #*** For now, finished genome, single contig only
p_species = re.compile(r"species='(.*)'")

# Genome information
p_genomeList = re.compile(r"Genome\sList") # "Genome List"; note: the match IS case-sensitive (no re.IGNORECASE flag)
p_genomeNumber = re.compile(r"Genome\s+(\d+)") # genome number
p_root = re.compile(r"([\w\d_-]+)\.fasta") # captures the root name of the fasta file (e.g., takes 'P2' from P2.fasta)
p_end = re.compile(r"END")

# Gene calling
p_geneCaller = re.compile(r"gene_caller='(.*)'")
p_genemarksCalls = re.compile(r"genemarks_calls='(.*)'")
p_glimmerCalls = re.compile(r"glimmer_calls='(.*)'")
p_prodigalCalls = re.compile(r"prodigal_calls='(.*)'")
p_phanotateCalls = re.compile(r"phanotate_calls='(.*)'")
p_geneticCode = re.compile(r"genetic_code='(\d+)'")
p_translateOnly = re.compile(r"translate_only='(.*)'")

# Blast
p_blastpIdentity = re.compile(r"blast_identity='(\d+)'") #*** For now; but should distinguish between blastn/blastp
p_blastpHitCount = re.compile(r"blastp_hit_count='(\d+)'")
p_blastnHitCount = re.compile(r"blastn_hit_count='(\d+)'")
p_ncbiVirusBlast = re.compile(r"ncbi_virus_blast='(.*)'")
p_ncbiVirusProteinBlast = re.compile(r"ncbi_virus_protein_blast='(.*)'")
p_keggVirusBlast = re.compile(r"kegg_virus_blast='(.*)'")
p_nrBlast = re.compile(r"nr_blast='(.*)'")
p_refseqProteinBlast = re.compile(r"refseq_protein_blast='(.*)'")
p_refseqGeneBlast = re.compile(r"refseq_gene_blast='(.*)'") # (was defined twice with the same pattern; one definition kept)
p_swissprotBlast = re.compile(r"swissprot_blast='(.*)'")
p_phantomeBlast = re.compile(r"phantome_blast='(.*)'")
p_pvogsBlast = re.compile(r"pvogs_blast='(.*)'")
p_uniparcBlast = re.compile(r"uniparc_blast='(.*)'")
p_uniprotBlast = re.compile(r"uniprot_blast='(.*)'")

# HMM
p_hmmProgram = re.compile(r"hmm_program='(.*)'")
p_ncbiVirusHmm = re.compile(r"ncbi_virus_hmm='(.*)'")
p_ncbiVirusProteinHmm = re.compile(r"ncbi_virus_protein_hmm='(.*)'")
p_keggVirusHmm = re.compile(r"kegg_virus_hmm='(.*)'")
p_phantomeHmm = re.compile(r"phantome_hmm='(.*)'")
p_pvogsHmm = re.compile(r"pvogs_hmm='(.*)'")
p_swissprotHmm = re.compile(r"swissprot_hmm='(.*)'")
p_refseqProteinHmm = re.compile(r"refseq_protein_hmm='(.*)'")
p_refseqGeneHmm = re.compile(r"refseq_gene_hmm='(.*)'")
p_uniparcHmm = re.compile(r"uniparc_hmm='(.*)'")
p_uniprotHmm = re.compile(r"uniprot_hmm='(.*)'")
p_nrHmm = re.compile(r"nr_hmm='(.*)'")

# Dependent Code Locations
p_blastPlusHome = re.compile(r"blast_plus_home='(.*)'")
p_embossHome = re.compile(r"emboss_home='(.*)'")
p_tRNAscanSEhome = re.compile(r"tRNAscanSE_home='(.*)'")
p_glimmerHome = re.compile(r"glimmer_home='(.*)'")
p_prodigalHome = re.compile(r"prodigal_home='(.*)'")
p_phanotateHome = re.compile(r"phanotate_home='(.*)'")
p_genemarkHome = re.compile(r"genemark_home='(.*)'")

# Database Locations
p_ncbiVirusDatabase = re.compile(r"ncbi_virus_database='(.*)'")
p_refseqGeneDatabase = re.compile(r"refseq_gene_database='(.*)'")
p_ncbiVirusProteinDatabase = re.compile(r"ncbi_virus_protein_database='(.*)'")
p_keggVirusDatabase = re.compile(r"kegg_virus_database='(.*)'")
p_phantomeDatabase = re.compile(r"phantome_database='(.*)'")
p_pvogsDatabase = re.compile(r"pvogs_database='(.*)'")
p_swissprotDatabase = re.compile(r"swissprot_database='(.*)'")
p_refseqProteinDatabase = re.compile(r"refseq_protein_database='(.*)'")
p_nrDatabase = re.compile(r"nr_database='(.*)'")

# Verbosity
p_phateWarnings = re.compile(r"phate_warnings='(.*)'")
p_phateMessages = re.compile(r"phate_messages='(.*)'")
p_phateProgress = re.compile(r"phate_progress='(.*)'")
p_cgcWarnings = re.compile(r"cgc_warnings='(.*)'")
p_cgcMessages = re.compile(r"cgc_messages='(.*)'")
p_cgcProgress = re.compile(r"cgc_progress='(.*)'")
p_cleanRawData = re.compile(r"clean_raw_data='(.*)'")

# PSAT
p_psatAnnotation = re.compile(r"psat_annotation='(.*)'")
p_psatFile = re.compile(r"psat_file='(.*)'")
##### GET INPUT PARAMETERS #####

# Open log file. Under HPC no log is written and LOG is never created, which is why every
# use of LOG is guarded by "if not HPC".
logfile = BASE_DIR_DEFAULT + CODE_BASE + ".log"
if not HPC:
    LOG = open(logfile,'w')
    LOG.write("%s%s\n" % ("Begin log file ",datetime.datetime.now()))

# Exactly one command-line argument is expected: the name of a configuration file, or one of
# the help keywords (input | usage | detail); anything else prints the general help string.
if len(sys.argv) != 2:
    print(HELP_STRING)
    if not HPC:
        # Timestamp via datetime, as for every other log entry; the shell 'date' pipe used
        # previously was written to the log as an object repr rather than as a date.
        LOG.write("%s%s%s%s\n" % ("Incorrect number of input parameters: ", len(sys.argv), ". End log ",datetime.datetime.now()))
        LOG.close()
    sys.exit(0)
else:
    # An argument containing the substring "config" is taken to be the configuration file
    match_config = re.search(p_config,sys.argv[1])
    if match_config:
        configFile = sys.argv[1]
        if not HPC:
            LOG.write("%s%s\n" % ("Config file is ",configFile))
    else:
        # Otherwise treat the argument as a (case-insensitive) request for help text
        match_input = re.search(p_input, sys.argv[1].lower())
        match_usage = re.search(p_usage, sys.argv[1].lower())
        match_detail = re.search(p_detail, sys.argv[1].lower())
        if match_input:
            print(INPUT_STRING)
        elif match_usage:
            print(USAGE_STRING)
        elif match_detail:
            print(DETAIL_STRING)
        else:
            print(HELP_STRING)
        if not HPC:
            LOG.write("%s%s\n" % ("A help string was provided to user; End log ",datetime.datetime.now()))
            LOG.close()
        sys.exit(0)

# Open and check input file. CONFIG is deliberately left open: it is read further below.
fileError = False
try:
    CONFIG = open(configFile,"r")
except IOError as e:
    fileError = True
    print(e)
if fileError:
    print("multiPhate says, ERROR: Check your config file.")
    print(HELP_STRING)
    if not HPC:
        LOG.write("%s%s\n" % ("A help string was provided to user; End log ",datetime.datetime.now()))
        LOG.close()
    sys.exit(0)
##### Read input parameters from configuration file

# Start every run-time setting at its built-in default. Setting any of these in the config
# file is optional.

# Genome identification and gene-calling basics
geneticCode, geneCaller, genomeType = GENETIC_CODE, GENE_CALLER, GENOME_TYPE
name, contigName, species = NAME, CONTIG_NAME, SPECIES

# Blast thresholds
blastpIdentity, blastpHitCount = BLASTP_IDENTITY_DEFAULT, BLASTP_HIT_COUNT_DEFAULT
blastnHitCount = BLASTN_HIT_COUNT_DEFAULT

# Blast database switches
ncbiVirusBlast, ncbiVirusProteinBlast = NCBI_VIRUS_BLAST_DEFAULT, NCBI_VIRUS_PROTEIN_BLAST_DEFAULT
keggVirusBlast, nrBlast = KEGG_VIRUS_BLAST_DEFAULT, NR_BLAST_DEFAULT
refseqProteinBlast, refseqGeneBlast = REFSEQ_PROTEIN_BLAST_DEFAULT, REFSEQ_GENE_BLAST_DEFAULT
phantomeBlast, pvogsBlast = PHANTOME_BLAST_DEFAULT, PVOGS_BLAST_DEFAULT
uniparcBlast, uniprotBlast = UNIPARC_BLAST_DEFAULT, UNIPROT_BLAST_DEFAULT
swissprotBlast = SWISSPROT_BLAST_DEFAULT

# HMM program and HMM database switches
hmmProgram = HMM_PROGRAM_DEFAULT
ncbiVirusHmm, ncbiVirusProteinHmm = NCBI_VIRUS_HMM_DEFAULT, NCBI_VIRUS_PROTEIN_HMM_DEFAULT
keggVirusHmm, nrHmm = KEGG_VIRUS_HMM_DEFAULT, NR_HMM_DEFAULT
refseqGeneHmm, refseqProteinHmm = REFSEQ_GENE_HMM_DEFAULT, REFSEQ_PROTEIN_HMM_DEFAULT
phantomeHmm, pvogsHmm = PHANTOME_HMM_DEFAULT, PVOGS_HMM_DEFAULT
uniparcHmm, uniprotHmm = UNIPARC_HMM_DEFAULT, UNIPROT_HMM_DEFAULT
swissprotHmm = SWISSPROT_HMM_DEFAULT

# Gene-caller switches
genemarksCalls, prodigalCalls = GENEMARKS_CALLS_DEFAULT, PRODIGAL_CALLS_DEFAULT
glimmerCalls, phanotateCalls = GLIMMER_CALLS_DEFAULT, PHANOTATE_CALLS_DEFAULT

# PSAT
psatAnnotation = PSAT_ANNOTATION_DEFAULT
# Capture user's configured values
FIRST_GENOME = True     # True until the first "Genome N" line has been seen
DATA_ITEMS_NUM = 6      # Number of data items expected per genome (one per key of genomeDataDict)

# Template record for one genome; it is deep-copied for each genome and must itself stay blank
genomeDataDict = {
    "genomeNumber" : "",
    "genomeFile" : "",
    "genomeType" : "",
    "genomeSpecies" : "",
    "genomeName" : "",
    "outputSubdir" : "",
    }
genomeList = [] # List of genomeData objects
genomeNumber = "" # Number of current genome; assigned by user; could be a string
genomeDataItems = 0 # Number of data items collected for current genome; should be DATA_ITEMS_NUM
BEGIN_GENOME_LIST = False

# Start with an independent copy rather than an alias of the template: a genome data item
# encountered before the first "Genome N" line would otherwise be written into genomeDataDict
# itself and then be inherited by every record subsequently copied from it.
nextGenomeData = copy.deepcopy(genomeDataDict)
cLines = CONFIG.read().splitlines()
for cLine in cLines:
match_comment = re.search(p_comment,cLine)
match_blank = re.search(p_blank,cLine)
match_genomeList = re.search(p_genomeList,cLine)
match_genomeNumber = re.search(p_genomeNumber,cLine)
match_genomeFile = re.search(p_genomeFile,cLine)
match_genomeType = re.search(p_genomeType,cLine)
match_species = re.search(p_species,cLine)
match_name = re.search(p_name,cLine)
match_outputSubdir = re.search(p_outputSubdir,cLine)
match_end = re.search(p_end,cLine)
match_psatFile = re.search(p_psatFile,cLine)
match_geneticCode = re.search(p_geneticCode,cLine)
match_translateOnly = re.search(p_translateOnly,cLine)
match_geneCaller = re.search(p_geneCaller,cLine)
match_contig = re.search(p_contig,cLine)
match_blastpIdentity = re.search(p_blastpIdentity,cLine)
match_blastpHitCount = re.search(p_blastpHitCount,cLine)
match_blastnHitCount = re.search(p_blastnHitCount,cLine)
match_ncbiVirusBlast = re.search(p_ncbiVirusBlast,cLine)
match_ncbiVirusProteinBlast = re.search(p_ncbiVirusProteinBlast,cLine)
match_keggVirusBlast = re.search(p_keggVirusBlast,cLine)
match_nrBlast = re.search(p_nrBlast,cLine)
match_refseqProteinBlast = re.search(p_refseqProteinBlast,cLine)
match_refseqGeneBlast = re.search(p_refseqGeneBlast,cLine)
match_phantomeBlast = re.search(p_phantomeBlast,cLine)
match_pvogsBlast = re.search(p_pvogsBlast,cLine)
match_uniparcBlast = re.search(p_uniparcBlast,cLine)
match_uniprotBlast = re.search(p_uniprotBlast,cLine)
match_swissprotBlast = re.search(p_swissprotBlast,cLine)
match_refseqGeneBlast = re.search(p_refseqGeneBlast,cLine)
match_hmmProgram = re.search(p_hmmProgram,cLine)
match_ncbiVirusHmm = re.search(p_ncbiVirusHmm,cLine)
match_ncbiVirusProteinHmm = re.search(p_ncbiVirusProteinHmm,cLine)
match_keggVirusHmm = re.search(p_keggVirusHmm,cLine)
match_nrHmm = re.search(p_nrHmm,cLine)
match_refseqProteinHmm = re.search(p_refseqProteinHmm,cLine)
match_phantomeHmm = re.search(p_phantomeHmm,cLine)
match_pvogsHmm = re.search(p_pvogsHmm,cLine)
match_uniparcHmm = re.search(p_uniparcHmm,cLine)
match_uniprotHmm = re.search(p_uniprotHmm,cLine)
match_swissprotHmm = re.search(p_swissprotHmm,cLine)
match_refseqGeneHmm = re.search(p_refseqGeneHmm,cLine)
match_genemarksCalls = re.search(p_genemarksCalls,cLine)
match_prodigalCalls = re.search(p_prodigalCalls,cLine)
match_glimmerCalls = re.search(p_glimmerCalls,cLine)
match_phanotateCalls = re.search(p_phanotateCalls,cLine)
match_psatAnnotation = re.search(p_psatAnnotation,cLine)
match_phateDir = re.search(p_phateDir,cLine)
match_databaseDir = re.search(p_databaseDir,cLine)
match_softwareDir = re.search(p_softwareDir,cLine)
match_blastPlusHome = re.search(p_blastPlusHome,cLine)
match_embossHome = re.search(p_embossHome,cLine)
match_tRNAscanSEhome = re.search(p_tRNAscanSEhome,cLine)
match_glimmerHome = re.search(p_glimmerHome,cLine)
match_prodigalHome = re.search(p_prodigalHome,cLine)
match_phanotateHome = re.search(p_phanotateHome,cLine)
match_genemarkHome = re.search(p_genemarkHome,cLine)
match_ncbiVirusDatabase = re.search(p_ncbiVirusDatabase,cLine)
match_refseqGeneDatabase = re.search(p_refseqGeneDatabase,cLine)
match_ncbiVirusProteinDatabase = re.search(p_ncbiVirusProteinDatabase,cLine)
match_keggVirusDatabase = re.search(p_keggVirusDatabase,cLine)
match_phantomeDatabase = re.search(p_phantomeDatabase,cLine)
match_pvogsDatabase = re.search(p_pvogsDatabase,cLine)
match_swissprotDatabase = re.search(p_swissprotDatabase,cLine)
match_refseqProteinDatabase = re.search(p_refseqProteinDatabase,cLine)
match_nrDatabase = re.search(p_nrDatabase,cLine)
match_phateWarnings = re.search(p_phateWarnings,cLine)
match_phateMessages = re.search(p_phateMessages,cLine)
match_phateProgress = re.search(p_phateProgress,cLine)
match_cgcWarnings = re.search(p_cgcWarnings,cLine)
match_cgcMessages = re.search(p_cgcMessages,cLine)
match_cgcProgress = re.search(p_cgcProgress,cLine)
match_cleanRawData = re.search(p_cleanRawData,cLine)
##### Capture list of genomes and associated data #####
if (match_comment or match_blank):
pass
elif match_genomeList: # Capture all genomes listed; for each, gather genome file, genome type, species, name, output subdir
pass
elif match_genomeNumber: # The next genome's data
# First, record previous genome data, if this is not the first genome
if not FIRST_GENOME:
#if not HPC:
# LOG.write("%s\n" % ("Appending a genome data set"))
if genomeDataItems != DATA_ITEMS_NUM: # If record appears incomplete, flag a problem
if not HPC:
LOG.write("%s%s\n" % ("WARNING: check config file for possible incorrect data items: ", genomeDataItems))
genomeList.append(nextGenomeData)
# Next, begin collecting next genome's data
if not HPC:
LOG.write("%s%s\n" % ("Creating a new genome data set for ", match_genomeNumber.group(0)))
genomeDataItems = 0
nextGenomeData = copy.deepcopy(genomeDataDict) # make new genome data object
genomeNumber = match_genomeNumber.group(1)
nextGenomeData["genomeNumber"] = genomeNumber
genomeDataItems += 1
FIRST_GENOME = False
elif match_genomeFile:
value = match_genomeFile.group(1)
if value != '':
GENOME_FILE = value
else:
GENOME_FILE = "unknown"
if PHATE_WARNINGS:
print("multiPhate says, WARNING: GENOME_FILE is", GENOME_FILE)
nextGenomeData["genomeFile"] = GENOME_FILE
#if not HPC:
# LOG.write("%s%s\n" % ("GENOME_FILE is ",GENOME_FILE))
genomeDataItems += 1
elif match_genomeType:
value = match_genomeType.group(1)
if value.lower() == 'phage' or value.lower() == 'bacteriophage':
nextGenomeData["genomeType"] = 'phage'
elif value.lower() == 'virus' or value.lower() == 'viral' or value.lower() == 'viridae':
nextGenomeData["genomeType"] = 'virus'
elif value.lower() == 'bacteria' or value.lower() == 'bacterium' or value.lower() == 'bacterial':
nextGenomeData["genomeType"] = 'bacterium'
else:
nextGenomeData["genomeType"] = 'other'
#if not HPC:
# LOG.write("%s%s\n" % ("genome type is ",nextGenomeData["genomeType"]))
genomeDataItems += 1
elif match_species:
species = match_species.group(1)
nextGenomeData["genomeSpecies"] = species
#if not HPC:
# LOG.write("%s%s\n" % ("Species is ",species))
genomeDataItems += 1
elif match_name:
name = match_name.group(1)
nextGenomeData["genomeName"] = name
#if not HPC:
# LOG.write("%s%s\n" % ("genome name is ",name))
genomeDataItems += 1
elif match_phateDir:
if match_phateDir.group(1) != '':
os.environ["BASE_DIR"] = match_phateDir.group(1)
elif match_databaseDir:
if match_databaseDir.group(1) != '':
os.environ["DATABASE_DIR"] = match_databaseDir.group(1)
elif match_softwareDir:
if match_softwareDir.group(1) != '':
os.environ["SOFTWARE_DIR"] = match_softwareDir.group(1)
elif match_outputSubdir: #*** Note that if the output dir is not read before subdir; depends on user not changing order in config - Clean this up!
value = match_outputSubdir.group(1)
if value != '':
value = value.rstrip('/') # be sure that name of subdir ends in exactly one '/' (user might omit the slash)
value = value + '/'
nextGenomeData["outputSubdir"] = value
else:
nextGenomeData["outputSubdir"] = "unknown"
if PHATE_WARNINGS:
print("multiPhate says, WARNING: pipeline output subdir is ", "unknown")
genomeDataItems += 1
elif match_end: # List of genomes complete; record last genome's data
if genomeDataItems != DATA_ITEMS_NUM:
if not HPC:
LOG.write("%s%s%s%s%s%s\n" % ("multiPhate says, WARNING: check config file for possible incorrect data items: ", genomeDataItems, " for genome ",nextGenomeData["genomeName"],' ',nextGenomeData["genomeNumber"]))
genomeList.append(nextGenomeData)
if not HPC:
LOG.write("%s%s\n" % ("END: Length of genomeList is ",len(genomeList)))
##### Other processing #####
elif match_geneticCode:
value = match_geneticCode.group(1)
if value != '':
geneticCode = value
elif match_translateOnly:
value = match_translateOnly.group(1)
if value.lower() == 'yes' or value.lower() == 'true' or value.lower() == 'on':
TRANSLATE_ONLY = True
elif value.lower() == 'no' or value.lower() == 'false' or value.lower() == 'off' or value == '':
TRANSLATE_ONLY = False
else:
if PHATE_WARNINGS == 'True':
print("multiPhate says, WARNING: Invalid string following translate_only parameter in config file:", value)
if not HPC:
LOGFILE.write("%s%s\n" % ("Invalid string following translate_only parameter in config file: ", value))
elif match_contig:
value = match_contig.group(1)
contigName = value
##### Gene Calls #####
elif match_geneCaller:
value = match_geneCaller.group(1)
if value.lower() == 'phanotate':
geneCaller = 'phanotate'
CONSENSUS_CALLS_FILE = 'phanotate.cgc'
elif value.lower() == 'consensus':
geneCaller = 'consensus'
CONSENSUS_CALLS_FILE = 'consensus.cgc'
elif value.lower() == 'genemarks' or value.lower() == 'genemark':
geneCaller = 'genemarks'
CONSENSUS_CALLS_FILE = 'genemark.cgc'
elif value.lower() == 'glimmer2':
geneCaller = 'glimmer2'
CONSENSUS_CALLS_FILE = 'glimmer.cgc'
elif value.lower() == 'glimmer3' or value.lower() == 'glimmer':
geneCaller = 'glimmer3'
CONSENSUS_CALLS_FILE = 'glimmer.cgc'
elif value.lower() == 'prodigal':
geneCaller = 'prodigal'
CONSENSUS_CALLS_FILE = 'prodigal.cgc'
elif value.lower() == 'rast':
geneCaller = 'rast'
CONSENSUS_CALLS_FILE = 'rast.cgc'
elif match_genemarksCalls:
value = match_genemarksCalls.group(1)
if value.lower() == 'true' or value.lower() == 'yes' or value.lower() == 'on':
genemarksCalls = True
else:
genemarksCalls = False
elif match_prodigalCalls:
value = match_prodigalCalls.group(1)
if value.lower() == 'true' or value.lower() == 'yes' or value.lower() == 'on':
prodigalCalls = True
else:
prodigalCalls = False
elif match_glimmerCalls:
value = match_glimmerCalls.group(1)
if value.lower() == 'true' or value.lower() == 'yes' or value.lower() == 'on':
glimmerCalls = True
else:
glimmerCalls = False
elif match_phanotateCalls:
value = match_phanotateCalls.group(1)
if value.lower() == 'true' or value.lower() == 'yes' or value.lower() == 'on':
phanotateCalls = True
else:
phanotateCalls = False
##### BLAST #####
elif match_blastpIdentity:
value = match_blastpIdentity.group(1)
if int(value) > int(MIN_BLASTP_IDENTITY) and int(value) <= 100:
blastpIdentity = value
elif match_blastpHitCount:
value = match_blastpHitCount.group(1)
if int(value) > 0 and int(value) <= int(MAX_BLASTP_HIT_COUNT):
blastpHitCount = value
elif match_blastnHitCount:
value = match_blastnHitCount.group(1)
if int(value) > 0 and int(value) <= int(MAX_BLASTN_HIT_COUNT):
blastnHitCount = value
elif match_ncbiVirusBlast:
value = match_ncbiVirusBlast.group(1)
if value.lower() == 'true' or value.lower() == 'yes' or value.lower() == 'on':
ncbiVirusBlast = True
else:
ncbiVirusBlast = False
elif match_ncbiVirusProteinBlast:
value = match_ncbiVirusProteinBlast.group(1)
if value.lower() == 'true' or value.lower() == 'yes' or value.lower() == 'on':
ncbiVirusProteinBlast = True
else:
ncbiVirusProteinBlast = False
elif match_keggVirusBlast:
value = match_keggVirusBlast.group(1)
if value.lower() == 'true' or value.lower() == 'yes' or value.lower() == 'on':
keggVirusBlast = True
else:
keggVirusBlast = False
elif match_nrBlast:
value = match_nrBlast.group(1)
if value.lower() == 'true' or value.lower() == 'yes' or value.lower() == 'on':
nrBlast = True
else:
nrBlast = False
elif match_refseqProteinBlast:
value = match_refseqProteinBlast.group(1)
if value.lower() == 'true' or value.lower() == 'yes' or value.lower() == 'on':
refseqProteinBlast = True
else:
refseqProteinBlast = False
elif match_refseqGeneBlast:
value = match_refseqGeneBlast.group(1)
if value.lower() == 'true' or value.lower() == 'yes' or value.lower() == 'on':
refseqGeneBlast = True
else:
refseqGeneBlast = False
elif match_phantomeBlast:
value = match_phantomeBlast.group(1)
if value.lower() == 'true' or value.lower() == 'yes' or value.lower() == 'on':
phantomeBlast = True
else:
phantomeBlast = False
elif match_pvogsBlast:
value = match_pvogsBlast.group(1)
if value.lower() == 'true' or value.lower() == 'yes' or value.lower() == 'on':
pvogsBlast = True
else:
pvogsBlast = False
elif match_uniparcBlast:
value = match_uniparcBlast.group(1).lower()
if value == 'true' or value == 'yes' or value == 'on':
uniparcBlast = True
else:
uniparcBlast = False
elif match_uniprotBlast:
value = match_uniprotBlast.group(1).lower()
if value == 'true' or value == 'yes' or value == 'on':
uniprotBlast = True
else:
uniprotBlast = False
elif match_swissprotBlast:
value = match_swissprotBlast.group(1).lower()
if value == 'true' or value == 'yes' or value == 'on':
swissprotBlast = True
else:
swissprotBlast = False
##### HMM #####
elif match_hmmProgram:
value = match_hmmProgram.group(1)
if value.lower() == 'jackhmmer':
hmmProgram = 'jackhmmer'
else:
if PHATE_WARNINGS == 'True':
print("multiPhate says, WARNING: currenly only jackhmmer hmm search is supported; running jackhmmer")
hmmProgram = HMM_PROGRAM_DEFAULT
elif match_ncbiVirusHmm:
value = match_ncbiVirusHmm.group(1)
if value.lower() == 'true' or value.lower() == 'yes' or value.lower() == 'on':
ncbiVirusHmm = True
else:
ncbiVirusHmm = False
elif match_ncbiVirusProteinHmm:
value = match_ncbiVirusProteinHmm.group(1)
if value.lower() == 'true' or value.lower() == 'yes' or value.lower() == 'on':
ncbiVirusProteinHmm = True
else:
ncbiVirusProteinHmm = False
elif match_keggVirusHmm:
value = match_keggVirusHmm.group(1)
if value.lower() == 'true' or value.lower() == 'yes' or value.lower() == 'on':
keggVirusHmm = True
else:
keggVirusHmm = False
elif match_nrHmm:
value = match_nrHmm.group(1)
if value.lower() == 'true' or value.lower() == 'yes' or value.lower() == 'on':
nrHmm = True
else:
nrHmm = False
elif match_refseqProteinHmm:
value = match_refseqProteinHmm.group(1)
if value.lower() == 'true' or value.lower() == 'yes' or value.lower() == 'on':
refseqProteinHmm = True
else:
refseqProteinHmm = False
elif match_phantomeHmm:
value = match_phantomeHmm.group(1)
if value.lower() == 'true' or value.lower() == 'yes' or value.lower() == 'on':
phantomeHmm = True
else:
phantomeHmm = False
elif match_pvogsHmm:
value = match_pvogsHmm.group(1)
if value.lower() == 'true' or value.lower() == 'yes' or value.lower() == 'on':
pvogsHmm = True
else:
pvogsHmm = False
elif match_uniparcHmm:
value = match_uniparcHmm.group(1).lower()
if value == 'true' or value == 'yes' or value == 'on':
uniparcHmm = True
else:
uniparcHmm = False
elif match_uniprotHmm:
value = match_uniprotHmm.group(1).lower()
if value == 'true' or value == 'yes' or value == 'on':
uniprotHmm = True
else:
uniprotHmm = False
elif match_swissprotHmm:
value = match_swissprotHmm.group(1).lower()
if value == 'true' or value == 'yes' or value == 'on':
swissprotHmm = True
else:
swissprotHmm = False
elif match_refseqGeneHmm:
pass # Not yet in service
##### PSAT #####
elif match_psatAnnotation:
value = match_psatAnnotation.group(1)
if value.lower() == 'true' or value.lower() == 'yes' or value.lower() == 'on':
psatAnnotation = True
else:
psatAnnotation = False
elif match_psatFile:
value = match_psatFile.group(1)
if value != '':
PSAT_FILE = value
PSAT = True # Yes, a psat file will be passed to subordinate code
else:
PSAT_FILE = ""
if PHATE_WARNINGS:
print("multiPhate says, WARNING: PSAT_FILE is ",PSAT_FILE)
#if not HPC:
# LOG.write("%s%s\n" % ("PSAT_FILE is ",PSAT_FILE))
##### DEPENDENT CODE LOCATIONS #####
elif match_blastPlusHome:
if match_blastPlusHome.group(1) != '':
os.environ["BLAST_HOME"] = match_blastPlusHome.group(1)
elif match_embossHome:
if match_embossHome.group(1) != '':
os.environ["EMBOSS_PHATE_HOME"] = match_embossHome.group(1)
elif match_tRNAscanSEhome:
if match_tRNAscanSEhome.group(1) != '':
os.environ["tRNAscanSE_HOME"] = match_tRNAscanSEhome.group(1)
elif match_glimmerHome:
if match_glimmerHome.group(1) != '':
os.environ["GLIMMER_PATH"] = match_glimmerHome.group(1)
elif match_prodigalHome:
if match_prodigalHome.group(1) != '':
os.environ["PRODIGAL_PATH"] = match_prodigalHome.group(1)
elif match_phanotateHome:
if match_phanotateHome.group(1) != '':
os.environ["PHANOTATE_PATH"] = match_phanotateHome.group(1)
elif match_genemarkHome:
if match_genemarkHome.group(1) != '':
os.environ["GENEMARKS_PATH"] = match_genemarkHome.group(1)
##### DATABASE LOCATIONS #####
elif match_ncbiVirusDatabase:
if match_ncbiVirusDatabase.group(1) != '':
os.environ["NCBI_VIRUS_BLAST_HOME"] = match_ncbiVirusDatabase.group(1)
elif match_refseqGeneDatabase:
if match_refseqGeneDatabase.group(1) != '':
os.environ["REFSEQ_GENE_BLAST_HOME"] = match_refseqGeneDatabase.group(1)
elif match_ncbiVirusProteinDatabase:
if match_ncbiVirusProteinDatabase.group(1) != '':
os.environ["NCBI_VIRUS_PROTEIN_BLAST_HOME"] = match_ncbiVirusProteinDatabase.group(1)
elif match_keggVirusDatabase:
if match_keggVirusDatabase.group(1) != '':
os.environ["KEGG_VIRUS_BLAST_HOME"] = match_keggVirusDatabase.group(1)
os.environ["KEGG_VIRUS_BASE_DIR"] = os.path.dirname(match_keggVirusDatabase.group(1)) + '/'
elif match_phantomeDatabase:
if match_phantomeDatabase.group(1) != '':
os.environ["PHANTOME_BLAST_HOME"] = match_phantomeDatabase.group(1)
os.environ["PHANTOME_BASE_DIR"] = os.path.dirname(match_phantomeDatabase.group(1)) + '/'
elif match_pvogsDatabase:
if match_pvogsDatabase.group(1) != '':
os.environ["PVOGS_BLAST_HOME"] = match_pvogsDatabase.group(1)
os.environ["PVOGS_HMM_HOME"] = match_pvogsDatabase.group(1)
elif match_swissprotDatabase:
if match_swissprotDatabase.group(1) != '':
os.environ["SWISSPROT_BLAST_HOME"] = match_swissprotDatabase.group(1)
elif match_refseqProteinDatabase:
if match_refseqProteinDatabase.group(1) != '':
os.environ["REFSEQ_PROTEIN_BLAST_HOME"] = match_refseqProteinDatabase.group(1)
elif match_nrDatabase:
if match_nrDatabase.group(1) != '':
os.environ["NR_BLAST_HOME"] = match_nrDatabase.group(1)
##### VERBOSITY #####
elif match_phateWarnings:
value = match_phateWarnings.group(1)
if value.lower() == 'true' or value.lower() == 'yes' or value.lower() == 'on':
os.environ["PHATE_WARNINGS"] = 'True'
else:
os.environ["PHATE_WARNINGS"] = 'False'
elif match_phateMessages:
value = match_phateMessages.group(1)
if value.lower() == 'true' or value.lower() == 'yes' or value.lower() == 'on':
os.environ["PHATE_MESSAGES"] = 'True'
else:
os.environ["PHATE_MESSAGES"] = 'False'
elif match_phateProgress:
value = match_phateProgress.group(1)
if value.lower() == 'true' or value.lower() == 'yes' or value.lower() == 'on':