-
Notifications
You must be signed in to change notification settings - Fork 7
/
GRAbB.pl
executable file
·2438 lines (2422 loc) · 304 KB
/
GRAbB.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/perl
use strict;
use File::Spec;
use POSIX qw(strftime);
#use autodie;
#======================================HEADER================================================================================================
##############################################################################################################
##############################################################################################################
## ##
##############################################################################################################
##############################################################################################################
## This program let's you selcetively assemble parts of a genome from raw reads. The program works in an ##
## iterative fashon: finds reads that match to bait/reference using exact kmer match, assembles the reads ##
## de novo. ##
## For more information read the documentation or the usage message bellow (also by invoking the program ##
## without any arguments. ##
##############################################################################################################
##############################################################################################################
#======================================USAGE=================================================================================================
#######################################################################################################################
# USAGE #
#######################################################################################################################
# Usage message to be printed for incorect invocation #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
my $usage = "Usage:\n" . #
"$0 --ref <reference file> --reads <read file 1> [<read file 2>] --folder <directory> --prefix <prefix> [options]\n" .#
"\n" . #
"\t--ref <ref.fas>\n" . #
"\t\t\t\tReference file: FASTA format, the id lines must be unique\n" . #
"\t\t\t\t(only the first word is used as an id)\n" . #
"\t--reads <r1.fastq> [<r2.fastq> ...]\n" . #
"\t\t\t\tRead files in fasta or fastq format, may be compressed using\n" . #
"\t\t\t\tgzip. If the reads are paired-end specify separately both the\n" . #
"\t\t\t\tforward and the reverse read files. Otherwise specify only one\n" . #
"\t\t\t\tfile or use '--single' option\n" . #
"\t--folder <folder_name>\n" . #
"\t\t\t\tThe directory where all the output will be saved. If the\n" . #
"\t\t\t\tdirectory is non-empty then the files it contains can be used\n" . #
"\t\t\t\tlike internal files. In this manner previous runs can be continued,\n" . #
"\t\t\t\tmake sure to remove or replace files that would suggest completion\n" . #
"\t--prefix <prefix_of_output>\n" . #
"\t\t\t\tThe prefix for the output files\n" . #
"Additional options:\n" . #
"\t--single\n" . #
"\t\t\t\tTreat reads as unpaired reads even if two read files are specified\n" . #
"\t--bait <bait.fas>\n" . #
"\t\t\t\tA separate bait file can be specified besides the reference file,\n" . #
"\t\t\t\tthis file together with the reference file will be used as first bait.\n" . #
"\t\t\t\tUseful when using special criterion for the assembly, such as homology\n" . #
"\t--min_length=<int>\n" . #
"\t\t\t\tMinimum size required for a contig to be included for completion testing and baiting\n" . #
"\t--type <run_type_string>\n" . #
"\t\t\t\tSpecial critera can be specified for the run\n" . #
"\t\t\t\t\tWhen should the iteration stop?\n" . #
"\t\t\t\t\t\tThe intrinsic criteria are:\n" . #
"\t\t\t\t\t\t\tThere are no new reads\n" . #
"\t\t\t\t\t\t\tThere is no assembly\n" . #
"\t\t\t\t\t\t\tThe bait sequence did not change\n" . #
"\t\t\t\t\t\tExtra criteria:\n" . #
"\t\t\t\t\t\t\tLength of the assembly\n" . #
"\t\t\t\t\t\t\tLength of the longest contig of the assembly\n" . #
"\t\t\t\t\t\t\tN50 value of the assembly\n" . #
"\t\t\t\t\t\t\tReference matches to assembly (uses exonerate)\n" . #
"\t\t\t\t\tOr to have parallel runs (threads)\n" . #
"\t\t\t\t\t\tPossible values:\n" . #
"\t\t\t\t\t\tmulti Activate multi mode (parallel runs)\n" . #
"\t\t\t\t\t\ttotal=<int> Minimum size (bp) of the assembly\n" . #
"\t\t\t\t\t\tlongest=<int> Minimum size (bp) of the longest contig\n" . #
"\t\t\t\t\t\tN50=<int> Minimum N50 value (bp) of the assembly\n" . #
"\t\t\t\t\t\texonerate Matches reference using exonerate.\n" . #
"\t\t\t\t\t\t\t\t(Matched region is saved separately)\n" . #
"\t\t\t\t\t\tdiff Uses the identifier lines from the reference\n" . #
"\t\t\t\t\t\t to get the completion criterion for each thread\n" . #
"\t\t\t\t\t\t (These criteria overwrite the globally defined\n" . #
"\t\t\t\t\t\t criteria. Write \"exhaustive\" in the identifier\n" . #
"\t\t\t\t\t\t line if exhaustive should be used.)\n" . #
"\t\t\t\t\t\t (Need to be used together with multi)\n" . #
"\t\t\t\tMultiple options can be used at the same time, but then they have to be\n" . #
"\t\t\t\ttyped as a concatenated string (or using quotation)\n" . #
"\t\t\t\t(e.g. --type <multiexonerate | multi_exonerate | \"multi exonerate\">)\n" . #
"\t--arg1 \"argument1 argument2 argument3\"\n" . #
"\t\t\t\tArguments needed for the assembler program used for graph/hash generation\n" . #
"\t--arg2 \"argument1 argument2 argument3\"\n" . #
"\t\t\t\tArguments needed for the assembler program used for the assembly\n" . #
"\t--assembler assembler_name\n" . #
"\t\t\t\tSpecify the assembler to be used: edena, velvet, alternative or external\n" . #
"\t--clean\n" . #
"\t\t\t\tRemove some internal files to save disk space.\n" . #
"\t\t\t\tIf specified twice, then only result files are kept, the rest is deleted\n"; #
#######################################################################################################################
#======================================START=UP==============================================================================================
####################################################################################################
####################################################################################################
## VARIABLES ##
####################################################################################################
####################################################################################################
# A scalar variable to store the log information before log file is created #
my $p_log; #
# A scalar variable to store the print message before printing it to STDOUT and the log file #
# Didn't use IO:Tee, because it is not part of the basic Perl installation) #
my $print; #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Constant variables #
# Only declared at the start and never changed afterwards #
# These variables have to be adjusted to the environment #
my $bait_cmd = "mirabait"; # The command to invoke the baiting program # !!!!
my $collect_cmd = "seqtk subseq"; # The command to invoke the read collecting program # !!!!
my $edena_cmd = "edena"; # The command to invoke the default assembler program # !!!!
my $velveth_cmd = "velveth"; # The command to invoke velvet graph/hash generator program # !!!!
my $velvetg_cmd = "velvetg"; # The command to invoke velvet assembler program # !!!!
my $alt_1_cmd = ""; # The command to invoke graph/hash generator program # !!!!!! alternative assembler (graph)
my $alt_2_cmd = ""; # The command to invoke assembler program # !!!!!! alternative assembler (assembly)
my $ext_assembler = ""; # A perl script that does the assembly #
my $exonerate_cmd = "exonerate"; # The command to invoke exonerate # !!!!
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# The variables holding file names and / or depend on the command line invocation #
my $ref; # What is the reference file? #
my @reads; # What are the reads files used as input? #
my $format; # What is the file format of the read files? (fasta or fastq) #
my $gzip; # True if the reads are zipped using gzip #
my $single; # If reads are paired FALSE else TRUE #
my $folder; # What is the working directory? #
my $prefix; # What is the prefix for the output? #
my $type = ""; # What is the criteria to stop? If undef then when there are no new reads. #
my $min_len; # What is the minimal length for contigs to be used? #
my @parameters; # What are the parameters for the assembler? #
my $extra_bait; # Was there extra bait specified when launching in multi mode? #
my $clean = 0; # 1 if some files should be deleted, 2 if only results should be kept #
my $log; # The file handle for the log file #
my $bait; # What is the bait file? #
my $old_collection; # The collection file of the previous iteration (first iteration = null file) #
my $new_collection; # The collection file of the current iteration #
my @readpools; # The read files generated by baiting and used by assembly #
my $assembly_file; # The FASTA file containing the assembly #
my @current_asmbls; # The list of assembly files to be modified if min_len is set #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Parse the commandline input and load the variables #
# This also creates the folder and all the files that are based on the input #
# (reference, bait, links to the read files) #
&parse_input(\@ARGV, \$ref, \@reads, \$single, \$folder, \$prefix, \$type, \@parameters, \$p_log, #
\$bait, \$extra_bait, \$clean, \$gzip, \$min_len, \$ext_assembler, \$format); #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Test if all the specified programs are usable #
for ($bait_cmd, $collect_cmd) { #
die "$_ program is not executable\n" unless &which($_); #
} #
# Test if the assembler is usable or die #
if ($parameters[0] eq "alternative") { #
die "$alt_1_cmd program is not executable\n" unless &which($alt_1_cmd); #
die "$alt_2_cmd program is not executable\n" unless &which($alt_2_cmd); #
} elsif ($parameters[0] eq "velvet") { #
die "$velveth_cmd program is not executable\n" unless &which($velveth_cmd); #
die "$velvetg_cmd program is not executable\n" unless &which($velvetg_cmd); #
} elsif ($parameters[0] eq "external") { #
# This will not be checked #
} else { #
die "$edena_cmd program is not executable\n" unless &which($edena_cmd); #
} #
# If exonerate is necessary then test it also #
if ($type =~ /exonerate/) { #
die "$exonerate_cmd program is not executable\n" unless &which($exonerate_cmd); #
} #
####################################################################################################
# Dynamic variables (They are undefined at start) #
my $done; # Is the test criteria met? #
my $round; # The iteration counter #
my $current_folder; # The name of the current round folder #
my $previous_folder;# The name of the previous round folder #
my %multi; # Contains all the threads: keys: thread id; values: status (undef if active) #
# # There are subroutines that operate on this hash #
# # load_multi creates the threads at the start #
# # get_actives returns all the threads in a sorted order #
my %types; # Contains all the threads: keys: thread id; values: criterion type #
####################################################################################################
# Declare some more variables #
# Load the file names that do not depend on the command line input #
$assembly_file = "assembly.fas"; #
$new_collection = "new_collection.list"; #
$old_collection = "old_collection.list"; #
{ # Using a bare block to create local variables that will not interfere later #
# Create old_collection file if it does not exist already #
# Using append to open the file so content is not destroyed #
open my $temp_handle, '>>', "$folder\/old_collection.list"; #
close $temp_handle; #
# For each read file there should be a readpool file used #
# Create a counter #
my $i; #
for (@reads) { #
$i++; #
# Save the relative path of the readpool file into the readpool array #
push @readpools, "readpool$i\.$format"; #
} #
} #
####################################################################################################
#=========================================MAIN=PART==========================================================================================
###############################################################################################
# PREPARATION #
############################################################################################### Working directory
# Move to working directory and create more variables and files # (the specified folder is marked "/")
chdir ($folder); # /
# Open log file for appending, even if it already exists # /
open $log, '>>', "$prefix\.log"; # /
# /
# Print greeting text # /
$print = "\n" . "X" x 90 . "\n"; # /
print {$log} $print; print $print; # /
$print = "GRAbB: Genome Region Assembly by Baiting\nInvocation:\t$0 @ARGV\n"; # /
print {$log} $print; print $print; # /
# Print what belongs into the log # /
print {$log} $p_log; print $p_log; # /
# /
&load_multi(\$log, \$ref, \$bait, \$type, \%multi, \%types); # /
# /
# Detect previous runs to be continued or start a new one # /
{ # /
my @list = glob("Round*"); # /
# /
# Find the "Round" folder with the highest number and continue the run from that cycle # /
my ($highest) = sort {$b <=> $a} map {/^Round(\d+)/; $1} @list; # /
# /
# First step of the loop is incrementing the round variable # /
if ($highest) { # /
$print = "Detected previous run, the folders and files will be used\n"; # /
print {$log} $print; print $print; # /
$round = $highest - 1; # /
} else { # /
$round = 0; # /
} # /
} # /
############################################################################################### /
#################################################################################################### /
# Main loop of the program # /
while (not $done) { # /
$round++; # increment the counter # /
# Save the name of the folder name of the previous round # /
$previous_folder = $current_folder; # /
# Compile the name of the folder of the current round # /
if (length $round > 3) { # /
$current_folder = "Round$round"; # /
} else { # /
$current_folder = "Round" . ("0" x (3 - length $round)) . $round; # /
} # /
# /
# Print a round separator line # /
$print = "-" x 90 . "\n"; # /
print {$log} $print; print $print; # /
# /
# Print time and the current round # /
$current_folder =~ /Round(\d+)/; # /
$print = "" . strftime('%H:%M:%S',localtime) . "\tRound $1\n"; # /
print {$log} $print; print $print; # /
# /
# Create the current round folder if it is not present # /
unless (-d $current_folder) { # /
mkdir $current_folder; # /
} # /
# /
# Move to the current round folder # /
chdir $current_folder; # / => /Round###
# Update the relative paths of the files that are in the main directory # /Round###
# The file names are fixed inside the subroutine # /Round###
# The prefix is passed then the reference file, bait file and old collection variables # /Round###
&update_path(\"../", \$ref, \$bait, \$old_collection);#/ # /Round###
# /Round###
# Check if all the necessary files are in order # /Round###
# Creates a folder called test_reads which will be destroyed if all goes well # /Round###
# The program dies if some error is found # /Round###
&check_files(\$log, \$bait, \$ref, \@reads, \$single, \$gzip); # /Round###
# /Round###
# Find reads and create read(pool) file(s) for the assembly # /Round###
# Stop if there are no new reads (Returns "No new reads") # /Round###
$done = &find_reads(\$log, \$bait, \@reads, \$old_collection, \$new_collection, \@readpools); # /Round###
# If done then break the loop and proceed to the finishing steps # /Round###
if ($done) { # /Round###
chdir ".."; # /Round### => /
last; #==================== Breaks the Main loop
} #
# /Round###
# If clean mode is selected remove the readpool of the previous round # /Round###
# If there was a previous round already # /Round###
# The new readpool files contain all the reads that were in the previous ones # /Round###
if ($clean && $previous_folder) { # /Round###
for (@readpools) { # /Round###
system("rm", "../$previous_folder/$_") if -e "../$previous_folder/$_"; # /Round###
} # /Round###
} # /Round###
# /Round###
# Copy the new collection to the old collection # /Round###
system("cp", $new_collection, $old_collection); # /Round###
# /Round###
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # ## /Round###
# Run a separate analysis for each of the threads # /Round###
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # ## /Round###
# If multi mode is selected then does a separate run for each thread of the active threads # /Round###
# Otherwise, only runs on the single main thread # /Round###
for my $thread (&get_actives(\%multi)) { # /Round###
# Create the current thread folder if it is not present # /Round###
unless (-d $thread) { # /Round###
mkdir $thread; # /Round###
} # /Round###
# /Round###
# Move to the current thread folder # /Round###
chdir $thread; # /Round### => /Round###/thread#
# /Round###/thread#
# Create the local "links" to the necessary files # /Round###/thread#
# Use a separate name for the thread specific files # /Round###/thread#
my ($thread_ref, $thread_bait, $thread_old) = ($ref, $bait, $old_collection); # /Round###/thread#
# /Round###/thread#
# Update the relative paths of the files that are in the main thread directory # /Round###/thread#
# The file names are fixed inside the subroutine # /Round###/thread#
# The prefix is passed then the reference file, bait file and old collection variables # /Round###/thread#
&update_path(\"../../$thread\/", \$thread_ref, \$thread_bait, \$thread_old);#" # /Round###/thread#
# /Round###/thread#
# Create the symlinks to the readpool files of the Round folder so that the same read file # /Round###/thread#
# name can be used for the threads # /Round###/thread#
my $i = 0; # /Round###/thread#
for (@readpools) { # /Round###/thread#
$i++; # /Round###/thread#
unless (-l "../reads$i\.$format") { # /Round###/thread#
symlink(File::Spec->rel2abs("../$_"), "../reads$i\.$format"); # /Round###/thread#
} # /Round###/thread#
} # /Round###/thread#
# /Round###/thread#
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # ## /Round###/thread#
# An inner loop for the individual thread in multi mode with extra bait at startup # /Round###/thread#
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # ## /Round###/thread#
# A variable that shows if another iteration step is needed or not # /Round###/thread#
my $continue = 1; # /Round###/thread#
# A counter for the number of the iteration # /Round###/thread#
my $iteration; # /Round###/thread#
while ($continue > 0) { # /Round###/thread#
# Default is to only have one cycle => basically not a loop # /Round###/thread#
$continue--; # /Round###/thread#
$iteration++; # /Round###/thread#
# /Round###/thread#
# Check if all the necessary files are in order # /Round###/thread#
&check_files(\$log, \$thread_bait, \$thread_ref, \@reads, \$single, \undef); # /Round###/thread#
# /Round###/thread#
# Find reads and create read(pool) file(s) for the assembly # /Round###/thread#
# Stop if there are no new reads (Returns "No new reads") # /Round###/thread#
# Only needed in multi mode # /Round###/thread#
if ($type && $type =~ /multi/) { # /Round###/thread#
# Print thread separator line # /Round###/thread#
$print = "\t\t\t$thread\t" . ("-" x 50) . "\n"; # /Round###/thread#
print {$log} $print; print $print; # /Round###/thread#
# /Round###/thread#
# Find reads for the current thread and update the threads state # /Round###/thread#
$multi{$thread} = &find_reads(\$log, \$thread_bait, \@reads, # /Round###/thread#
\$thread_old, \$new_collection, # /Round###/thread#
\@readpools, \"\t"); #" # /Round###/thread#
# /Round###/thread#
# If no new reads were found then go to the next thread # /Round###/thread#
if ($multi{$thread}) { # /Round###/thread#
# If this is the first Round with multi mode and extra bait, # /Round###/thread#
# Then mark it as active thread # /Round###/thread#
if ($extra_bait) { # /Round###/thread#
# During the previous inner iteration (if there was one) the files created # /Round###/thread#
# were renamed by adding the "old_" prefix. Because there was no new # /Round###/thread#
# information during this inner iteration the old files are renamed ( by # /Round###/thread#
# removing the "old_" prefix) and overwrite the current iteration files. # /Round###/thread#
for (glob("old_*")) { # /Round###/thread#
my $rest = $_; # /Round###/thread#
$rest =~ s/old_//; # /Round###/thread#
system("mv", $_, $rest); # /Round###/thread#
} # /Round###/thread#
# If there were no reads found then stop the thread # /Round###/thread#
$multi{$thread} = undef unless $multi{$thread} eq "No reads"; # /Round###/thread#
} # /Round###/thread#
# Because $continue is zero, there won't be a next inner loop # /Round###/thread#
next; #==================== Breaks the Inner loop
} #
} else { # /Round###/thread#
# IF NOT in multi mode # /Round###/thread#
# THEN create symlinks to the selected reads in the thread folder # /Round###/thread#
# Because the find reads subroutine was skipped # /Round###/thread#
for (@readpools) { # /Round###/thread#
symlink(File::Spec->rel2abs("../$_"), "$_") unless -e $_; # /Round###/thread#
} # /Round###/thread#
# And copy the new collection file into the current working directory # /Round###/thread#
system("cp", "../$new_collection", $new_collection); # /Round###/thread#
} # /Round###/thread#
# /Round###/thread#
# Do assembly according to parameters and create the assembly file # /Round###/thread#
&assemble(\$log, \@readpools, \$single, \@parameters, # /Round###/thread#
\$assembly_file, \"\t", \$format);#" # /Round###/thread#
# /Round###/thread#
# Copy the latest assembly to the thread folder in the main directory # /Round###/thread#
system("cp", $assembly_file, "../../$thread/$assembly_file"); # /Round###/thread#
# /Round###/thread#
if ($clean < 2) { # /Round###/thread#
# Unless hyperclean mode is selected, keep a copy of each assembly file # /Round###/thread#
# Get the next unused serial number for the assembly file # /Round###/thread#
# The first serial number should be 1 # /Round###/thread#
my $next = 1; # /Round###/thread#
while (-e "../../$thread/assembly_$next\.fas") { # /Round###/thread#
$next++; # /Round###/thread#
} # /Round###/thread#
# copy the assembly to the new file # /Round###/thread#
system("cp", "../../$thread/$assembly_file", "../../$thread/assembly_$next\.fas"); # /Round###/thread#
if ($min_len) { # /Round###/thread#
system("cp", "../../$thread/$assembly_file", # /Round###/thread#
"../../$thread/assembly_$next\_all\.fas"); # /Round###/thread#
@current_asmbls = ("../../$thread/$assembly_file", # /Round###/thread#
"../../$thread/assembly_$next\.fas"); # /Round###/thread#
} # /Round###/thread#
} # /Round###/thread#
# /Round###/thread#
# Check if another iteration is needed or not # /Round###/thread#
# There are different types of criteria # /Round###/thread#
# constant # /Round###/thread#
# no new reads => stop (already checked before the assembly) # /Round###/thread#
# bait hasn't changed => stop (returns "Same bait") # /Round###/thread#
# (in the next iteration there would be no new reads)# /Round###/thread#
# assembly file is empty => stop (returns "No assembly") # /Round###/thread#
# optional # /Round###/thread#
# length of the assembly # /Round###/thread#
# "completeness" using homology # /Round###/thread#
# If not done then replace the old bait file with the new bait # /Round###/thread#
if ($type && $type =~ /diff/) { # /Round###/thread#
$multi{$thread} = &test_completion(\$log, \$thread_ref, \$thread_bait, # /Round###/thread#
\$types{$thread}, \$assembly_file, \$min_len, # /Round###/thread#
\"\t", \@current_asmbls);#" # /Round###/thread#
} else { # /Round###/thread#
$multi{$thread} = &test_completion(\$log, \$thread_ref, \$thread_bait, \$type, # /Round###/thread#
\$assembly_file, \$min_len, \"\t",#" # /Round###/thread#
\@current_asmbls); # /Round###/thread#
} # /Round###/thread#
# If homology is used as test there might be a "result file created that holds the # /Round###/thread#
# the sequence matching the reference from the assembly # /Round###/thread#
# Copy this result file to the thread folder # /Round###/thread#
system("cp", "result.fas", "../../$thread/result.fas") if -e "result.fas"; # /Round###/thread#
# /Round###/thread#
# Copy the new collection to the old collection # /Round###/thread#
system("cp", $new_collection, $thread_old); # /Round###/thread#
# /Round###/thread#
# If this is the first round of a multi run with extra bait specified # /Round###/thread#
# Then rename the files in the thread folder by adding the "old_" prefix # /Round###/thread#
if ($extra_bait) { # /Round###/thread#
# If there are files with "old_" prefix in the folder then delete them, # /Round###/thread#
# because all the files are present with updated data # /Round###/thread#
for (glob("old_*")) { # /Round###/thread#
system("rm", $_); # /Round###/thread#
} # /Round###/thread#
# /Round###/thread#
# If the current thread is still active do a new inner iteration # /Round###/thread#
if (not $multi{$thread}){ # /Round###/thread#
# This means there will be a new iteration # /Round###/thread#
$continue++; # /Round###/thread#
# Rename current files by adding the "old_" prefix, # /Round###/thread#
# so that they can be restored later if no new reads are found # /Round###/thread#
for (glob("*")) { # /Round###/thread#
system("mv", $_, "old_$_"); # /Round###/thread#
} # /Round###/thread#
} else { # /Round###/thread#
# If the thread terminated with "Same bait" value after more than one inner # /Round###/thread#
# loop => Then reactivate the thread # /Round###/thread#
if ($multi{$thread} eq "Same bait" || $multi{$thread} eq "No improvement") { # /Round###/thread#
$multi{$thread} = undef if $iteration > 1; # /Round###/thread#
} # /Round###/thread#
} # /Round###/thread#
} # /Round###/thread#
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # ## /Round###/thread#
# The end of the inner loop for the current thread # /Round###/thread#
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # ## /Round###/thread#
} # /Round###/thread#
# /Round###/thread#
# Move back to the Round directory, because the first step of the thread loop is to move # /Round###/thread#
# into the thread directory. # /Round###/thread#
# This way the working directory is the same as it was before the for and while loops # /Round###/thread#
chdir ".."; # /Round###/thread# => /Round###/
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # ## /Round###
# End of the (for) thread loop # /Round###
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # ## /Round###
} # /Round###
# /Round###
# If clean mode is active, delete thread folders from Round folders which are not valuable # /Round###
if ($clean && $previous_folder) { # /Round###
for my $thread (glob("thread_*")) { # /Round###
# Check which threads were active this turn # /Round###
if ($multi{$thread} && $multi{$thread} eq "No new reads") { # /Round###
# If no new reads then there is nothing important in the current folder # /Round###
system("rm", "-r", $thread); # /Round###
} else { # /Round###
# The information content of the previous folder is a subset of the current's # /Round###
system("rm", "-r", "../$previous_folder/$thread"); # /Round###
} # /Round###
} # /Round###
} # /Round###
# /Round###
# Move back to the parent (main) directory # /Round###
chdir ".."; # /Round### => /
# /
# Update the relative paths of the files that are in the main directory # /
# The file names are fixed inside the subroutine # /
# The prefix is passed then the reference file, bait file and old collection variables # /
&update_path(\"", \$ref, \$bait, \$old_collection);#" # /
# /
# If this was the first round of a multi run with extra bait specified then make the # /
# $extra_bait variable undefined, because no more inner iterations will be need in the future# /
if ($extra_bait) { # /
$extra_bait = undef; # /
} # /
# /
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # ## /
# Prepare files for the next round if it is needed # /
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # ## /
# Stop the main loop if there are no active threads left # /
last if 0 == scalar( &get_actives(\%multi) ); #==================== Breaks the Main loop
# /
# Merge the individual bait files of the threads into a single bait file # /
# Collect all the baits needed to be merged # /
my @baits = map{"$_/bait.fas"} &get_actives(\%multi); # /
# Merge the collected files and rename the entries so each one is uniq # /
`cat @baits | perl -ne 'if (/^>/) {\$n++; print ">\$n\n"} else {print}' >$bait`; # /
} # /
####################################################################################################
# Print separator line at the end of the main loop
$print = "=" x 90 . "\n";
print {$log} $print; print $print;
###############################################################################################################
# Final steps #
###############################################################################################################
# Print the status of the threads and create the output files and do the necessary cleaning steps # Final steps
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Final steps
# Iterate through each thread # Final steps
for my $thread (sort{ my ($a1) = $a =~ /thread_(\d+)/; # Final steps
my ($b1) = $b =~ /thread_(\d+)/; # Final steps
$a1 <=> $b1 } keys %multi) { # Final steps
# Final steps
# If the thread was still active when the main loop stopped the reason of the stop should be copied # Final steps
# to the thread status # Final steps
$multi{$thread} = "$done" unless $multi{$thread}; # Final steps
# Possible values # Final steps
# Basic values # Final steps
# No assembly => There was some error with the assembly (could be that there were to few reads) # Final steps
# Same bait => The assembly was identical to the current bait # Final steps
# No new reads => There were no new reads found, thus the assembly would be the same # Final steps
# Exonerate values # Final steps
# No improvement => The exonerate analysis had the same result as during the previous round # Final steps
# Multiple contigs => The reference was covered by multiple contigs, the matched region won't be # Final steps
# saved to a separate file # Final steps
# Found => The reference was completely matched to a single contig of the assembly # Final steps
# The matched region will be saved to a separate file # Final steps
# Length value # Final steps
# Reached: => The specified minimal length was attained # Final steps
# Final steps
# Print the threads final status # Final steps
$print = "$thread:\t$multi{$thread}\n"; # Final steps
print {$log} $print; print $print; # Final steps
# Final steps
# If there is a non-empty assembly file in the thread folder than copy it to the main folder # Final steps
if (-e "$thread/$assembly_file" && not -z "$thread/$assembly_file") { # Final steps
system("cp", "$thread/$assembly_file", "$prefix\_assembly_$thread\.fas"); # Final steps
# Also create a final assembly file in the thread folder # Final steps
system("cp", "$thread/$assembly_file", "$thread/final_assembly.fas"); # Final steps
# Final steps
# Print the location of the assembly output file # Final steps
$print = "\tThe final assembly is $folder/$prefix\_assembly_$thread\.fas\n"; # Final steps
print {$log} $print; print $print; # Final steps
} else { # Final steps
# Print that there is no assembly file for the thread # Final steps
$print = "\tThere is no assembly for this thread\n"; # Final steps
print {$log} $print; print $print; # Final steps
} # Final steps
# Final steps
# If there is a result file in the thread folder then copy it to the main folder as an output file # Final steps
if (-e "$thread/result.fas") { # Final steps
system("cp", "$thread/result.fas", "$prefix\_result_$thread\.fas"); # Final steps
# Print the name of the output file # Final steps
$print = "\tMatched region is saved to $folder/$prefix\_result_$thread\.fas\n"; # Final steps
print {$log} $print; print $print; # Final steps
} # Final steps
# Final steps
# If the reference was matched to multiple contigs then copy the exonerate output file to the main folder # Final steps
if ($multi{$thread} eq "Multiple contigs") { # Final steps
# Worn the user that the matched region was not recovered and should be done manually # Final steps
$print = "\tTry to recover the homologous region from the assembly\n"; # Final steps
print {$log} $print; print $print; # Final steps
# Final steps
# Copy the exonerate output to the main folder and print its location # Final steps
system("cp", "$thread/$ref\.exonerate", "$prefix\_exonerate_$thread\.txt"); # Final steps
$print = "\tYou can find the exonerate output in $folder/$prefix\_exonerate_$thread\.txt\n"; # Final steps
print {$log} $print; print $print; # Final steps
} # Final steps
} # Final steps
# Final steps
# Delete every file except for the result files if hyperclean mode is selected # Final steps
system("rm", "-r", grep{$_ !~ /$prefix.*?\.((txt)|(fas)|(log))/} glob("*")) if $clean > 1; # Final steps
# Final steps
# Print separator to mark the end of the run # Final steps
$print = "=" x 90 . "\n" . "X" x 90 . "\n"; # Final steps
print {$log} $print; print $print; # Final steps
# Final steps
# Print citation information # Final steps
$print = "\n\nIf you use GRAbB.pl or any of the helper programs in your publications, please, " . # Final steps
"cite the paper describing the program.\n\n" . # Final steps
"Brankovics B, Zhang H, van Diepeningen AD, van der Lee TAJ, Waalwijk C\n" . # Final steps
"G Sybren de Hoog. (2016) GRAbB: Selective Assembly of Genomic Regions,\n" . # Final steps
"a New Niche for Genomic Research. PLoS Comput Biol 12(6):e1004753.\n" . # Final steps
"doi: 10.1371/journal.pcbi.1004753\n"; # Final steps
print {$log} $print; print $print; # Final steps
###############################################################################################################
#=================================================SUBROUTINES================================================================================
###############################################################################################################
###############################################################################################################
## SUBROUTINES ##
###############################################################################################################
###############################################################################################################
# Defining variables #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Defining variables
sub parse_input{ # Defining variables
# This subroutine reads the command line input: loads the variables, creates the folder and the files # Defining variables
# Every input is a reference to a variable # Defining variables
# Outputs: no return value # Defining variables
# all the referenced variables maybe be modified, but the command line arguments # Defining variables
# Created files: # Defining variables
# 1) the folder if needed # Defining variables
# 2) reference file # Defining variables
# 3) bait file # Defining variables
# 4) extra bait file if needed # Defining variables
# 5) symbolic links to the read files # Defining variables
# Inputs: # Defining variables
my ($in_ref, # 1) (array) of arguments (@ARGV) # Defining variables
$reference_ref, # 2) (scalar) to hold the location of the reference file # Defining variables
$reads_ref, # 3) (array) to hold the location of the read files # Defining variables
$single_ref, # 4) (scalar) which is TRUE if the reads are not paired # Defining variables
$folder_ref, # 5) (scalar) for the relative path of the folder to be used as main directory # Defining variables
$prefix_ref, # 6) (scalar) to hold the prefix for the output files # Defining variables
$type_ref, # 7) (scalar) to hold the type of the run (defined by the --type argument) # Defining variables
$parameters_ref,# 8) (array) to hold all the arguments that need to be handed to the assembler # Defining variables
# [0] name of the assembler program # Defining variables
# [1] additional parameters for the graph/hash generation # Defining variables
# [2] additional parameters for the assembly generation # Defining variables
$pre_log_ref, # 9) (scalar) to hold all the test for the log file, until it is created # Defining variables
$bait_ref, # 10) (scalar) to hold the location of the bait file # Defining variables
$extra_bait_ref,# 11) (scalar) TRUE if there is was a bait defined in multi mode # Defining variables
$clean_ref, # 12) (scalar) defined if clean mode is selected, # Defining variables
# it is 2 if hyperclean mode should be used # Defining variables
$gzip_ref, # 13) (scalar) TRUE if the read files are compressed # Defining variables
$min_len_ref, # 14) (scalar) the minimal size for a contig to be kept # Defining variables
$external_ref, # 15) (scalar) the perl script to be used for the assembly # Defining variables
$format_ref, # 16) (scalar) the format of the read files # Defining variables
) = @_; # Defining variables
# Defining variables
# Create a variable to hold the reference to the value that is waiting for a value # Defining variables
my $current; # Defining variables
# True if there should be a value (argument without "--" prefix) # Defining variables
my $wait; # Defining variables
# True if there could be more than one value # Defining variables
my $array; # Defining variables
# Variables to hold the information for the parameter array elements before they are stored in the array # Defining variables
my ($asmblr, $arg1, $arg2); # @$parameters_ref = ($asmblr, $arg1, $arg2) # Defining variables
# Defining variables
########################################################################################################### Defining variables
# Save the command line arguments into variables # Defining variables
########################################################################################################### Defining variables
# Loop through the command line arguments and save the values to the correct place # Defining variables
for (@$in_ref) { # Defining variables: get params
if (/^--/) { # Defining variables: get params
# If one of the arguments is waiting for a value then there is a problem with the input # Defining variables: get params
die "Parameter value is missing\n" if $wait; # Defining variables: get params
# Defining variables: get params
if (/--ref/) { # Defining variables: get params
$current = $reference_ref; # Defining variables: get params
$array = undef; # Defining variables: get params
$wait++; # Defining variables: get params
# Defining variables: get params
} elsif (/--reads/) { # Defining variables: get params
$current = $reads_ref; # Defining variables: get params
$array++; # Defining variables: get params
$wait++; # Defining variables: get params
# Defining variables: get params
} elsif (/--folder/) { # Defining variables: get params
$current = $folder_ref; # Defining variables: get params
$array = undef; # Defining variables: get params
$wait++; # Defining variables: get params
# Defining variables: get params
} elsif (/--prefix/) { # Defining variables: get params
$current = $prefix_ref; # Defining variables: get params
$array = undef; # Defining variables: get params
$wait++; # Defining variables: get params
# Defining variables: get params
} elsif (/--type/) { # Defining variables: get params
$current = $type_ref; # Defining variables: get params
$array = undef; # Defining variables: get params
$wait++; # Defining variables: get params
# Defining variables: get params
} elsif (/--arg1/) { # Defining variables: get params
$current = \$arg1; # Defining variables: get params
$array = undef; # Defining variables: get params
$wait++; # Defining variables: get params
# Defining variables: get params
} elsif (/--arg2/) { # Defining variables: get params
$current = \$arg2; # Defining variables: get params
$array = undef; # Defining variables: get params
$wait++; # Defining variables: get params
# Defining variables: get params
} elsif (/--assembler/) { # Defining variables: get params
$current = \$asmblr; # Defining variables: get params
$array = undef; # Defining variables: get params
$wait++; # Defining variables: get params
# Defining variables: get params
} elsif (/--single/) { # Defining variables: get params
$$single_ref++; # Defining variables: get params
$current = undef; # Defining variables: get params
$array = undef; # Defining variables: get params
# Defining variables: get params
} elsif (/--clean/) { # Defining variables: get params
$$clean_ref++; # Defining variables: get params
$current = undef; # Defining variables: get params
$array = undef; # Defining variables: get params
# Defining variables: get params
} elsif (/--bait/) { # Defining variables: get params
$current = $bait_ref; # Defining variables: get params
$array = undef; # Defining variables: get params
$$extra_bait_ref++; # Defining variables: get params
$wait++; # Defining variables: get params
# Defining variables: get params
} elsif (/--min_length=(\d+)/) { # Defining variables: get params
$$min_len_ref = $1; # Defining variables: get params
$current = undef; # Defining variables: get params
$array = undef; # Defining variables: get params
# Defining variables: get params
} else { # Defining variables: get params
# The given argument is not recognized so die # Defining variables: get params
die "Unknown parameter encountered \"$_\"\n"; # Defining variables: get params
} # Defining variables: get params
} else { # Defining variables: get params
# There was no argument initialized that is waiting for a value # Defining variables: get params
unless ($current) { die "Missing parameter value\n"} # Defining variables: get params
# Defining variables: get params
# Store away the value according to its type (array or scalar) # Defining variables: get params
if ($array) { # Defining variables: get params
push @$current, $_; # Defining variables: get params
} else { # Defining variables: get params
$$current = $_; # Defining variables: get params
} # Defining variables: get params
# Defining variables: get params
# Except for array variables there is no more value needed # Defining variables: get params
$wait = undef; # Defining variables: get params
# if the assembler is external than wait for the next word # Defining variables: get params
if ($_ eq "external") { # Defining variables: get params
$current = $external_ref; # Defining variables: get params
$wait++; # Defining variables: get params
} # Defining variables: get params
} # Defining variables: get params
########################################################################################################### Defining variables: get params
# End of the for loop of the command line arguments # Defining variables: get params
########################################################################################################### Defining variables: get params
} # Defining variables: get params
# Defining variables
# Check if all the necessary variables are defined and the files are reachable # Defining variables: essentials
# reference file # Defining variables: essentials
unless ($$reference_ref) { # Defining variables: essentials
print "\n$usage\n"; # Defining variables: essentials
die "Necessary argument is missing: no reference file was defined\n"; # Defining variables: essentials
} # Defining variables: essentials
# Check whether the specified file exists # Defining variables: essentials
if (-e $$reference_ref) { # Defining variables: essentials
# Print the name of the reference file to the log and STDOUT # Defining variables: essentials
$$pre_log_ref .= "The reference file is $$reference_ref\n"; # Defining variables: essentials
} else { # Defining variables: essentials
die "The specified reference file ($$reference_ref) is missing\n"; # Defining variables: essentials
} # Defining variables: essentials
# Defining variables: essentials
# read files # Defining variables: essentials
if (scalar @$reads_ref == 0) { # Defining variables: essentials
print "\n$usage\n"; # Defining variables: essentials
die "Necessary argument is missing: no read file was defined\n"; # Defining variables: essentials
} # Defining variables: essentials
# Check whether the specified files exist # Defining variables: essentials
for (@$reads_ref) { # Defining variables: essentials
if (-e $_) { # Defining variables: essentials
# Print the name of the read file to the log and STDOUT # Defining variables: essentials
$$pre_log_ref .= "The read file is $_\n"; # Defining variables: essentials
} else { # Defining variables: essentials
die "The specified read file ($_) is missing\n"; # Defining variables: essentials
} # Defining variables: essentials
} # Defining variables: essentials
# Defining variables: essentials
# folder # Defining variables: essentials
unless ($$folder_ref) { # Defining variables: essentials
print "\n$usage\n"; # Defining variables: essentials
die "Necessary argument is missing: no folder was defined\n"; # Defining variables: essentials
} # Defining variables: essentials
# Defining variables: essentials
# prefix # Defining variables: essentials
unless ($$prefix_ref) { # Defining variables: essentials
print "\n$usage\n"; # Defining variables: essentials
die "Necessary argument is missing: no prefix was defined\n"; # Defining variables: essentials
} # Defining variables: essentials
# Defining variables
# Extra bait is only important to know if in multi mode, because then there should be an inner loop # Defining variables: extra bait
if ($$type_ref) { # Defining variables: extra bait
if ($$type_ref !~ /multi/) { # Defining variables: extra bait
$$extra_bait_ref = undef; # Defining variables: extra bait
} # Defining variables: extra bait
} else { # Defining variables: extra bait
$$extra_bait_ref = undef; # Defining variables: extra bait
} # Defining variables: extra bait
# Defining variables
# If an extra bait file was defined then combine it with the reference to use as bait # Defining variables: Bait
# ELSE use reference as bait # Defining variables: Bait
if ($$bait_ref) { # Defining variables: Bait
# Check if the file really exists or not # Defining variables: Bait
if (-e $$bait_ref) { # Defining variables: Bait
# Print the name of the file to the log and STDOUT # Defining variables: Bait
$$pre_log_ref .= "The first bait is a concatenation of $$bait_ref and $$reference_ref\n"; # Defining variables: Bait
} else { # Defining variables: Bait
die "The specified bait file ($$bait_ref) is missing\n"; # Defining variables: Bait
} # Defining variables: Bait
} else { # Defining variables: Bait
# Use the reference file as bait # Defining variables: Bait
$$bait_ref = $$reference_ref; # Defining variables: Bait
$$pre_log_ref .= "The first bait file is $$reference_ref\n"; # Defining variables: Bait
} # Defining variables
# Defining variables
# Check if single or paired mode should be used # Defining variables: Single mode
# Paired mode if there are two read files and --single was not declared # Defining variables: Single mode
if (scalar @$reads_ref == 2 && not $$single_ref) { # Defining variables: Single mode
$$pre_log_ref .= "Using paired-end mode\n"; # Defining variables: Single mode
} else { # Defining variables: Single mode
$$single_ref++; # Defining variables: Single mode
$$pre_log_ref .= "Using single-end mode\n"; # Defining variables: Single mode
} # Defining variables
# Defining variables
# Sort out the assembler parameters # Defining variables: assembler
# If no assembler was specified then use edena # Defining variables: assembler
$asmblr = "edena" unless $asmblr; # Defining variables: assembler
# Check if the assembler specified is an acceptable value or not # Defining variables: assembler
unless ($asmblr eq "edena" || $asmblr eq "velvet" || $asmblr eq "alternative" || $asmblr eq "external") { # Defining variables: assembler
die "Unknown assembler specified, possible values are: " . # Defining variables: assembler
"'edena', 'velvet', 'alternative' or 'external'\n"; # Defining variables: assembler
} # Defining variables: assembler
# Defining variables: assembler
# Put the assembler values in the parameter array # Defining variables: assembler
@$parameters_ref = ($asmblr, $arg1, $arg2); # Defining variables: assembler
# Defining variables: assembler
if ($asmblr eq "external") { # Defining variables: assembler
die "Could not find external assembler script '$$external_ref'\n" unless -s $$external_ref; # Defining variables: assembler
} # Defining variables
# Defining variables
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # Defining variables: Folder
# Check the folder # Defining variables: Folder
# IF it is not present it needs to be created # Defining variables: Folder
# ELSE check if empty # Defining variables: Folder
# ELSE ask if you want 1) to use the files in there # Defining variables: Folder
# 2) to delete the content # Defining variables: Folder
# 3) to quit # Defining variables: Folder
# Remove trailing "/" form the folder name # Defining variables: Folder
$$folder_ref =~ s/\/$//; # Defining variables: Folder
if (-d $$folder_ref) { # Defining variables: Folder
# Get the content of the folder # Defining variables: Folder
my @list = glob("$$folder_ref/*"); # Defining variables: Folder
# If there is something in it then ask what to do # Defining variables: Folder
if (@list) { # Defining variables: Folder
print "Specified folder ($$folder_ref) is not empty.\n"; # Defining variables: Folder
print "Do you wish to continue?[y/n] "; # Defining variables: Folder
# Get the response of the user # Defining variables: Folder
my $response = <STDIN>; # Defining variables: Folder
# If it is yes then continue else quit the program # Defining variables: Folder
if ($response =~ /^y/i) { # Defining variables: Folder
# Ask whether the content should be reused or should it be deleted # Defining variables: Folder
# If the content is not overlapping with the file naming of the program then it won't # Defining variables: Folder
# interfere with the run. Beware: running hyperclean mode will delete the old content of # Defining variables: Folder
# the folder at the end of the run # Defining variables: Folder
print "Do you want the program to use the content of the folder?\n(Already existing files", # Defining variables: Folder
" will not be overwritten, this includes reference, bait and read files. ", # Defining variables: Folder
"If you wish to replace them delete those files, then run the command again) [y/n] "; # Defining variables: Folder
# Get the user's response # Defining variables: Folder
$response = <STDIN>; # Defining variables: Folder
if ($response =~ /^y/i) { # Defining variables: Folder
# Print to the log that the selected folder is chosen and that it is not-empty # Defining variables: Folder
$$pre_log_ref .= "The working directory is $$folder_ref (the folder is not empty)\n"; # Defining variables: Folder
} else { # Defining variables: Folder
# Ask if the content should be deleted or not # Defining variables: Folder
print "Do you wish to delete the content of the folder?[y/n] "; # Defining variables: Folder
$response = <STDIN>; # Defining variables: Folder
if ($response =~ /^y/i) { # Defining variables: Folder
# Just to make sure ask it for a second time # Defining variables: Folder
print "Are you sure?[y/n] "; # Defining variables: Folder
$response = <STDIN>; # Defining variables: Folder
if ($response =~ /^y/i) { # Defining variables: Folder
# Delete the folder and its content # Defining variables: Folder
system("rm", "-r", $$folder_ref); # Defining variables: Folder
# Create the folder anew # Defining variables: Folder
mkdir $$folder_ref; # Defining variables: Folder
# Print to the log that this folder is used and it was emptied # Defining variables: Folder
$$pre_log_ref .= "The working directory is $$folder_ref (content deleted)\n"; # Defining variables: Folder
} else { # Defining variables: Folder
die "The program is terminated\n"; # Defining variables: Folder
} # Defining variables: Folder
} else { # Defining variables: Folder
die "The program is terminated\n"; # Defining variables: Folder
} # Defining variables: Folder
} # Defining variables: Folder
} else { # Defining variables: Folder
die "The program is terminated\n"; # Defining variables: Folder
} # Defining variables: Folder
} # Defining variables: Folder
} else { # Defining variables: Folder
# Create folder and print the name to the log # Defining variables: Folder
mkdir $$folder_ref; # Defining variables: Folder
$$pre_log_ref .= "The working directory is $$folder_ref\n"; # Defining variables: Folder
} # Defining variables: Folder
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # Defining variables: Folder
# Create a copy of the perl script of the assembler if it was specified # Defining variables: Copy assembler
if ($asmblr eq "external") { # Defining variables: Copy assembler
system("cp", $$external_ref, "$$folder_ref\/assembler.pl"); # Defining variables: Copy assembler
} # Defining variables: Copy assembler
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # Defining variables: Create
# Create copies and symlinks inside the working directory to be used by the program # Defining variables: Create
# Create the reference file if it isn't present already # Defining variables: Create ref
unless (-e "$$folder_ref\/reference.fas") { # Defining variables: Create ref
# Copy the specified reference into the main folder # Defining variables: Create ref
system("cp", $$reference_ref, "$$folder_ref\/reference.fas"); # Defining variables: Create ref
# Print that the reference was copied into the folder # Defining variables: Create ref
$$pre_log_ref .= "\tReference file is copied to $$folder_ref\/reference.fas\n"; # Defining variables: Create ref
} else { # Defining variables: Create ref
# Print that the previous reference file (found in the folder) is used and not the one defined # Defining variables: Create ref
# at invocation # Defining variables: Create ref
$$pre_log_ref .= "\tPreviously defined reference file is used $$folder_ref\/reference.fas\n"; # Defining variables: Create ref
} # Defining variables: Create ref
# Defining variables: Create
# Create the bait file if it is not present yet # Defining variables: Create bait
unless (-e "$$folder_ref\/bait.fas") { # Defining variables: Create bait
# Combine the reference the bait file (if specified) together to create the first bait file # Defining variables: Create bait
my @baits = ($$bait_ref, $$reference_ref); # Defining variables: Create bait
`cat @baits | perl -ne 'if (/>/) {\$n++; \$_ = ">\$n\n"}; print' >$$folder_ref\/bait.fas `; # Defining variables: Create bait
# Defining variables: Create bait
# copy the bait file into the folder so it is preserved within the folder (will not be used) # Defining variables: Create bait
system("cp", $$bait_ref, "$$folder_ref\/extra_bait.fas"); # Defining variables: Create bait
# Print that the bait was create inside the folder # Defining variables: Create bait
$$pre_log_ref .= "\tBait file is created: $$folder_ref\/bait.fas\n"; # Defining variables: Create bait
} else { # Defining variables: Create bait
# Print that the previous reference file (found in the folder) is used and not the one defined # Defining variables: Create bait
# at invocation # Defining variables: Create bait
$$pre_log_ref .= "\tPreviously defined bait file is used together with the new files\n"; # Defining variables: Create bait
my @baits = ($$bait_ref, $$reference_ref, "$$folder_ref\/bait.fas"); # Defining variables: Create bait
`cat @baits | perl -ne 'if (/>/) {\$n++; \$_ = ">\$n\n"}; print' >$$folder_ref\/temp_bait.fas `; # Defining variables: Create bait
system('cp', "$$folder_ref\/temp_bait.fas", "$$folder_ref\/bait.fas"); # Defining variables: Create bait
} # Defining variables: Create bait
# Defining variables
# Set the names of the bait and reference file # Defining variables
$$bait_ref = "bait.fas"; # Defining variables
$$reference_ref = "reference.fas"; # Defining variables
# Defining variables
# Create symlinks to the read files and store the names of the links in an array # Defining variables: Create reads
# This enables us to use the same array to refer to the reads in all the iterations # Defining variables: Create reads
# Initialize a counter for the read files. This will keep track of what is the serial number of the # Defining variables: Create reads
# current read file # Defining variables: Create reads
my $i; # Defining variables: Create reads
for (@$reads_ref) { # Defining variables: Create reads
$i++; # Defining variables: Create reads
# If the specified read file is zipped then increment the gzip_ref value # Defining variables: Create reads
$$gzip_ref++ if /\.gz/; # Defining variables: Create reads
# Defining variables: Create reads
# Identify the file format of the read file based on the first character # Defining variables: Create reads
my $read_cmd = "cat"; # Defining variables: Create reads
$read_cmd = "zcat" if $$gzip_ref; # Defining variables: Create reads
my $first_line = `$read_cmd $_ | head -1`; # Defining variables: Create reads
if ($first_line =~ /^>/) { # Defining variables: Create reads
if ($$format_ref) { # Defining variables: Create reads
die "All the read files should have the same format\n" if $$format_ref ne "fasta"; # Defining variables: Create reads
} # Defining variables: Create reads
$$format_ref = "fasta"; # Defining variables: Create reads
} elsif ($first_line =~ /^\@/) { # Defining variables: Create reads
if ($$format_ref) { # Defining variables: Create reads
die "All the read files should have the same format\n" if $$format_ref ne "fastq"; # Defining variables: Create reads
} # Defining variables: Create reads
$$format_ref = "fastq"; # Defining variables: Create reads
} else { # Defining variables: Create reads
# Defining variables: Create reads
} # Defining variables: Create reads
# Defining variables: Create reads
# Create the name of the symlink for the read files # Defining variables: Create reads
my $symlink = "$$folder_ref\/reads$i\.$$format_ref"; # Defining variables: Create reads
# Defining variables: Create reads
if ($$gzip_ref) { # Defining variables: Create reads
# If there is a zipped read file then all the read files should be zipped files # Defining variables: Create reads
die "All the reads should be either gzipped or uncompressed\n" unless $$gzip_ref == $i; # Defining variables: Create reads
} # Defining variables: Create reads
# Defining variables: Create reads
# Create a symlink unless there is already on inside the folder # Defining variables: Create reads
unless (-l $symlink) { # Defining variables: Create reads
# Create the symlink to the specified read files # Defining variables: Create reads