Skip to content

Commit

Permalink
Organelle_PBA, added check_circularity step based in the sprai script…
Browse files Browse the repository at this point in the history
…. It process the result removing the circularity overlap
  • Loading branch information
aubombarely committed Oct 27, 2015
1 parent ec6a1cf commit 29b044b
Showing 1 changed file with 147 additions and 87 deletions.
234 changes: 147 additions & 87 deletions OrganelleRef_PBA
Expand Up @@ -783,7 +783,9 @@ print_seqstats("Assembly Step 1 Results", \@seqstats01);
## Action: Rescaffold using SSPACE-Long
## Then goes to option 1
##
## The fraction that decide if it goes to 2 or 1 should be changeble by the
## 3- Check circularity to close the genome
##
## The fraction that decide if it goes to 2 or 1 should be changeable by the
## user

print_header("3) Running scaffolding (SSPACE)");
Expand Down Expand Up @@ -846,14 +848,74 @@ while( my $seqobj02 = $asb02io->next_seq() ) {

print_header("4) Evaluating repetitive regions");


##################################################################
## Check for circularity
##################################################################

print STDERR "\t4.1- Checking circularity\n";

## It will use the check circularity script;
## It will produce the output as stdout, so it will redirect the
## the output into a file

my $temp_cc = File::Spec->catfile($outdir, "04_checkcirc_tempdir");
my $checkcirc_out = File::Spec->catfile($outdir, "04_checkcirc_out.txt");
my $checkcirc_cmd = "check_circularity.pl $assembly03 $temp_cc --min_overlap_len 100 --force > $checkcirc_out";
my @checkcirc01run = run( command => $checkcirc_cmd, verbose => $opt_V );


## By default define the assembly 04 as 03
my $assembly04 = $assembly03;

open my $checkcirc_fh, '<', $checkcirc_out;
while(<$checkcirc_fh>) {

chomp($_);
if ($_ =~ m/.+?\s+circular\s+\[(\d+)\-(\d+)\]\s+\w+\s+\[(\d+)\-(\d+)\]\s+\w+\s+(.+)\%\s+\w+/) {

my $p_st = $1;
my $p_en = $2;
my $e_st = $3;
my $e_en = $4;
my $perc = $5;
print STDERR "\tCircularity found: $p_st-$p_en matches $e_st-$e_en ($perc %)\n";

## It will remove the repeat
my $asb04io = Bio::SeqIO->new( -file => "$assembly03", -type => 'fasta' );
$assembly04 = File::Spec->catfile($outdir,"04_assembly04_no_circ.fa");
my $asb05io = Bio::SeqIO->new( -file => ">$assembly04", -type => 'fasta' );
print STDERR "\tRemoving redundancy produced by circularity\n\n";

while (my $wseqobj = $asb04io->next_seq()) {

my $c_end = $e_st - 1;
my $subseq = $wseqobj->trunc(1, $c_end);
my $newid = $subseq->id()."_1_".$c_end;
$subseq->id($newid);

$asb05io->write_seq($subseq);
}
}
elsif ($checkcirc_out =~ m/linear/) {

print STDERR "\tNo circularity detected.\n\n";
}
}


####################################################################
## Evaluate repetitive regions through mapping
####################################################################

## To evaluate the repetitive region it will map back the
## reads and check the coverage.

my $blasr2out = File::Spec->catfile($outdir, "04_BlasR_out.sam");
$blasr_args{'-out'} = $blasr2out;


my @blasr2cmd = ($exepath{'blasr'}, $blasr_in, $assembly03);
my @blasr2cmd = ($exepath{'blasr'}, $blasr_in, $assembly04);
foreach my $blasr_k (sort keys %blasr_args) {

push @blasr2cmd, $blasr_k;
Expand All @@ -867,7 +929,7 @@ foreach my $blasr_k (sort keys %blasr_args) {

push @blasr2cmd, "-sam";

print STDERR "\t4.1- Running BlasR re-mapping.\n\n";
print STDERR "\t4.2- Running BlasR re-mapping.\n\n";

my @blasr02run = run( command => \@blasr2cmd, verbose => $opt_V );

Expand All @@ -878,14 +940,14 @@ my @blasr02run = run( command => \@blasr2cmd, verbose => $opt_V );
my $bamout = File::Spec->catfile($outdir, "04_BlasR_out.bam");
my $samtools01cmd = "$exepath{samtools} view -F4 -Sb -o $bamout $blasr2out";

print STDERR "\t4.2- Running samtools view (Sam to Bam).\n\n";
print STDERR "\t4.3- Running samtools view (Sam to Bam).\n\n";

my @sam01run = run( command => $samtools01cmd, verbose => $opt_V );

my $bamout_s = File::Spec->catfile($outdir, "04_BlasR_out");
my $samtools02cmd = "$exepath{samtools} sort $bamout $bamout_s";

print STDERR "\t4.3- Sorting bam file.\n\n";
print STDERR "\t4.4- Sorting bam file.\n\n";

my @sam02run = run( command => $samtools02cmd, verbose => $opt_V );

Expand All @@ -904,7 +966,7 @@ print $gemfh "$seqstats02[3]\t$seqstats02[2]\n";

## And then, run bedtools genome cov

print STDERR "\t4.4- Calculating coverage.\n\n";
print STDERR "\t4.5- Calculating coverage.\n\n";

my $bedgc01cmd = "$exepath{bedtools} genomecov -d -ibam $bamout -g $gemsize";

Expand Down Expand Up @@ -941,7 +1003,7 @@ print STDERR "\tAverage coverage:\t$covstats{c_average}\n";
print STDERR "\tLower limit coverage:\t$covstats{lower_limit}\n";
print STDERR "\tUpper limit coverage:\t$covstats{upper_limit}\n\n";

print STDERR "\t4.5- Getting the breaking points\n";
print STDERR "\t4.6- Getting the breaking points\n";

## Now it will get the sequence break points based in the blocks
my $breakpoints_href = get_breakpoints($covstats{covdata});
Expand All @@ -957,7 +1019,9 @@ print STDERR "\t$n_breaks breaking points have been detected\n\n";
if (scalar(keys %breaks) == 1) {

print STDERR "\n\tNo repeats breaking points detected.\n";
print STDERR "\n\tFINAL ASSEMBLY FILE: $assembly03\n";
print STDERR "\n\tFINAL ASSEMBLY FILE: $assembly04\n";
my @seqstats03 = get_seqstats($assembly04, $refsize);
print_seqstats("Final Assembly Results", \@seqstats03);
}
else {

Expand All @@ -967,11 +1031,11 @@ else {
## do the revcom
## and rescaffold the four fragments.

print STDERR "\t4.6- Breaking the assembly.\n";
print STDERR "\t4.6.1- Breaking the assembly.\n";

my $asb04io = Bio::SeqIO->new( -file => "$assembly03", -type => 'fasta' );
my $assembly04 = File::Spec->catfile($outdir,"05_assembly03breaks.fa");
my $asb05io = Bio::SeqIO->new( -file => ">$assembly04", -type => 'fasta' );
my $asb04io = Bio::SeqIO->new( -file => "$assembly04", -type => 'fasta' );
my $assembly05 = File::Spec->catfile($outdir,"05_assembly03breaks.fa");
my $asb05io = Bio::SeqIO->new( -file => ">$assembly05", -type => 'fasta' );

## Additionally it will store the seqobjects into an array for
## further comparisons
Expand All @@ -996,105 +1060,103 @@ else {
$asb05io->write_seq($subseq);
$c++;
}
}
print STDERR "\t$assembly03 has been broken in $c subsequences.\n\n";

print STDERR "\t4.7- Comparing fragments.\n";
print STDERR "\t$assembly04 has been broken in $c subsequences.\n\n";
print STDERR "\t4.6.2- Comparing fragments.\n";

## It will run bl2seq using a Blast factory object.
my $blastfact = Bio::Tools::Run::StandAloneBlastPlus->new();
## It will run bl2seq using a Blast factory object.
my $blastfact = Bio::Tools::Run::StandAloneBlastPlus->new();

## Define sequences to remove
my %rem = ();
my %revcom = ();
## Define sequences to remove
my %rem = ();
my %revcom = ();

my ($n, $m) = (0, 0);
foreach my $subseq1 (@subseqs) {
my ($n, $m) = (0, 0);
foreach my $subseq1 (@subseqs) {

$n++;
my $sseqid1 = $subseq1->id();
$n++;
my $sseqid1 = $subseq1->id();

foreach my $subseq2 (@subseqs) {
foreach my $subseq2 (@subseqs) {

$m++;
my $sseqid2 = $subseq2->id();
$m++;
my $sseqid2 = $subseq2->id();

my $blastoutname = "05_BlastCmp_".$sseqid1."_".$sseqid2;
my $outfilecmp = File::Spec->catfile($outdir, $blastoutname);
my $blastoutname = "05_BlastCmp_".$sseqid1."_".$sseqid2;
my $outfilecmp = File::Spec->catfile($outdir, $blastoutname);

if ($sseqid1 ne $sseqid2) {

print STDERR "\tComparing $sseqid1 with $sseqid2\n";
$blastfact->bl2seq( -method => 'blastn',
-query => $subseq1,
-subject => $subseq2,
-outfile => $outfilecmp,
-outformat => 6
);
if ($sseqid1 ne $sseqid2) {



while (my $blast_res = $blastfact->next_result()) {
print STDERR "\tComparing $sseqid1 with $sseqid2\n";
$blastfact->bl2seq( -method => 'blastn',
-query => $subseq1,
-subject => $subseq2,
-outfile => $outfilecmp,
-outformat => 6
);

while (my $blast_res = $blastfact->next_result()) {

my $qlen = $blast_res->query_length();
my $qlen = $blast_res->query_length();

while (my $blast_hit = $blast_res->next_hit()) {
while (my $blast_hit = $blast_res->next_hit()) {

my $hlen = $blast_hit->length();
my $hlen = $blast_hit->length();

while (my $blast_hsp = $blast_hit->next_hsp()) {
while (my $blast_hsp = $blast_hit->next_hsp()) {

my $qstrand = $blast_hsp->strand('query');
my $hstrand = $blast_hsp->strand('hit');
my $percident = $blast_hsp->percent_identity();
print STDERR "\tQUERY:\t".$subseq1->id." ($qlen)\n";
print STDERR "\tHIT:\t".$subseq2->id." ($hlen)\n";
print STDERR "\tqSTRAND: $qstrand\n";
print STDERR "\thSTRAND: $hstrand\n";
print STDERR "\tPerc. Identity: $percident\n\n";

if ($qlen < $hlen && $percident >= 97) {
my $qstrand = $blast_hsp->strand('query');
my $hstrand = $blast_hsp->strand('hit');
my $percident = $blast_hsp->percent_identity();
print STDERR "\tQUERY:\t".$subseq1->id." ($qlen)\n";
print STDERR "\tHIT:\t".$subseq2->id." ($hlen)\n";
print STDERR "\tqSTRAND: $qstrand\n";
print STDERR "\thSTRAND: $hstrand\n";
print STDERR "\tPerc. Identity: $percident\n\n";

if ($qlen < $hlen && $percident >= 97) {

$rem{$subseq1->id()} = 1;
print STDERR "\tREPEAT: ".$subseq1->id()."\n";
$rem{$subseq1->id()} = 1;
print STDERR "\tREPEAT: ".$subseq1->id()."\n";

if ($qstrand != $hstrand) {
if ($qstrand != $hstrand) {

$revcom{$subseq2->id()} = 1;
$revcom{$subseq2->id()} = 1;
}
}
}
}
}
}
}
## Now it will process the output
}
## Now it will process the output
}
}
}



## At this point the script it ready to produce the sequences that will be
## rescaffolded
## At this point the script it ready to produce the sequences that will be
## rescaffolded

print STDERR "\t4.7- Selecting the no-redundant sequence set\n";
my $asb06io = Bio::SeqIO->new( -file => "$assembly04", -type => 'fasta' );
my $assembly05 = File::Spec->catfile($outdir,"06_assembly04breaks.fa");
my $asb07io = Bio::SeqIO->new( -file => ">$assembly05", -type => 'fasta' );
print STDERR "\t4.6.3- Selecting the no-redundant sequence set\n";
my $asb06io = Bio::SeqIO->new( -file => "$assembly05", -type => 'fasta' );
my $assembly06 = File::Spec->catfile($outdir,"06_assembly04breaks.fa");
my $asb07io = Bio::SeqIO->new( -file => ">$assembly06", -type => 'fasta' );

while (my $sseqobj = $asb06io->next_seq()) {
while (my $sseqobj = $asb06io->next_seq()) {

my $sqid = $sseqobj->id();
unless (exists $rem{$sqid}) {
my $sqid = $sseqobj->id();
unless (exists $rem{$sqid}) {

$asb07io->write_seq($sseqobj);
}
$asb07io->write_seq($sseqobj);
}

if (exists $revcom{$sqid}) {
if (exists $revcom{$sqid}) {

my $revcomseq = $sseqobj->revcom();
}
my $revcomseq = $sseqobj->revcom();
}
}
}

print STDERR "\t4.8- Rescaffolding the sequences with SSPACE-Long.\n";
print STDERR "\t4.6.4- Rescaffolding the sequences with SSPACE-Long.\n";
my $sspace2cmd = "perl $exepath{'SSPACE-LongRead.pl'} -c $assembly05";
$sspace2cmd .= " -p $blasr_in";

Expand All @@ -1110,20 +1172,18 @@ else {
my @sspace02run = run( command => $sspace2cmd, verbose => $opt_V );

## Get the results
my $preassembly06 = File::Spec->catfile($sspace2out, "scaffolds.fasta");
my $assembly06 = File::Spec->catfile($outdir,"06_organelle_assembly06.fa");
my $preassembly07 = File::Spec->catfile($sspace2out, "scaffolds.fasta");
my $assembly07 = File::Spec->catfile($outdir,"06_organelle_assembly06.fa");

## Copy to outside dir
copy($preassembly06, $assembly06);
copy($preassembly07, $assembly07);

print STDERR "\n\tFINAL ASSEMBLY FILE: $assembly06\n\n";
my @seqstats03 = get_seqstats($assembly06, $refsize);
print STDERR "\n\tFINAL ASSEMBLY FILE: $assembly07\n\n";
my @seqstats03 = get_seqstats($assembly07, $refsize);
print_seqstats("Assembly Step 3 Results", \@seqstats03);

}

}



$date = `date`;
chomp($date);
Expand Down

0 comments on commit 29b044b

Please sign in to comment.