v2.1

arthuryxt · May 13, 2016 · 0463c0e · 0463c0e
1 parent 1b6cc3a
commit 0463c0e
Show file tree

Hide file tree

Showing 13 changed files with 289 additions and 166 deletions.
diff --git a/ACF_MAKE.pl b/ACF_MAKE.pl
@@ -192,12 +192,16 @@
     $command="perl ".$SPEC{"ACF_folder"}."/ACF_trans_splice_step2.pl unmap.trans.splicing ".$SPEC{"BWA_genome_folder"}."/ unmap.trans.splicing ".$SPEC{"Seq_len"}." $ts_minSSSum ".$SPEC{"Agtf"};
     print OUT $command,"\n";
     if (($do_blat_search eq "yes") and (-e $SPEC{"BWA_genome_Index"})) {
-        my $newlen=int(1.3*($SPEC{"Seq_len"}));
+        #increasing the -minScore parameter value beyond one-half of the query size has no further effect
+        #my $newlen=int(1.3*($SPEC{"Seq_len"}));
+        my $newlen=$SPEC{"Seq_len"};
         $command=$blat_path." -minScore=".$newlen." -noHead ".$SPEC{"BWA_genome_Index"}." unmap.trans.splicing.tsloci.fa unmap.trans.splicing.tsloci.psl";
         print OUT $command,"\n";
-        $command="perl -ne \'chomp; my \@a=split(\"\\t\",\$_); if(abs(\$a[11]-\$a[12]) > $newlen){print \$a[9],\"\\n\"\;}\' unmap.trans.splicing.tsloci.psl \> unmap.trans.splicing.tsloci.psl.badid";
-        print OUT $command,"\n";
-        $command="sort \-u unmap.trans.splicing.tsloci.psl.badid \> unmap.trans.splicing.tsloci.psl.badids";
+        #$command="perl -ne \'chomp; my \@a=split(\"\\t\",\$_); if(abs(\$a[11]-\$a[12]) > $newlen){print \$a[9],\"\\n\"\;}\' unmap.trans.splicing.tsloci.psl \> unmap.trans.splicing.tsloci.psl.badid";
+        #print OUT $command,"\n";
+        #$command="sort \-u unmap.trans.splicing.tsloci.psl.badid \> unmap.trans.splicing.tsloci.psl.badids";
+        #print OUT $command,"\n";
+        $command="perl ".$SPEC{"ACF_folder"}."/get_linear_id_from_psl.pl unmap.trans.splicing.tsloci.psl unmap.trans.splicing.tsloci.psl.badids";
         print OUT $command,"\n";
         $command="perl ".$SPEC{"ACF_folder"}."/get_selected_from_pool_singleline.pl unmap.trans.splicing.tsloci.psl.badids unmap.trans.splicing.tsloci unmap.trans.splicing.tsloci.good 0 -1";
         print OUT $command,"\n";

diff --git a/ACF_Step5m2.pl b/ACF_Step5m2.pl
@@ -28,9 +28,10 @@
 		#}
     }
     if ($info ne "") {
-		$OK{$a[0]}=1000;
+		$OK{$a[0]}=0;
 		$OKinfo{$a[0]}=$a[0]."\t".int($ASsum/$ASc)."\t".$info;
     }
+    else { $OK{$a[0]}=0; } 
 }
 close IN0;
 
@@ -64,11 +65,11 @@
     chomp;
     my @a=split("\t",$_);
 	my @b=split(/\_\_\_/,$a[2]);
-    #if ((exists $OK{$a[0]}) and ($OK{$a[0]} eq 1000)) {
+    if ($OK{$a[0]} eq 0) {
 		if (exists $uniq3{$b[0]}) { $uniq3{$b[0]}=$uniq3{$b[0]}."\t".$a[0]; }
 		else { $uniq3{$b[0]}=$a[0]; }
 		$OK{$a[0]}++;
-    #}
+    }
 }
 close IN3;
 open IN31,$filein3.".refFlat";
@@ -81,18 +82,18 @@
 
 my %Anno;
 open IN4, $filein4;
+my $header=<IN4>;
+chomp $header;
+my @Header=split("\t",$header);
+my $tmpid=$Header[0];
+$Header[0]=$Header[0]."\tGname";
+$Anno{$tmpid}=join("\t",@Header);
 while(<IN4>) {
     chomp;
     my @a=split("\t",$_);
-    if ($a[0] eq "newid") { my $tmpid=$a[0]; $a[0]=$a[0]."\tGname"; $Anno{$tmpid}=join("\t",@a);}
-    #elsif ((exists $OK{$a[0]}) and ($OK{$a[0]} > 1000)) {
-	else{
-		$Anno{$a[0]}=join("\t",@a);
-    }
+    $Anno{$a[0]}=join("\t",@a);
 }
 
-my $header=$Anno{"newid"};
-my @Header=split("\t",$header);
 my $Nr=scalar(@Header);
 my $template=0;
 for(my $i=2; $i<$Nr; $i++) {$template=$template."\t0";}

diff --git a/ACF_fusion_circRNAs.pl b/ACF_fusion_circRNAs.pl
@@ -77,47 +77,9 @@
                         if ($debug > 0) {print OUT join("\t",@a),"\n",join("\t",@a2),"\n";}
                     }
                 }
-                elsif(($b[1] < $b2[4]) and ($b[4] > $b2[1])) {
-                    if ($print_exp > 0) {
-                        if ((exists $EXPR{$a[0]}) and (exists $EXPR{$a2[0]})){
-                            $cnt++;
-                            my @exp1=split("\t",$EXPR{$a[0]});
-                            my @exp2=split("\t",$EXPR{$a2[0]});
-                            my $expr=$exp1[2] > $exp2[2] ? $exp2[2] : $exp1[2];
-                            for (my $i=3; $i<scalar(@exp1); $i++) {
-                                $expr=$expr."\t".($exp1[$i] > $exp2[$i] ? $exp2[$i] : $exp1[$i]);
-                            }
-                            print OUT "fusion-circ-".$cnt,"\t",$a[2]."|".$a[3],"\t",$a[0],"\t",$a2[0],"\t",$expr,"\n";
-                            if ($debug > 0) {print OUT join("\t",@a),"\n",join("\t",@a2),"\n";}
-                        }
-                    }
-                    else{
-                        print OUT "fusion-circ-".$cnt,"\t",$a[2]."|".$a[3],"\t",$a[0],"\t",$a2[0],"\n";
-                        if ($debug > 0) {print OUT join("\t",@a),"\n",join("\t",@a2),"\n";}
-                    }
-                }
             }
             elsif (($b[2] eq "-") and ($b[5] eq "-") ){
-                if (($b[1] > $b2[4]) and ($b[4] < $b2[1])) {
-                    if ($print_exp > 0) {
-                        if ((exists $EXPR{$a[0]}) and (exists $EXPR{$a2[0]})){
-                            $cnt++;
-                            my @exp1=split("\t",$EXPR{$a[0]});
-                            my @exp2=split("\t",$EXPR{$a2[0]});
-                            my $expr=$exp1[2] > $exp2[2] ? $exp2[2] : $exp1[2];
-                            for (my $i=3; $i<scalar(@exp1); $i++) {
-                                $expr=$expr."\t".($exp1[$i] > $exp2[$i] ? $exp2[$i] : $exp1[$i]);
-                            }
-                            print OUT "fusion-circ-".$cnt,"\t",$a[2]."|".$a[3],"\t",$a[0],"\t",$a2[0],"\t",$expr,"\n";
-                            if ($debug > 0) {print OUT join("\t",@a),"\n",join("\t",@a2),"\n";}
-                        }
-                    }
-                    else{
-                        print OUT "fusion-circ-".$cnt,"\t",$a[2]."|".$a[3],"\t",$a[0],"\t",$a2[0],"\n";
-                        if ($debug > 0) {print OUT join("\t",@a),"\n",join("\t",@a2),"\n";}
-                    }
-                }
-                elsif(($b[1] < $b2[4]) and ($b[4] > $b2[1])) {
+                if(($b[1] < $b2[4]) and ($b[4] > $b2[1])) {
                     if ($print_exp > 0) {
                         if ((exists $EXPR{$a[0]}) and (exists $EXPR{$a2[0]})){
                             $cnt++;
@@ -157,47 +119,9 @@
                         if ($debug > 0) {print OUT join("\t",@a),"\n",join("\t",@a2),"\n";}
                     }
                 }
-                elsif(($b[1] > $b2[4]) and ($b[4] > $b2[1])) {
-                    if ($print_exp > 0) {
-                        if ((exists $EXPR{$a[0]}) and (exists $EXPR{$a2[0]})){
-                            $cnt++;
-                            my @exp1=split("\t",$EXPR{$a[0]});
-                            my @exp2=split("\t",$EXPR{$a2[0]});
-                            my $expr=$exp1[2] > $exp2[2] ? $exp2[2] : $exp1[2];
-                            for (my $i=3; $i<scalar(@exp1); $i++) {
-                                $expr=$expr."\t".($exp1[$i] > $exp2[$i] ? $exp2[$i] : $exp1[$i]);
-                            }
-                            print OUT "fusion-circ-".$cnt,"\t",$a[2]."|".$a[3],"\t",$a[0],"\t",$a2[0],"\t",$expr,"\n";
-                            if ($debug > 0) {print OUT join("\t",@a),"\n",join("\t",@a2),"\n";}
-                        }
-                    }
-                    else{
-                        print OUT "fusion-circ-".$cnt,"\t",$a[2]."|".$a[3],"\t",$a[0],"\t",$a2[0],"\n";
-                        if ($debug > 0) {print OUT join("\t",@a),"\n",join("\t",@a2),"\n";}
-                    }
-                }
             }
             elsif (($b[2] eq "+") and ($b[5] eq "-") ){
-                if (($b[1] < $b2[4]) and ($b[4] < $b2[1])) {
-                    if ($print_exp > 0) {
-                        if ((exists $EXPR{$a[0]}) and (exists $EXPR{$a2[0]})){
-                            $cnt++;
-                            my @exp1=split("\t",$EXPR{$a[0]});
-                            my @exp2=split("\t",$EXPR{$a2[0]});
-                            my $expr=$exp1[2] > $exp2[2] ? $exp2[2] : $exp1[2];
-                            for (my $i=3; $i<scalar(@exp1); $i++) {
-                                $expr=$expr."\t".($exp1[$i] > $exp2[$i] ? $exp2[$i] : $exp1[$i]);
-                            }
-                            print OUT "fusion-circ-".$cnt,"\t",$a[2]."|".$a[3],"\t",$a[0],"\t",$a2[0],"\t",$expr,"\n";
-                            if ($debug > 0) {print OUT join("\t",@a),"\n",join("\t",@a2),"\n";}
-                        }
-                    }
-                    else{
-                        print OUT "fusion-circ-".$cnt,"\t",$a[2]."|".$a[3],"\t",$a[0],"\t",$a2[0],"\n";
-                        if ($debug > 0) {print OUT join("\t",@a),"\n",join("\t",@a2),"\n";}
-                    }
-                }
-                elsif(($b[1] > $b2[4]) and ($b[4] > $b2[1])) {
+                if(($b[1] > $b2[4]) and ($b[4] > $b2[1])) {
                     if ($print_exp > 0) {
                         if ((exists $EXPR{$a[0]}) and (exists $EXPR{$a2[0]})){
                             $cnt++;

diff --git a/ACF_trans_splice_step3.pl b/ACF_trans_splice_step3.pl
@@ -60,22 +60,24 @@
 
 my %Anno;
 open IN4, $filein4;
+my $header=<IN4>;
+chomp $header;
+my @Header=split("\t",$header);
+my $tmpid=$Header[0];
+$Header[0]=$Header[0]."\tGname";
+$Anno{$tmpid}=join("\t",@Header);
 while(<IN4>) {
     chomp;
     my @a=split("\t",$_);
-    if ($a[0] eq "newid") { my $tmpid=$a[0]; $a[0]=$a[0]."\tGname"; $Anno{$tmpid}=join("\t",@a);}
-    #elsif ((exists $OK{$a[0]}) and ($OK{$a[0]} > 1000)) {
-	else{
-		$Anno{$a[0]}=join("\t",@a);
-    }
+    $Anno{$a[0]}=join("\t",@a);
 }
 
-my $header=$Anno{"newid"};
-my @Header=split("\t",$header);
 my $Nr=scalar(@Header);
 my $template=0;
 for(my $i=2; $i<$Nr; $i++) {$template=$template."\t0";}
 
+
+
 open OUT1,">".$filein1.".expr";
 open OUT11,">".$filein1.".newid";
 print OUT1 $header,"\n";

diff --git a/README.md b/README.md
@@ -3,12 +3,13 @@ Accurate CircRNA Finder Suite. Discovering circRNAs from RNA-Seq data.
 
 
 # Overview
-CircRNAs are generated through splicing, or to be precise, back-splicing where the downstream splice donor attact an upstream splice acceptor. Identifying the exact site of back-splice lies in the heart of circRNA discovery.
+CircRNAs are generated through splicing, or to be precise, back-splicing where the downstream splice donor attacks an upstream splice acceptor. Identifying the exact site of back-splice lies in the heart of circRNA discovery.
 
 ACFS first examines and pinpoints the back-splice site from RNA-Seq alignment using a maximal entropy model. The expression of circRNAs is estimated from a second round of alignment to the inferred pseudo circular sequences.
 
 No prior knowledge of gene annotation is needed for ACFS, but reading the coordinates is far less interesting than reading gene names, so circRNAs are annotated using the gene annotation if provided.
 
+ACFS is designed for Single-end RNA-Seq reads. Seeing is believing, we would like to see read directly spanning the back-splice sites. Paired-end data is also supported, albeit with lower sensitivity (if neither of the read ends crosses the back-splice, but the read-pair does).
 
 # Change Log
 - Update on 2016-05-06 :
@@ -97,6 +98,11 @@ Simply unpack the ACFS package.
     - If the reads are actually stransless or pair-ended, please run in parallel original reads and reverse-complemented reads.
     - No pair-end information is used, as the exact junction-site must be supported by a single read. Seeing is believing.
 
+6. For Paired-end data, it is highly recommended to align the reads to genome+transcriptome first (e.g. using Tophat2), and extract the unmapped read (read pairs) using the following script (the fasta header line is also modified)
+    ```
+    perl convert_unmapped_SAM_to_fa_for_acfs.pl <output_file_name> <unmapped.sam> <sample_id>
+    ```
+
 
 # Parameters
 There are nine mandatory parameters to run ACFS in a basic mode, searching for fusion-circRNAs need to be enabled. Modify the config file "SPEC_example.txt" accordingly.
@@ -144,14 +150,13 @@ Optional parameters, the values in below are set as default:
 
 # run ACFS
 1. make Pipeline Bash file
-```
-perl ACF_MAKE.pl SPEC_example.txt BASH_example.sh
-```
-
+    ```
+    perl ACF_MAKE.pl SPEC_example.txt BASH_example.sh
+    ```  
 2. find circles
-```
-bash BASH_example.sh
-```
+    ```
+    bash BASH_example.sh
+    ```
 
 
 # Results
@@ -195,11 +200,11 @@ bash BASH_example.sh
     perl -e 'open IN,"UNMAP_expr1"; my $line=<IN>; print $line; while(<IN>){chomp; my @a=split("\t",$_); print join("\t",@a),"\n"; $a[0]="rc".$a[0]; print join("\t",@a),"\n";}' > UNMAP_expr
     ```
     1C. **_for single-end and paired-end RNA-Seq unmapped reads_**.  
-    First align raw RNA-Seq reads to some reference with tools (such as Bowtie/BWA/STAR), obtain a SAM file containing all unmapped reads. For example if you choose Tophat, please convert <unmapped.bam> to <unmapped.sam>. Then run the following command:
+    First align raw RNA-Seq reads to some reference with tools (such as Bowtie/BWA/STAR/Tophat2), obtain a SAM file <unmapped.sam> containing all unmapped reads. For example if you choose Tophat, please convert <unmapped.bam> to <unmapped.sam>. Then run the following command:
     ```
-    perl convert_unmapped_SAM_to_fa_for_acfs.pl unmapped.sam newName newNAME.fa
+    perl convert_unmapped_SAM_to_fa_for_acfs.pl <newNAME.fa> <unmapped.sam> <sample_id>
     ```  
-    where <newName> is the new sample ID that is used to change the fasta/fastq header to ACF style. It could be "Ctrl12" or "TreatA.rep2", your choice. Just remember, there should be NO underscore as "_" in it.  
+    where <sample_id> is the new sample ID that is used to change the fasta/fastq header to ACF style. It could be "Ctrl12" or "TreatA.rep2", your choice. Just remember, there should be NO underscore as "_" in it.  
     <newNAME.fa> is the name of the converted file. Name it whatever you like.
     ```
     perl Truseq_merge_unique_fa.pl UNMAP newid newNAME.fa