Skip to content

Commit

Permalink
Implemented --chimOutType HardClip OR SoftClip options to output hard…
Browse files Browse the repository at this point in the history
… (default) / soft clipping in the BAM CIGAR for supplementary chimeric alignments. Implemented extra references input in the SAM/AM header from user-created extraReferences.txt file in the genome directory.
  • Loading branch information
alexdobin committed Nov 23, 2016
1 parent cf41c12 commit 0eab4bb
Show file tree
Hide file tree
Showing 11 changed files with 361 additions and 260 deletions.
2 changes: 2 additions & 0 deletions CHANGES.md
@@ -1,3 +1,5 @@
* Implemented extra references input in the SAM/AM header from user-created "extraReferences.txt" file in the genome directory.
* Implemented --chimOutType HardClip OR SoftClip options to output hard (default) / soft clipping in the BAM CIGAR for supplementary chimeric alignments.
* Implemented --chimMainSegmentMultNmax parameters, which may be used to prohibit chimeric alignments with multimapping main segments to reduce false positive chimeras.
* Implemented new SAM attribute 'ch' to mark chimeric aligmments in the BAM file for --chimOutType WithinBAM option.
* Fixed a problem with RNEXT field in the Chimeric.out.sam file: RNEXT now always points to the other mate start.
Expand Down
2 changes: 1 addition & 1 deletion source/BAMbinSortByCoordinate.cpp
Expand Up @@ -49,7 +49,7 @@ void BAMbinSortByCoordinate(uint32 iBin, uint binN, uint binS, uint nThreads, st

BGZF *bgzfBin;
bgzfBin=bgzf_open((dirBAMsort+"/b"+to_string((uint) iBin)).c_str(),("w"+to_string((long long) P->outBAMcompression)).c_str());
outBAMwriteHeader(bgzfBin,P->samHeaderSortedCoord,P->chrName,P->chrLength);
outBAMwriteHeader(bgzfBin,P->samHeaderSortedCoord,P->chrNameAll,P->chrLengthAll);
//send ordered aligns to bgzf one-by-one
for (uint ia=0;ia<binN;ia++) {
char* ib=bamIn+startPos[ia*3+2];
Expand Down
2 changes: 1 addition & 1 deletion source/BAMbinSortUnmapped.cpp
Expand Up @@ -6,7 +6,7 @@ void BAMbinSortUnmapped(uint32 iBin, uint nThreads, string dirBAMsort, BGZF *bgz

BGZF *bgzfBin;
bgzfBin=bgzf_open((dirBAMsort+"/b"+to_string((uint) iBin)).c_str(),("w"+to_string((long long) P->outBAMcompression)).c_str());
outBAMwriteHeader(bgzfBin,P->samHeaderSortedCoord,P->chrName,P->chrLength);
outBAMwriteHeader(bgzfBin,P->samHeaderSortedCoord,P->chrNameAll,P->chrLengthAll);


vector<string> bamInFile;
Expand Down
93 changes: 64 additions & 29 deletions source/Parameters.cpp
Expand Up @@ -197,7 +197,7 @@ Parameters::Parameters() {//initalize parameters info
parArray.push_back(new ParameterInfoScalar <int> (-1, -1, "chimScoreJunctionNonGTAG", &chimScoreJunctionNonGTAG));
parArray.push_back(new ParameterInfoScalar <uint> (-1, -1, "chimMainSegmentMultNmax", &chimMainSegmentMultNmax));
parArray.push_back(new ParameterInfoScalar <uint> (-1, -1, "chimJunctionOverhangMin", &chimJunctionOverhangMin));
parArray.push_back(new ParameterInfoScalar <string> (-1, -1, "chimOutType", &chimOutType));
parArray.push_back(new ParameterInfoVector <string> (-1, -1, "chimOutType", &chim.out.type));
parArray.push_back(new ParameterInfoVector <string> (-1, -1, "chimFilter", &chimFilter));
parArray.push_back(new ParameterInfoScalar <uint> (-1, -1, "chimSegmentReadGapMax", &chimSegmentReadGapMax));

Expand Down Expand Up @@ -905,10 +905,72 @@ void Parameters::inputParameters (int argInN, char* argIn[]) {//input parameters
inOut->logMain << "WARNING --outSAMstrandField=intronMotif, therefore STAR will output XS attribute" <<endl;
};

if (chimOutType=="WithinBAM" && !outSAMattrPresent.NM) {
//chimeric
if (chim.out.type.at(0)=="WithinBAM")
{
chim.out.bam=true;
} else if (chim.out.type.at(0)=="SeparateSAMold")
{
chim.out.bam=false;
} else{
ostringstream errOut;
errOut <<"EXITING because of FATAL INPUT ERROR: unknown/unimplemented value for the first word of --chimOutType: "<<chim.out.type.at(0) <<"\n";
errOut <<"SOLUTION: re-run STAR with --chimOutType SeparateSAMold OR WithinBAM\n";
exitWithError(errOut.str(), std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this);
};

if (chim.out.bam && !outBAMunsorted && !outBAMcoord) {
ostringstream errOut;
errOut <<"EXITING because of fatal PARAMETERS error: --chimOutType WithinBAM requires BAM output\n";
errOut <<"SOLUTION: re-run with --outSAMtype BAM Unsorted/SortedByCoordinate\n";
exitWithError(errOut.str(), std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this);
};

if (chim.out.bam && !outSAMattrPresent.NM) {
outSAMattrOrder.push_back(ATTR_NM);
inOut->logMain << "WARNING --chimOutType=WithinBAM, therefore STAR will output NM attribute" <<endl;
};


if (chim.out.bam)
{
chim.out.bamHardClip=true;//default
if (chim.out.type.size()>1)
{
if (chim.out.type.at(1)=="HardClip")
{
chim.out.bamHardClip=true;
} else if (chim.out.type.at(1)=="SoftClip")
{
chim.out.bamHardClip=false;
} else {
ostringstream errOut;
errOut <<"EXITING because of FATAL INPUT ERROR: unknown/unimplemented value for the 2nd word of --chimOutType: "<<chim.out.type.at(1) <<"\n";
errOut <<"SOLUTION: re-run STAR with --chimOutType WithinBAM HardClip OR SoftClip\n";
exitWithError(errOut.str(), std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this);
};
};
};

chim.filter.genomicN=false;
for (uint ii=0; ii<chimFilter.size(); ii++)
{
if (chimFilter.at(ii)=="banGenomicN")
{
chim.filter.genomicN=true;
}
else if (chimFilter.at(ii)=="None")
{//nothing to do
}
else
{
ostringstream errOut;
errOut << "EXITING because of fatal PARAMETERS error: unrecognized value of --chimFilter="<<chimFilter.at(ii)<<"\n";
errOut << "SOLUTION: use allowed values: banGenomicN || None";
exitWithError(errOut.str(),std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this);
};
};


alignEndsType.ext[0][0]=false;
alignEndsType.ext[0][1]=false;
Expand Down Expand Up @@ -1114,33 +1176,6 @@ void Parameters::inputParameters (int argInN, char* argIn[]) {//input parameters
inOut->logMain<<"WARNING: --limitBAMsortRAM=0, will use genome size as RAM limit for BAM sorting\n";
};

if (chimOutType=="WithinBAM" && !outBAMunsorted && !outBAMcoord) {
ostringstream errOut;
errOut <<"EXITING because of fatal PARAMETERS error: --chimOutType WithinBAM requires BAM output\n";
errOut <<"SOLUTION: re-run with --outSAMtype BAM Unsorted/SortedByCoordinate\n";
exitWithError(errOut.str(), std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this);
};

//chimeric
chimPar.filter.genomicN=false;
for (uint ii=0; ii<chimFilter.size(); ii++)
{
if (chimFilter.at(ii)=="banGenomicN")
{
chimPar.filter.genomicN=true;
}
else if (chimFilter.at(ii)=="None")
{//nothing to do
}
else
{
ostringstream errOut;
errOut << "EXITING because of fatal PARAMETERS error: unrecognized value of --chimFilter="<<chimFilter.at(ii)<<"\n";
errOut << "SOLUTION: use allowed values: banGenomicN || None";
exitWithError(errOut.str(),std::cerr, inOut->logMain, EXIT_CODE_PARAMETER, *this);
};
};

for (uint ii=0; ii<readNameSeparator.size(); ii++)
{
if (readNameSeparator.at(ii)=="space")
Expand Down
13 changes: 9 additions & 4 deletions source/Parameters.h
Expand Up @@ -38,7 +38,7 @@ class Parameters {
string inputBAMfile;

//genome, SA, ...
vector <uint> chrStart, chrLength;
vector <uint> chrStart, chrLength, chrLengthAll;
string genomeDir,genomeLoad;
vector <string> genomeFastaFiles;
uint genomeSAsparseD;//sparsity=distance between indices
Expand Down Expand Up @@ -301,7 +301,6 @@ class Parameters {
uint chimSegmentReadGapMax; //max read gap for stitching chimeric windows
int chimScoreMin,chimScoreDropMax,chimScoreSeparation, chimScoreJunctionNonGTAG; //min chimeric score
uint chimMainSegmentMultNmax;
string chimOutType;
vector <string> chimFilter;

struct
Expand All @@ -310,7 +309,13 @@ class Parameters {
{
bool genomicN;
} filter;
} chimPar;
struct
{
vector <string> type;
bool bam;
bool bamHardClip;
} out;
} chim;

//splitting
char Qsplit;
Expand All @@ -328,7 +333,7 @@ class Parameters {
uint nGenome, nSA, nSAbyte, nChrReal;//genome length, SA length, # of chromosomes, vector of chromosome start loci
uint nGenome2, nSA2, nSAbyte2, nChrReal2; //same for the 2nd pass
uint nSAi; //size of the SAindex
vector <string> chrName;
vector <string> chrName, chrNameAll;
map <string,uint> chrNameIndex;
unsigned char GstrandBit, SAiMarkNbit, SAiMarkAbsentBit; //SA index bit for strand information
uint GstrandMask, SAiMarkAbsentMask, SAiMarkAbsentMaskC, SAiMarkNmask, SAiMarkNmaskC;//maske to remove strand bit from SA index, to remove mark from SAi index
Expand Down
8 changes: 5 additions & 3 deletions source/ReadAlign_alignBAM.cpp
Expand Up @@ -128,8 +128,10 @@ int ReadAlign::alignBAM(Transcript const &trOut, uint nTrOut, uint iTrOut, uint

//alignType>=0: unmapped reads
// -1: normal mapped reads
// -11: chimeric alignment, chimeric junction on the left
// -12: chimeric alignment, chimeric junction on the right
// -10: chimeric alignment, not supplemental (like -11,-12,-13)
// -11: chimeric alignment, supplemental, hard-clipping, chimeric junction on the left
// -12: chimeric alignment, supplemental, hard-clipping, chimeric junction on the right
// -13: chimeric alignment, supplemental, soft-clipping


if (P->outSAMmode=="None") return 0; //no SAM/BAM output
Expand Down Expand Up @@ -228,7 +230,7 @@ int ReadAlign::alignBAM(Transcript const &trOut, uint nTrOut, uint iTrOut, uint

if (readFilter=='Y') samFLAG+=0x200; //not passing quality control

if (alignType==-11 || alignType==-12) {
if (alignType==-11 || alignType==-12 || alignType==-13) {
samFLAG+=0x800; //mark chimeric alignments
} else {//only non-chimeric alignments will be marked as non-primary, since chimeric are already marked with 0x800
if (!trOut.primaryFlag) samFLAG +=0x100;//mark not primary align
Expand Down
6 changes: 3 additions & 3 deletions source/ReadAlign_chimericDetection.cpp
Expand Up @@ -223,7 +223,7 @@ bool ReadAlign::chimericDetection() {
if (b1<4) b1=3-b1;
};

if ( ( P->chimPar.filter.genomicN && (b0>3 || b1>3) ) || bR>3) {//chimera is not called if there are Ns in the genome or in the read
if ( ( P->chim.filter.genomicN && (b0>3 || b1>3) ) || bR>3) {//chimera is not called if there are Ns in the genome or in the read
chimN=0;
break;
};
Expand Down Expand Up @@ -382,7 +382,7 @@ bool ReadAlign::chimericDetection() {
trChim[1-chimRepresent].primaryFlag=false;
};

if (P->chimOutType=="WithinBAM") {//BAM output
if (P->chim.out.bam) {//BAM output
int alignType, bamN=0, bamIsuppl=-1, bamIrepr=-1;
uint bamBytesTotal=0;//estimate of the total size of all bam records, for output buffering
uint mateChr,mateStart;
Expand All @@ -399,7 +399,7 @@ bool ReadAlign::chimericDetection() {
alignType=-10; //this is representative part of chimeric alignment, record is as normal; if encompassing chimeric junction, both are recorded as normal
bamIrepr=( (itr%2)==(trChim[itr].Str) && chimType!=3) ? bamN+1 : bamN;//this is the mate that is chimerically split
} else {//"supplementary" chimeric segment
alignType=( (itr%2)==(trChim[itr].Str) ) ? -12 : -11; //right:left chimeric junction
alignType=P->chim.out.bamHardClip ? ( ( itr%2==trChim[itr].Str ) ? -12 : -11) : -13 ; //right:left chimeric junction
bamIsuppl=bamN;
if (chimType==1) {//PE alignment, need mate info for the suppl
uint iex=0;
Expand Down
2 changes: 1 addition & 1 deletion source/ReadAlign_outputAlignments.cpp
Expand Up @@ -13,7 +13,7 @@ void ReadAlign::outputAlignments() {
if ( chimericDetection() )
{
statsRA.chimericAll++;
if ( P->chimOutType=="WithinBAM" )
if ( P->chim.out.bam)
{
//if chimeric alignment was recorded in main BAM files, it contains the representative portion, so non-chimeric aligmnent is not output
return;
Expand Down
26 changes: 25 additions & 1 deletion source/STAR.cpp
Expand Up @@ -192,6 +192,29 @@ int main(int argInN, char* argIn[]) {
samHeaderStream << "@SQ\tSN:"<< P->chrName.at(ii) <<"\tLN:"<<P->chrLength[ii]<<"\n";
};

P->chrNameAll=P->chrName;
P->chrLengthAll=P->chrLength;
{//add exra references
ifstream extrastream (P->genomeDir + "/extraReferences.txt");
while (extrastream.good()) {
string line1;
getline(extrastream,line1);
istringstream stream1 (line1);
string field1;
stream1 >> field1;//should check for @SQ

if (field1!="") {//skip blank lines
samHeaderStream << line1 <<"\n";

stream1 >> field1;
P->chrNameAll.push_back(field1.substr(3));
stream1 >> field1;
P->chrLengthAll.push_back((uint) stoll(field1.substr(3)));
};
};
extrastream.close();
};

if (P->outSAMheaderPG.at(0)!="-") {
samHeaderStream << P->outSAMheaderPG.at(0);
for (uint ii=1;ii<P->outSAMheaderPG.size(); ii++) {
Expand All @@ -211,6 +234,7 @@ int main(int argInN, char* argIn[]) {
samHeaderStream << line1 <<"\n";
};
};
comstream.close();
};


Expand Down Expand Up @@ -238,7 +262,7 @@ int main(int argInN, char* argIn[]) {
*P->inOut->outSAM << P->samHeader;
};
if (P->outBAMunsorted){
outBAMwriteHeader(P->inOut->outBAMfileUnsorted,P->samHeader,P->chrName,P->chrLength);
outBAMwriteHeader(P->inOut->outBAMfileUnsorted,P->samHeader,P->chrNameAll,P->chrLengthAll);
};
// if (P->outBAMcoord){
// outBAMwriteHeader(P->inOut->outBAMfileCoord,P->samHeader,P->chrName,P->chrLength);
Expand Down
6 changes: 5 additions & 1 deletion source/parametersDefault
Expand Up @@ -525,9 +525,13 @@ winReadCoverageBasesMin 0

### Chimeric Alignments
chimOutType SeparateSAMold
string: type of chimeric output
string(s): type of chimeric output
1st word:
SeparateSAMold ... output old SAM into separate Chimeric.out.sam file
WithinBAM ... output into main aligned BAM files (Aligned.*.bam)
2nd word:
WithinBAM HardClip ... hard-clipping in the CIGAR for supplemental chimeric alignments (defaultif no 2nd word is present)
WithinBAM SoftClip ... soft-clipping in the CIGAR for supplemental chimeric alignments

chimSegmentMin 0
int>=0: minimum length of chimeric segment length, if ==0, no chimeric output
Expand Down

0 comments on commit 0eab4bb

Please sign in to comment.