diff --git a/src/clibs/emboss/src/biolib_getorf.c b/src/clibs/emboss/src/biolib_getorf.c index 98d9b92..19e94e6 100644 --- a/src/clibs/emboss/src/biolib_getorf.c +++ b/src/clibs/emboss/src/biolib_getorf.c @@ -1,7 +1,7 @@ /* @source biolib_getorf ** ** Finds and extracts open reading frames (ORFs) - this is the Biolib -** extension which calls into EMBOSS and uses much of Gary Williams' +** extension which calls into EMBOSS and uses much of Gary Williams' ** logic. ** ** @author Copyright (C) Gary Williams (gwilliam@hgmp.mrc.ac.uk) @@ -26,18 +26,18 @@ #include static void getorf_WriteORF(const AjPSeq seq, ajint len, ajint seqlen, - AjBool sense, ajint find, ajint *orf_no, - ajint start, ajint pos, const AjPStr str, - AjPSeqout seqout, ajint around); + AjBool sense, ajint find, ajint *orf_no, + ajint start, ajint pos, const AjPStr str, + AjPSeqout seqout, ajint around); static void getorf_AppORF(ajint find, AjPStr *str, - const char *chrseq, ajint pos, - char aa); + const char *chrseq, ajint pos, + char aa); static void getorf_FindORFs(const AjPSeq seq, ajint len, const AjPTrn trnTable, - ajuint minsize, ajuint maxsize, AjPSeqout seqout, - AjBool sense, AjBool circular, ajint find, - ajint *orf_no, AjBool methionine, ajint around); + ajuint minsize, ajuint maxsize, AjPSeqout seqout, + AjBool sense, AjBool circular, ajint find, + ajint *orf_no, AjBool methionine, ajint around); /* types of control codon */ #define START 1 @@ -57,8 +57,30 @@ static void getorf_FindORFs(const AjPSeq seq, ajint len, const AjPTrn trnTable, ** Get ORF's reading from START to STOP ******************************************************************************/ -ORF *biolib_getorf(AjPSeq seq, AjPTrn table, unsigned int minsize) -{ +void biolib_getorf(AjPSeq seq, AjPTrn table, unsigned int minsize) { + ORFrec record; + /* ORF number to append to name of sequence to create unique name */ + ajint orf_no; + + AjBool sense; /* ajTrue = forward sense */ + ajint len; + + orf_no = 1; /* number of the next ORF */ + sense = ajTrue; /* forward sense initially */ + + /* get the length of the sequence */ + len = ajSeqGetLen(seq); + + /* find the ORFs */ + getorf_FindORFs(seq, len, trnTable, minsize/3, 10000, seqout, sense, + circular, find, &orf_no, methionine, around, &record); + + /* now reverse complement the sequence and do it again */ + sense = ajFalse; + ajSeqReverseForce(seq); + // getorf_FindORFs(seq, len, trnTable, minsize, maxsize, seqout, sense, + // circular, find, &orf_no, methionine, + // around); } /* @func getorf_acd *********************************************************** @@ -68,128 +90,122 @@ ORF *biolib_getorf(AjPSeq seq, AjPTrn table, unsigned int minsize) ******************************************************************************/ #ifdef EMBOSS_ACD -int getorf_acd(int argc, char **argv) -{ - AjPSeqout seqout; - AjPSeqall seqall; - AjPStr tablestr; - ajuint minsize; - ajuint maxsize; - AjPStr findstr; - AjBool methionine; - AjBool circular; - AjBool reverse; - ajint around; - - embInit("getorf", argc, argv); - - seqout = ajAcdGetSeqoutall("outseq"); - seqall = ajAcdGetSeqall("sequence"); - tablestr = ajAcdGetListSingle("table"); - minsize = ajAcdGetInt("minsize"); - maxsize = ajAcdGetInt("maxsize"); - findstr = ajAcdGetListSingle("find"); - methionine = ajAcdGetBoolean("methionine"); - circular = ajAcdGetBoolean("circular"); - reverse = ajAcdGetBoolean("reverse"); - around = ajAcdGetInt("flanking"); - - getorf(seqout, seqall, tablestr, minsize, maxsize, findstr, methionine, circular, reverse, around); - - ajSeqoutClose(seqout); - ajSeqallDel(&seqall); - ajSeqoutDel(&seqout); - ajStrDel(&tablestr); - ajStrDel(&findstr); - - embExit(); - - return 0; +int getorf_acd(int argc, char **argv) { + AjPSeqout seqout; + AjPSeqall seqall; + AjPStr tablestr; + ajuint minsize; + ajuint maxsize; + AjPStr findstr; + AjBool methionine; + AjBool circular; + AjBool reverse; + ajint around; + + embInit("getorf", argc, argv); + + seqout = ajAcdGetSeqoutall("outseq"); + seqall = ajAcdGetSeqall("sequence"); + tablestr = ajAcdGetListSingle("table"); + minsize = ajAcdGetInt("minsize"); + maxsize = ajAcdGetInt("maxsize"); + findstr = ajAcdGetListSingle("find"); + methionine = ajAcdGetBoolean("methionine"); + circular = ajAcdGetBoolean("circular"); + reverse = ajAcdGetBoolean("reverse"); + around = ajAcdGetInt("flanking"); + + getorf(seqout, seqall, tablestr, minsize, maxsize, findstr, methionine, circular, reverse, around); + + ajSeqoutClose(seqout); + ajSeqallDel(&seqall); + ajSeqoutDel(&seqout); + ajStrDel(&tablestr); + ajStrDel(&findstr); + + embExit(); + + return 0; } #endif AjPSeqout get_orf( - AjPSeqout seqout, - AjPSeqall seqall, - AjPStr tablestr, - ajuint minsize, - ajuint maxsize, - AjPStr findstr, - AjBool methionine, - AjBool circular, - AjBool reverse, - ajint around -) -{ - ajint table; - ajint find; - AjPSeq seq = NULL; - AjPTrn trnTable; - AjPStr sseq = NULL; /* sequence string */ - - /* ORF number to append to name of sequence to create unique name */ - ajint orf_no; - - - AjBool sense; /* ajTrue = forward sense */ - ajint len; - - /* initialise the translation table */ - ajStrToInt(tablestr, &table); - trnTable = ajTrnNewI(table); - - /* what sort of ORF are we looking for */ - ajStrToInt(findstr, &find); - + AjPSeqout seqout, + AjPSeqall seqall, + AjPStr tablestr, + ajuint minsize, + ajuint maxsize, + AjPStr findstr, + AjBool methionine, + AjBool circular, + AjBool reverse, + ajint around +) { + ajint table; + ajint find; + AjPSeq seq = NULL; + AjPTrn trnTable; + AjPStr sseq = NULL; /* sequence string */ + + /* ORF number to append to name of sequence to create unique name */ + ajint orf_no; + + + AjBool sense; /* ajTrue = forward sense */ + ajint len; + + /* initialise the translation table */ + ajStrToInt(tablestr, &table); + trnTable = ajTrnNewI(table); + + /* what sort of ORF are we looking for */ + ajStrToInt(findstr, &find); + + /* + ** get the minimum size converted to protein length if storing + ** protein sequences + */ + if (find == P_STOP2STOP || find == P_START2STOP || find == AROUND_START) { + minsize /= 3; + maxsize /= 3; + } + + while (ajSeqallNext(seqall, &seq)) { + orf_no = 1; /* number of the next ORF */ + sense = ajTrue; /* forward sense initially */ + + /* get the length of the sequence */ + len = ajSeqGetLen(seq); + /* - ** get the minimum size converted to protein length if storing - ** protein sequences + ** if the sequence is circular, append it to itself to triple its + ** length so can deal easily with wrapped ORFs, but don't update + ** len */ - if(find == P_STOP2STOP || find == P_START2STOP || find == AROUND_START) - { - minsize /= 3; - maxsize /= 3; + if (circular) { + ajStrAssignS(&sseq, ajSeqGetSeqS(seq)); + ajStrAppendS(&sseq, ajSeqGetSeqS(seq)); + ajStrAppendS(&sseq, ajSeqGetSeqS(seq)); + ajSeqAssignSeqS(seq, sseq); } - - while(ajSeqallNext(seqall, &seq)) - { - orf_no = 1; /* number of the next ORF */ - sense = ajTrue; /* forward sense initially */ - - /* get the length of the sequence */ - len = ajSeqGetLen(seq); - - /* - ** if the sequence is circular, append it to itself to triple its - ** length so can deal easily with wrapped ORFs, but don't update - ** len - */ - if(circular) - { - ajStrAssignS(&sseq, ajSeqGetSeqS(seq)); - ajStrAppendS(&sseq, ajSeqGetSeqS(seq)); - ajStrAppendS(&sseq, ajSeqGetSeqS(seq)); - ajSeqAssignSeqS(seq, sseq); - } - - /* find the ORFs */ - getorf_FindORFs(seq, len, trnTable, minsize, maxsize, seqout, sense, - circular, find, &orf_no, methionine, around); - - /* now reverse complement the sequence and do it again */ - if(reverse) - { - sense = ajFalse; - ajSeqReverseForce(seq); - getorf_FindORFs(seq, len, trnTable, minsize, maxsize, seqout, sense, - circular, find, &orf_no, methionine, - around); - } + + /* find the ORFs */ + getorf_FindORFs(seq, len, trnTable, minsize, maxsize, seqout, sense, + circular, find, &orf_no, methionine, around, &record); + + /* now reverse complement the sequence and do it again */ + if (reverse) { + sense = ajFalse; + ajSeqReverseForce(seq); + getorf_FindORFs(seq, len, trnTable, minsize, maxsize, seqout, sense, + circular, find, &orf_no, methionine, + around); } - ajTrnDel(&trnTable); - ajSeqDel(&seq); - ajStrDel(&sseq); + } + ajTrnDel(&trnTable); + ajSeqDel(&seq); + ajStrDel(&sseq); } /* @funcstatic getorf_FindORFs ************************************************ @@ -212,307 +228,275 @@ AjPSeqout get_orf( ******************************************************************************/ void getorf_FindORFs(const AjPSeq seq, ajint len, const AjPTrn trnTable, - ajuint minsize, ajuint maxsize, AjPSeqout seqout, - AjBool sense, AjBool circular, ajint find, - ajint *orf_no, AjBool methionine, ajint around) -{ - AjBool ORF[3]; /* true if found an ORF */ - AjBool LASTORF[3]; /* true if hit the end of an ORF past - the end on the genome in this - frame */ - AjBool GOTSTOP[3]; /* true if found a STOP in a circular - genome's frame when - find = P_STOP2STOP or - N_STOP2STOP */ - ajint start[3]; /* possible starting position of the - three frames */ - ajint pos; - ajint codon; - char aa; - ajint frame; - AjPStr newstr[3]; /* strings of the three frames of ORF - sequences that we are growing */ - AjPSeq pep = NULL; - ajint i; - - ajint seqlen; - const char *chrseq; - - seqlen = ajSeqGetLen(seq); - chrseq = ajSeqGetSeqC(seq); - - /* initialise the ORF sequences */ - newstr[0] = NULL; - newstr[1] = NULL; - newstr[2] = NULL; - + ajuint minsize, ajuint maxsize, AjPSeqout seqout, + AjBool sense, AjBool circular, ajint find, + ajint *orf_no, AjBool methionine, ajint around, + ORFrec *record) { + AjBool ORF[3]; /* true if found an ORF */ + AjBool LASTORF[3]; /* true if hit the end of an ORF past + the end on the genome in this + frame */ + AjBool GOTSTOP[3]; /* true if found a STOP in a circular + genome's frame when + find = P_STOP2STOP or + N_STOP2STOP */ + ajint start[3]; /* possible starting position of the + three frames */ + ajint pos; + ajint codon; + char aa; + ajint frame; + AjPStr newstr[3]; /* strings of the three frames of ORF + sequences that we are growing */ + AjPSeq pep = NULL; + ajint i; + + ajint seqlen; + const char *chrseq; + + seqlen = ajSeqGetLen(seq); + chrseq = ajSeqGetSeqC(seq); + + /* initialise the ORF sequences */ + newstr[0] = NULL; + newstr[1] = NULL; + newstr[2] = NULL; + + /* + ** initialise flags for found the last ORF past the end of a circular + ** genome + */ + LASTORF[0] = ajFalse; + LASTORF[1] = ajFalse; + LASTORF[2] = ajFalse; + + /* initialise flags for found at least one STOP codon in a frame */ + GOTSTOP[0] = ajFalse; + GOTSTOP[1] = ajFalse; + GOTSTOP[2] = ajFalse; + + if (circular || find == P_START2STOP || find == N_START2STOP || + find == AROUND_START) { + ORF[0] = ajFalse; + ORF[1] = ajFalse; + ORF[2] = ajFalse; + } else { /* - ** initialise flags for found the last ORF past the end of a circular - ** genome + ** assume already in a ORF so we get ORFs at the start of the + ** sequence */ - LASTORF[0] = ajFalse; - LASTORF[1] = ajFalse; - LASTORF[2] = ajFalse; - - /* initialise flags for found at least one STOP codon in a frame */ - GOTSTOP[0] = ajFalse; - GOTSTOP[1] = ajFalse; - GOTSTOP[2] = ajFalse; - - if(circular || find == P_START2STOP || find == N_START2STOP || - find == AROUND_START) - { - ORF[0] = ajFalse; - ORF[1] = ajFalse; - ORF[2] = ajFalse; + ORF[0] = ajTrue; + ORF[1] = ajTrue; + ORF[2] = ajTrue; + start[0] = 0; + start[1] = 1; + start[2] = 2; + } + + for (pos=0; pos= seqlen-5) { + + /* + ** End of the sequence? If so, append any + ** last codon to the sequence - otherwise, ignore the STOP + ** codon + */ + if (codon != STOP) + getorf_AppORF(find, &newstr[frame], chrseq, pos, + aa); + + /* Already have a sequence to write out? */ + if (ORF[frame]) { + if (ajStrGetLen(newstr[frame]) >= minsize && + ajStrGetLen(newstr[frame]) <= maxsize) { + /* create a new sequence */ + if (codon == STOP) + getorf_WriteORF(seq, len, seqlen, sense, + find, orf_no, start[frame], + pos-1, newstr[frame], + seqout, around); + else + getorf_WriteORF(seq, len, seqlen, sense, + find, orf_no, start[frame], + pos+2, newstr[frame], + seqout, around); + } + + ajStrSetClear(&newstr[frame]); + } + + /* + ** if its a circular genome and the STOP codon hits past + ** the end of the genome in all frames, then break + */ + if (circular && pos >= len) { + ORF[frame] = ajFalse; /* past the end of the genome */ + LASTORF[frame] = ajTrue; /* finished getting ORFs */ + if (LASTORF[0] && LASTORF[1] && LASTORF[2]) + break; + } else { + /* + ** hit a STOP, therefore a potential ORF to write + ** out next time, even if the genome is circular + */ + ORF[frame] = ajTrue; + start[frame] = pos+3; /* next start of the ORF */ + } + + } else if (ORF[frame]) + /* append sequence to newstr if in an ORF */ + getorf_AppORF(find, &newstr[frame], chrseq, pos, aa); + } else { /* Look for start: P_START2STOP N_START2STOP AROUND_START */ + + if (codon == START && !ORF[frame]) { + /* not in a ORF already and found a START */ + if (pos < len) { + /* + ** reset the newstr to zero length to enable + ** storing the ORF for this + */ + ajStrSetClear(&newstr[frame]); + ORF[frame] = ajTrue; /* now in an ORF */ + start[frame] = pos; /* start of the ORF for this frame */ + if (methionine) + getorf_AppORF(find, &newstr[frame], chrseq, + pos, 'M'); + else + getorf_AppORF(find, &newstr[frame], chrseq, + pos, aa); + } + } else if (codon == STOP) { + /* hit a STOP */ + + /* Already have a sequence to write out? */ + if (ORF[frame]) { + ORF[frame] = ajFalse; /* not in an ORF */ + + if (ajStrGetLen(newstr[frame]) >= minsize && + ajStrGetLen(newstr[frame]) <= maxsize) { + /* create a new sequence */ + getorf_WriteORF(seq, len, seqlen, sense, + find, orf_no, start[frame], + pos-1, newstr[frame], + seqout, around); + } + } + + /* + ** if a circular genome and hit the STOP past + ** the end of the genome in all frames, then break + */ + if (circular && pos >= len) { + LASTORF[frame] = ajTrue; /* finished getting ORFs */ + if (LASTORF[0] && LASTORF[1] && LASTORF[2]) break; + } + + ajStrSetClear(&newstr[frame]); + } else if (pos >= seqlen-5) { + /* hit the end of the sequence without a stop */ + + /* Already have a sequence to write out? */ + if (ORF[frame]) { + ORF[frame] = ajFalse; /* not in an ORF */ + + /* + ** End of the sequence? If so, append any + ** last codon to the sequence - otherwise, ignore the + ** STOP codon + */ + if (pos >= seqlen-5 && pos < seqlen-2) + getorf_AppORF(find, &newstr[frame], chrseq, + pos, aa); + + if (ajStrGetLen(newstr[frame]) >= minsize && + ajStrGetLen(newstr[frame]) <= maxsize) { + /* create a new sequence */ + getorf_WriteORF(seq, len, seqlen, sense, + find, orf_no, start[frame], + pos+2, newstr[frame], + seqout, around); + } + } + + /* + ** if a circular genome and hit the STOP past + ** the end of the genome in all frames, then break + */ + if (circular && pos >= len) { + LASTORF[frame] = ajTrue; /* finished getting ORFs */ + if (LASTORF[0] && LASTORF[1] && LASTORF[2]) break; + } + + ajStrSetClear(&newstr[frame]); + } else + if (ORF[frame]) + getorf_AppORF(find, &newstr[frame], chrseq, pos, + aa); + } - else - { - /* - ** assume already in a ORF so we get ORFs at the start of the - ** sequence - */ - ORF[0] = ajTrue; - ORF[1] = ajTrue; - ORF[2] = ajTrue; - start[0] = 0; - start[1] = 1; - start[2] = 2; + } + + /* + ** Currently miss reporting a STOP-to-STOP ORF that is + ** the full length of a circular genome when there are no STOP codons in + ** that frame + */ + if ((find == P_STOP2STOP || find == N_STOP2STOP) && circular) { + if (!GOTSTOP[0]) { + /* translate frame 1 into pep */ + pep = ajTrnSeqOrig(trnTable, seq, 1); + if (ajSeqGetLen(pep) >= minsize && + ajSeqGetLen(pep) <= maxsize) + getorf_WriteORF(seq, len, seqlen, sense, find, orf_no, + 0, seqlen-1, ajSeqGetSeqS(pep), seqout, + around); + ajSeqDel(&pep); } - for(pos=0; pos= seqlen-5) - { - - /* - ** End of the sequence? If so, append any - ** last codon to the sequence - otherwise, ignore the STOP - ** codon - */ - if(codon != STOP) - getorf_AppORF(find, &newstr[frame], chrseq, pos, - aa); - - /* Already have a sequence to write out? */ - if(ORF[frame]) - { - if(ajStrGetLen(newstr[frame]) >= minsize && - ajStrGetLen(newstr[frame]) <= maxsize) - { - /* create a new sequence */ - if(codon == STOP) - getorf_WriteORF(seq, len, seqlen, sense, - find, orf_no, start[frame], - pos-1, newstr[frame], - seqout, around); - else - getorf_WriteORF(seq, len, seqlen, sense, - find, orf_no, start[frame], - pos+2, newstr[frame], - seqout, around); - } - - ajStrSetClear(&newstr[frame]); - } - - /* - ** if its a circular genome and the STOP codon hits past - ** the end of the genome in all frames, then break - */ - if(circular && pos >= len) - { - ORF[frame] = ajFalse; /* past the end of the genome */ - LASTORF[frame] = ajTrue; /* finished getting ORFs */ - if(LASTORF[0] && LASTORF[1] && LASTORF[2]) - break; - } - else - { - /* - ** hit a STOP, therefore a potential ORF to write - ** out next time, even if the genome is circular - */ - ORF[frame] = ajTrue; - start[frame] = pos+3; /* next start of the ORF */ - } - - } - else if(ORF[frame]) - /* append sequence to newstr if in an ORF */ - getorf_AppORF(find, &newstr[frame], chrseq, pos, aa); - } - else /* Look for start: P_START2STOP N_START2STOP AROUND_START */ - { - - if(codon == START && !ORF[frame]) - { - /* not in a ORF already and found a START */ - if(pos < len) - { - /* - ** reset the newstr to zero length to enable - ** storing the ORF for this - */ - ajStrSetClear(&newstr[frame]); - ORF[frame] = ajTrue; /* now in an ORF */ - start[frame] = pos; /* start of the ORF for this frame */ - if(methionine) - getorf_AppORF(find, &newstr[frame], chrseq, - pos, 'M'); - else - getorf_AppORF(find, &newstr[frame], chrseq, - pos, aa); - } - } - else if(codon == STOP) - { - /* hit a STOP */ - - /* Already have a sequence to write out? */ - if(ORF[frame]) - { - ORF[frame] = ajFalse; /* not in an ORF */ - - if(ajStrGetLen(newstr[frame]) >= minsize && - ajStrGetLen(newstr[frame]) <= maxsize) - { - /* create a new sequence */ - getorf_WriteORF(seq, len, seqlen, sense, - find, orf_no, start[frame], - pos-1, newstr[frame], - seqout, around); - } - } - - /* - ** if a circular genome and hit the STOP past - ** the end of the genome in all frames, then break - */ - if(circular && pos >= len) - { - LASTORF[frame] = ajTrue; /* finished getting ORFs */ - if(LASTORF[0] && LASTORF[1] && LASTORF[2]) break; - } - - ajStrSetClear(&newstr[frame]); - } - else if(pos >= seqlen-5) - { - /* hit the end of the sequence without a stop */ - - /* Already have a sequence to write out? */ - if(ORF[frame]) - { - ORF[frame] = ajFalse; /* not in an ORF */ - - /* - ** End of the sequence? If so, append any - ** last codon to the sequence - otherwise, ignore the - ** STOP codon - */ - if(pos >= seqlen-5 && pos < seqlen-2) - getorf_AppORF(find, &newstr[frame], chrseq, - pos, aa); - - if(ajStrGetLen(newstr[frame]) >= minsize && - ajStrGetLen(newstr[frame]) <= maxsize) - { - /* create a new sequence */ - getorf_WriteORF(seq, len, seqlen, sense, - find, orf_no, start[frame], - pos+2, newstr[frame], - seqout, around); - } - } - - /* - ** if a circular genome and hit the STOP past - ** the end of the genome in all frames, then break - */ - if(circular && pos >= len) - { - LASTORF[frame] = ajTrue; /* finished getting ORFs */ - if(LASTORF[0] && LASTORF[1] && LASTORF[2]) break; - } - - ajStrSetClear(&newstr[frame]); - } - else - if(ORF[frame]) - getorf_AppORF(find, &newstr[frame], chrseq, pos, - aa); - - } + if (!GOTSTOP[1]) { + /* translate frame 2 into pep */ + pep = ajTrnSeqOrig(trnTable, seq, 2); + if (ajSeqGetLen(pep) >= minsize && + ajSeqGetLen(pep) <= maxsize) + getorf_WriteORF(seq, len, seqlen, sense, find, orf_no, + 1, seqlen-1, ajSeqGetSeqS(pep), seqout, + around); + ajSeqDel(&pep); } - /* - ** Currently miss reporting a STOP-to-STOP ORF that is - ** the full length of a circular genome when there are no STOP codons in - ** that frame - */ - if((find == P_STOP2STOP || find == N_STOP2STOP) && circular) - { - if(!GOTSTOP[0]) - { - /* translate frame 1 into pep */ - pep = ajTrnSeqOrig(trnTable, seq, 1); - if(ajSeqGetLen(pep) >= minsize && - ajSeqGetLen(pep) <= maxsize) - getorf_WriteORF(seq, len, seqlen, sense, find, orf_no, - 0, seqlen-1, ajSeqGetSeqS(pep), seqout, - around); - ajSeqDel(&pep); - } - - if(!GOTSTOP[1]) - { - /* translate frame 2 into pep */ - pep = ajTrnSeqOrig(trnTable, seq, 2); - if(ajSeqGetLen(pep) >= minsize && - ajSeqGetLen(pep) <= maxsize) - getorf_WriteORF(seq, len, seqlen, sense, find, orf_no, - 1, seqlen-1, ajSeqGetSeqS(pep), seqout, - around); - ajSeqDel(&pep); - } - - if(!GOTSTOP[2]) - { - /* translate frame 3 into pep */ - pep = ajTrnSeqOrig(trnTable, seq, 3); - if(ajSeqGetLen(pep) >= minsize && - ajSeqGetLen(pep) >= maxsize) - getorf_WriteORF(seq, len, seqlen, sense, find, orf_no, - 2, seqlen-1, ajSeqGetSeqS(pep), seqout, - around); - ajSeqDel(&pep); - } + if (!GOTSTOP[2]) { + /* translate frame 3 into pep */ + pep = ajTrnSeqOrig(trnTable, seq, 3); + if (ajSeqGetLen(pep) >= minsize && + ajSeqGetLen(pep) >= maxsize) + getorf_WriteORF(seq, len, seqlen, sense, find, orf_no, + 2, seqlen-1, ajSeqGetSeqS(pep), seqout, + around); + ajSeqDel(&pep); } + } - for(i=0;i<3;++i) - ajStrDel(&newstr[i]); + for (i=0;i<3;++i) + ajStrDel(&newstr[i]); - return; + return; } @@ -537,179 +521,168 @@ void getorf_FindORFs(const AjPSeq seq, ajint len, const AjPTrn trnTable, ******************************************************************************/ static void getorf_WriteORF(const AjPSeq seq, - ajint len, ajint seqlen, AjBool sense, - ajint find, ajint *orf_no, ajint start, ajint pos, - const AjPStr str, AjPSeqout seqout, ajint around) -{ - AjPSeq new; - AjPStr name = NULL; /* name of the ORF */ - AjPStr value = NULL; /* string value of the ORF number */ - ajint s; - ajint e; /* start and end positions */ - AjPStr aroundstr = NULL; /* holds sequence string around the - codon of interest */ - ajint codonpos = 0; /* holds position of start of codon - of interest */ - - s = start+1; - e = pos+1; + ajint len, ajint seqlen, AjBool sense, + ajint find, ajint *orf_no, ajint start, ajint pos, + const AjPStr str, AjPSeqout seqout, ajint around) { + AjPSeq new; + AjPStr name = NULL; /* name of the ORF */ + AjPStr value = NULL; /* string value of the ORF number */ + ajint s; + ajint e; /* start and end positions */ + AjPStr aroundstr = NULL; /* holds sequence string around the + codon of interest */ + ajint codonpos = 0; /* holds position of start of codon + of interest */ + + s = start+1; + e = pos+1; + + /* + ** it is possible for an ORF in a circular genome to appear to start + ** past the end of the genome. + ** Move the reported positions back to start in the range 1..len + ** for readability. + */ + while (s > len) { + s -= len; + e -= len; + } + + new = ajSeqNew(); + if (find == N_STOP2STOP || find == N_START2STOP || + find == AROUND_INIT_STOP || find == AROUND_END_STOP || + find == AROUND_START) + ajSeqSetNuc(new); + else + ajSeqSetProt(new); + + + /* + ** Set the start and end positions to report and get the sequence for + ** the AROUND* sequences + */ + if (find == AROUND_INIT_STOP) { + codonpos = s-3; + s = codonpos - around; /* 50 before the initial STOP */ + e = codonpos + around+2; /* 50 after the end of the STOP */ + if (s < 1) + return; + + if (e > seqlen) + return; + ajStrAssignSubS(&aroundstr, ajSeqGetSeqS(seq), s-1, e-1); + + } else if (find == AROUND_START) { + codonpos = s; + s = codonpos - around; /* 50 before the initial STOP */ + e = codonpos + around+2; /* 50 after the end of the STOP */ + if (s < 1) + return; + + if (e > seqlen) + return; + ajStrAssignSubS(&aroundstr, ajSeqGetSeqS(seq), s-1, e-1); + + } else if (find == AROUND_END_STOP) { + codonpos = e+1; + s = codonpos - around; /* 50 before the initial STOP */ + e = codonpos + around+2; /* 50 after the end of the STOP */ + if (s < 1) + return; + + if (e > seqlen) + return; + ajStrAssignSubS(&aroundstr, ajSeqGetSeqS(seq), s-1, e-1); + } + + + + /* set the name and description */ + ajStrAssignS(&name, ajSeqGetNameS(seq)); + ajStrAppendC(&name, "_"); + + /* post-increment the ORF number for the next ORF */ + ajStrFromInt(&value,(*orf_no)++); + + ajStrAppendS(&name, value); + ajSeqAssignNameS(new, name); + + /* set the description of the translation */ + ajStrAssignC(&name, "["); + + /* Reverse the reported positions if this is the reverse sense */ + if (!sense) { + s = len-s+1; + e = len-e+1; /* - ** it is possible for an ORF in a circular genome to appear to start - ** past the end of the genome. - ** Move the reported positions back to start in the range 1..len - ** for readability. + ** shift the positions back into the range 1..len as far as possible + ** without going into negative numbers */ - while(s > len) - { - s -= len; - e -= len; + while (e > len) { + s -= len; + e -= len; } - new = ajSeqNew(); - if(find == N_STOP2STOP || find == N_START2STOP || - find == AROUND_INIT_STOP || find == AROUND_END_STOP || - find == AROUND_START) - ajSeqSetNuc(new); - else - ajSeqSetProt(new); - - - /* - ** Set the start and end positions to report and get the sequence for - ** the AROUND* sequences - */ - if(find == AROUND_INIT_STOP) - { - codonpos = s-3; - s = codonpos - around; /* 50 before the initial STOP */ - e = codonpos + around+2; /* 50 after the end of the STOP */ - if(s < 1) - return; - - if(e > seqlen) - return; - ajStrAssignSubS(&aroundstr, ajSeqGetSeqS(seq), s-1, e-1); - - } - else if(find == AROUND_START) - { - codonpos = s; - s = codonpos - around; /* 50 before the initial STOP */ - e = codonpos + around+2; /* 50 after the end of the STOP */ - if(s < 1) - return; - - if(e > seqlen) - return; - ajStrAssignSubS(&aroundstr, ajSeqGetSeqS(seq), s-1, e-1); - - } - else if(find == AROUND_END_STOP) - { - codonpos = e+1; - s = codonpos - around; /* 50 before the initial STOP */ - e = codonpos + around+2; /* 50 after the end of the STOP */ - if(s < 1) - return; - - if(e > seqlen) - return; - ajStrAssignSubS(&aroundstr, ajSeqGetSeqS(seq), s-1, e-1); + while (e < 0 || s < 0) { + s += len; + e += len; } + } + /* the base before the stop codon (numbering bases from 1) */ + ajStrFromInt(&value, s); + ajStrAppendS(&name, value); + ajStrAppendC(&name, " - "); - /* set the name and description */ - ajStrAssignS(&name, ajSeqGetNameS(seq)); - ajStrAppendC(&name, "_"); + /* the base before the stop codon (numbering bases from 1) */ + ajStrFromInt(&value, e); - /* post-increment the ORF number for the next ORF */ - ajStrFromInt(&value,(*orf_no)++); - - ajStrAppendS(&name, value); - ajSeqAssignNameS(new, name); - /* set the description of the translation */ - ajStrAssignC(&name, "["); + ajStrAppendS(&name, value); + ajStrAppendC(&name, "] "); - /* Reverse the reported positions if this is the reverse sense */ - if(!sense) - { - s = len-s+1; - e = len-e+1; - - /* - ** shift the positions back into the range 1..len as far as possible - ** without going into negative numbers - */ - while(e > len) - { - s -= len; - e -= len; - } + /* make it clear if this is the reverse sense */ + if (!sense) + ajStrAppendC(&name, "(REVERSE SENSE) "); - while(e < 0 || s < 0) - { - s += len; - e += len; - } - } - /* the base before the stop codon (numbering bases from 1) */ - ajStrFromInt(&value, s); - - ajStrAppendS(&name, value); - ajStrAppendC(&name, " - "); + /* + ** make it clear if this is a circular genome and the ORF crosses + ** the breakpoint + */ + if (s> len || e > len) + ajStrAppendC(&name, "(ORF crosses the breakpoint) "); - /* the base before the stop codon (numbering bases from 1) */ - ajStrFromInt(&value, e); - + if (find == AROUND_INIT_STOP || find == AROUND_START || + find == AROUND_END_STOP) { + ajStrAppendC(&name, "Around codon at "); + ajStrFromInt(&value, codonpos); ajStrAppendS(&name, value); - ajStrAppendC(&name, "] "); + ajStrAppendC(&name, ". "); + } - /* make it clear if this is the reverse sense */ - if(!sense) - ajStrAppendC(&name, "(REVERSE SENSE) "); + ajStrAppendS(&name, ajSeqGetDescS(seq)); + ajSeqAssignDescS(new, name); - - /* - ** make it clear if this is a circular genome and the ORF crosses - ** the breakpoint - */ - if(s> len || e > len) - ajStrAppendC(&name, "(ORF crosses the breakpoint) "); + /* replace newstr in new */ + if (find == N_STOP2STOP || find == N_START2STOP || find == P_STOP2STOP || + find == P_START2STOP) + ajSeqAssignSeqS(new, str); + else + /* sequence to be 50 bases around the codon */ + ajSeqAssignSeqS(new, aroundstr); - if(find == AROUND_INIT_STOP || find == AROUND_START || - find == AROUND_END_STOP) - { - ajStrAppendC(&name, "Around codon at "); - ajStrFromInt(&value, codonpos); - ajStrAppendS(&name, value); - ajStrAppendC(&name, ". "); - } + ajSeqoutWriteSeq(seqout, new); - ajStrAppendS(&name, ajSeqGetDescS(seq)); - ajSeqAssignDescS(new, name); + ajSeqDel(&new); + ajStrDel(&value); + ajStrDel(&name); - - /* replace newstr in new */ - if(find == N_STOP2STOP || find == N_START2STOP || find == P_STOP2STOP || - find == P_START2STOP) - ajSeqAssignSeqS(new, str); - else - /* sequence to be 50 bases around the codon */ - ajSeqAssignSeqS(new, aroundstr); - - ajSeqoutWriteSeq(seqout, new); - - ajSeqDel(&new); - ajStrDel(&value); - ajStrDel(&name); - - return; + return; } @@ -728,20 +701,17 @@ static void getorf_WriteORF(const AjPSeq seq, ******************************************************************************/ static void getorf_AppORF(ajint find, AjPStr *str, - const char *chrseq, ajint pos, - char aa) -{ - - if(find == N_STOP2STOP || find == N_START2STOP || - find == AROUND_INIT_STOP || find == AROUND_END_STOP) - { - ajStrAppendK(str, chrseq[pos]); - ajStrAppendK(str, chrseq[pos+1]); - ajStrAppendK(str, chrseq[pos+2]); - } - else if(find == P_STOP2STOP || find == P_START2STOP || - find == AROUND_START) - ajStrAppendK(str, aa); - - return; + const char *chrseq, ajint pos, + char aa) { + + if (find == N_STOP2STOP || find == N_START2STOP || + find == AROUND_INIT_STOP || find == AROUND_END_STOP) { + ajStrAppendK(str, chrseq[pos]); + ajStrAppendK(str, chrseq[pos+1]); + ajStrAppendK(str, chrseq[pos+2]); + } else if (find == P_STOP2STOP || find == P_START2STOP || + find == AROUND_START) + ajStrAppendK(str, aa); + + return; } diff --git a/src/clibs/emboss/src/biolib_getorf.h b/src/clibs/emboss/src/biolib_getorf.h index 12b200e..d8b45fa 100644 --- a/src/clibs/emboss/src/biolib_getorf.h +++ b/src/clibs/emboss/src/biolib_getorf.h @@ -3,19 +3,19 @@ #include #ifdef __cplusplus - extern "C" { +extern "C" { #endif -struct ORFstruct { - AjPSeq seq; - int frame; - unsigned int start, stop; -}; + struct ORFstruct { + AjPSeq seq; + int frame; + unsigned int start, stop; + }; -typedef struct ORFstruct ORF; + typedef struct ORFstruct ORFrec; -ORF *biolib_getorf(AjPSeq seq, AjPTrn table, unsigned int minsize); +// ORF *biolib_getorf(AjPSeq seq, AjPTrn table, unsigned int minsize); #ifdef __cplusplus - } +} #endif