diff --git a/Makefile b/Makefile index 8874681..8e8f80e 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,13 @@ +MAKEFLAGS += --no-print-directory CC = g++ ifdef debug CFLAGS= -O0 -g -fomit-frame-pointer else -CFLAGS= -O4 -fomit-frame-pointer +CFLAGS= -O3 -fomit-frame-pointer endif -SUBDIRS = sparsePregraph standardPregraph -PROG= SOAPdenovo-63mer SOAPdenovo-127mer +SUBDIRS = sparsePregraph standardPregraph fusion +PROG= SOAPdenovo-63mer SOAPdenovo-127mer SOAPdenovo-fusion INCLUDES= -I./sparsePregraph/inc -I./standardPregraph/inc LIBPATH= -L/lib64 -L/usr/lib64 -L./sparsePregraph/inc -L./standardPregraph/inc @@ -23,15 +24,6 @@ EXTRA_FLAGS += -Wl,--hash-style=both LIBS += -lbam -lrt endif -ifneq (,$(findstring Unix,$(shell uname))) -EXTRA_FLAGS += -Wl,--hash-style=both -LIBS += -lbam -lrt -endif - -ifneq (,$(findstring Darwin,$(shell uname))) -LIBS += -lbammac -endif - ifneq (,$(findstring $(shell uname -m), x86_64)) CFLAGS += -m64 endif @@ -45,7 +37,10 @@ CFLAGS += -mpowerpc64 endif -all: SOAPdenovo-63mer SOAPdenovo-127mer +all: SOAPdenovo-63mer SOAPdenovo-127mer SOAPdenovo-fusion + +SOAPdenovo-fusion: + @cd fusion;make;cp SOAPdenovo-fusion ../;cd ..; ifdef debug SOAPdenovo-63mer: @@ -56,10 +51,6 @@ SOAPdenovo-127mer: @cd sparsePregraph;make 127mer=1 debug=1;cd ..; @cd standardPregraph;make 127mer=1 debug=1;cd ..; @$(CC) sparsePregraph/*.o standardPregraph/*.o $(LIBPATH) $(LIBS) $(EXTRA_FLAGS) -o SOAPdenovo-127mer -clean: - @cd sparsePregraph;make clean;cd ..; - @cd standardPregraph;make clean;cd ..; - @rm SOAPdenovo-63mer SOAPdenovo-127mer -f else SOAPdenovo-63mer: @cd sparsePregraph;make 63mer=1;cd ..; @@ -69,8 +60,10 @@ SOAPdenovo-127mer: @cd sparsePregraph;make 127mer=1;cd ..; @cd standardPregraph;make 127mer=1;cd ..; @$(CC) sparsePregraph/*.o standardPregraph/*.o $(LIBPATH) $(LIBS) $(EXTRA_FLAGS) -o SOAPdenovo-127mer +endif + clean: @cd sparsePregraph;make clean;cd ..; @cd standardPregraph;make clean;cd ..; - @rm SOAPdenovo-63mer SOAPdenovo-127mer -f -endif + 
@cd fusion;make clean;cd ..; + @rm -f SOAPdenovo-63mer SOAPdenovo-127mer SOAPdenovo-fusion diff --git a/README.md b/README.md index e3801a6..e2ecc4d 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,19 @@ # Manual of SOAPdenovo2 -## What's next of SOAPdenovo2 +## About MEGAHIT -MEGAHIT is the formal successor of SOAPdenovo2 +MEGAHIT works with single-cell sequencing data and metagenomics data. Compared to SOAPdenovo, it generates longer contigs and consumes less memory. +To scaffold the contigs generated by MEGAHIT, please use SOAPdenovo-fusion. It is a preparation module that takes contigs as input and generates files that can be used subsequently by SOAPdenovo's map and scaff modules. + +Reference: MEGAHIT: An ultra-fast single-node solution for large and complex metagenomics assembly via succinct de Bruijn graph -http://www.ncbi.nlm.nih.gov/pubmed/25609793 -https://github.com/voutcn/megahit +[Manuscript](http://www.ncbi.nlm.nih.gov/pubmed/25609793) +[GitHub](https://github.com/voutcn/megahit) + +## For Mac users + +Please use brew to install SOAPdenovo. SOAPdenovo's package in Homebrew-science is managed by Shaun Jackman. ## Introduction diff --git a/fusion/Makefile b/fusion/Makefile new file mode 100755 index 0000000..b33e154 --- /dev/null +++ b/fusion/Makefile @@ -0,0 +1,46 @@ +# Generated automatically from Makefile.in by configure. +SHELL = /bin/sh + +exec_prefix = . 
+bindir = $(exec_prefix)/bin +libdir = +mandir = + +CC = gcc +CCOPT = -O3 -fprefetch-loop-arrays -funroll-loops -fomit-frame-pointer -w +LIBS = -lm -lpthread +INCDIRS = -Iinc/ +CFLAGS = ${CCOPT} ${INCDIRS} + +all: clean SOAPdenovo-fusion +SRCS1 = searchPath.c scaffold.c check.c seq.c bundle.c potential.c\ + loadGraph.c mem_manager.c attachPEinfo.c newhash.c\ + output_scaffold.c orderContig.c connect.c hashFunction.c\ + readseq1by1.c fib.c fibHeap.c stack.c kmer.c prepare.c +OBJS1 = searchPath.o scaffold.o check.o seq.o bundle.o potential.o\ + loadGraph.o mem_manager.o attachPEinfo.o newhash.o\ + output_scaffold.o orderContig.o connect.o hashFunction.o\ + readseq1by1.o fib.o fibHeap.o stack.o kmer.o prepare.o + +SRCS2 = prlHashCtg.c prlRead2Ctg.c map.c localAsm.c\ + lib.c darray.c prlReadFillGap.c read2scaf.c +OBJS2 = prlHashCtg.o prlRead2Ctg.o map.o localAsm.o\ + lib.o darray.o prlReadFillGap.o read2scaf.o + + +SRCS3 = main.c +OBJS3 = main.o + +.c.o : + @printf "Compiling $<... \r" + @$(CC) $(CFLAGS) -c $< + +SOAPdenovo-fusion: $(OBJS1) $(OBJS2) $(OBJS3) + @printf "Making $@... \r" + @$(CC) $(CCOPT) -o $@ $^ $(LIBS) + @printf "$@ compilation done.\n"; + +clean: + @/bin/rm -f *.o SOAPdenovo-fusion + @printf "SOAPdenovo-fusion cleaning done. 
\n" + diff --git a/fusion/attachPEinfo.c b/fusion/attachPEinfo.c new file mode 100755 index 0000000..b733e80 --- /dev/null +++ b/fusion/attachPEinfo.c @@ -0,0 +1,488 @@ +#include "stdinc.h" +#include "newhash.h" +#include "extfunc.h" +#include "extvab.h" +#include "stack.h" + +#define CNBLOCKSIZE 10000 +#define GAPARRSIZE 256 +#define BIG_NEG -10000000 +#define BIG_POS 10000000 +static STACK * isStack; +static int ignorePE1,ignorePE2,ignorePE3,ignorePE4,ignorePE5,static_flag; +static int onsameCtgPE; +static unsigned long long peSUM; + +//static boolean staticF; + +static int existCounter; + +int calcuIS(STACK *intStack,int *SD); + + +static int cmp_pe(const void *a,const void *b) +{ + PE_INFO *A,*B; + A = (PE_INFO *)a; + B = (PE_INFO *)b; + + if(A->rank>B->rank) + return 1; + else if(A->rank==B->rank) + return 0; + else + return -1; +} + +void loadPEgrads(char *infile) +{ + FILE *fp; + char name[256],line[1024]; + int i; + boolean rankSet=1; + + sprintf(name,"%s.peGrads",infile); + fp = fopen(name,"r"); + if(!fp){ + printf("can not open file %s .\n",name); + gradsCounter = 0; + return; + } + + while(fgets(line,sizeof(line),fp)!=NULL){ + if(line[0] == 'g'){ + sscanf(line+10, "%d %lld %d",&gradsCounter,&n_solexa,&maxReadLen); + //printf("there're %d grads, %lld reads, max read len %d\n",gradsCounter,n_solexa,maxReadLen); + printf("[%s]reads statistic : %lld reads with max len %d in %d grads .\n",__FUNCTION__,n_solexa,maxReadLen,gradsCounter); + break; + } + } + + alloc_pe_mem(gradsCounter); + + for(i=0;i255) + weight = 255; + + connect = getCntBetween(e1, e2); + if(connect){ + if(!weight) + return connect; + existCounter++; + if(!inherit){ + sum = connect->weightNotInherit*connect->gapLen + gap*weight; + connect->gapLen = sum/(connect->weightNotInherit+weight); + if(connect->weightNotInherit+weight <=255) + connect->weightNotInherit += weight; + else if(connect->weightNotInherit<255) + connect->weightNotInherit = 255; + }else{ + sum = connect->weight*connect->gapLen 
+ gap*weight; + connect->gapLen = sum/(connect->weight+weight); + if(!connect->inherit){ + connect->maxSingleWeight = connect->weightNotInherit; + } + connect->inherit = 1; + connect->maxSingleWeight = connect->maxSingleWeight>weight ? + connect->maxSingleWeight:weight; + } + if(connect->weight+weight <=255){ + connect->weight += weight; + }else if(connect->weight<255){ + connect->weight = 255; + } + + }else{ + newCntCounter++; + connect = allocateCN(e2,gap); + if(cntLookupTable) + putCnt2LookupTable(e1,connect); + connect->weight = weight; + if(contig_array[e1].mask||contig_array[e2].mask){ + connect->mask = 1; + } + connect->next = contig_array[e1].downwardConnect; + contig_array[e1].downwardConnect = connect; + if(!inherit){ + connect->weightNotInherit = weight; + }else{ + connect->weightNotInherit = 0; + connect->inherit = 1; + connect->maxSingleWeight = weight; + } + } + + return connect; +} +CONNECT *add1AccuConnect(unsigned int e1, unsigned int e2, int gap, int weight) +{ + if(e1==e2||e1==getTwinCtg(e2)) + return NULL; + CONNECT *connect=NULL; + //long long sum; + if(weight>255) + weight = 255; + + connect = getCntBetween(e1, e2); + if(connect){ + if(!weight) + return connect; + existCounter++; + //if(!inherit){ + //sum = connect->weightNotInherit*connect->gapLen + gap*weight; + //connect->gapLen = sum/(connect->weightNotInherit+weight); + int i=connect->weightNotInherit; + + if(connect->weightNotInherit+weight <=255) + connect->weightNotInherit += weight; + else if(connect->weightNotInherit<255) + connect->weightNotInherit = 255; + for(;iweightNotInherit;i++){ + connect->PE[i]=gap; + fprintf(stderr,"inputting a PE with estimated gap size %d\n",gap); + } + /*}else{ + //sum = connect->weight*connect->gapLen + gap*weight; + //connect->gapLen = sum/(connect->weight+weight); + if(!connect->inherit){ + connect->maxSingleWeight = connect->weightNotInherit; + } + connect->inherit = 1; + connect->maxSingleWeight = connect->maxSingleWeight>weight ? 
+ connect->maxSingleWeight:weight; + }*/ + if(connect->weight+weight <=255){ + connect->weight += weight; + }else if(connect->weight<255){ + connect->weight = 255; + } + + }else{ + newCntCounter++; + connect = allocateCN(e2,gap); + if(cntLookupTable) + putCnt2LookupTable(e1,connect); + connect->weight = weight; + connect->PE=(int *)ckalloc(GAPARRSIZE*sizeof(int));//newly added + fprintf(stderr,"creating array for PEs in a connection.\n"); + int i; + for(i=0;iPE[i]=gap; + fprintf(stderr,"inputting a PE with estimated gap size %d\n",gap); + } + if(contig_array[e1].mask||contig_array[e2].mask){ + connect->mask = 1; + } + connect->next = contig_array[e1].downwardConnect; + contig_array[e1].downwardConnect = connect; + //if(!inherit){ + connect->weightNotInherit = weight; + /*}else{ + connect->weightNotInherit = 0; + connect->inherit = 1; + connect->maxSingleWeight = weight; + }*/ + } + + return connect; +} +int attach1PE(unsigned int e1,int pre_pos,unsigned int bal_e2,int pos,int insert_size) +{ + int gap,realpeSize; + unsigned int bal_e1,e2; + if(e1==bal_e2){ + ignorePE1++; + return -1; //orientation wrong + } + + bal_e1 = getTwinCtg(e1); + e2 = getTwinCtg(bal_e2); + if(e1==e2){ + realpeSize = contig_array[e1].length + overlaplen - pre_pos - pos; + if(realpeSize>0){ + peSUM += realpeSize; + onsameCtgPE++; + if((int)contig_array[e1].length>insert_size){ + int *item = (int *)stackPush(isStack); + (*item) = realpeSize; + } + } + return 2; + } + + gap = insert_size - overlaplen + pre_pos + pos - contig_array[e1].length - contig_array[e2].length; + //fprintf(stderr,"[%s]\tgap\t%d\t%d\t%f\t%f\n",__FUNCTION__,gap,insert_size,close_threshold,insert_size*close_threshold); + if(gap<-(insert_size*close_threshold)){ + ignorePE2++; + return 0; + } + if(gap>insert_size){ + ignorePE3++; + return 0; + } + add1AccuConnect(e1,e2,gap,1); + add1AccuConnect(bal_e2,bal_e1,gap,1); + + return 1; +} + +int connectByPE_grad(FILE *fp,int peGrad,char *line) +{ + fprintf(stderr,"[%s]entering this 
function.\n",__FUNCTION__); + long long pre_readno,readno,minno,maxno; + int pre_pos,pos,flag,PE,count=0; + unsigned int pre_contigno,contigno,newIndex; + + if(peGrad<0||peGrad>gradsCounter){ + printf("[%s]specified pe grad is out of bound .\n",__FUNCTION__); + return 0; + } + maxno = pes[peGrad].PE_bound; + if(peGrad==0) + minno = 0; + else + minno = pes[peGrad-1].PE_bound; + + onsameCtgPE = peSUM = 0; + PE = pes[peGrad].insertS; + if(strlen(line)){ + sscanf(line,"%lld %d %d",&pre_readno,&pre_contigno,&pre_pos); + //printf("first record %d %d %d\n",pre_readno,pre_contigno,pre_pos); + if(pre_readno<=minno) + pre_readno = -1; + } + else + pre_readno = -1; + ignorePE1 = ignorePE2 = ignorePE3 = ignorePE4 = ignorePE5 = 0; + static_flag = 1; + isStack = (STACK *)createStack(CNBLOCKSIZE,sizeof(int)); + while(fgets(line,lineLen,fp)!=NULL){ + sscanf(line,"%lld %d %d",&readno,&contigno,&pos); + if(readno>maxno) + break; + if(readno<=minno) + continue; + + newIndex = index_array[contigno]; + //if(contig_array[newIndex].bal_edge==0) + if(isSameAsTwin(newIndex)) + continue; + if(PE&&(readno%2==0)&&(pre_readno==readno-1)){ // they are a pair of reads + flag = attach1PE(pre_contigno,pre_pos,newIndex,pos,PE); + if(flag==1) + count++; + } + pre_readno = readno; + pre_contigno = newIndex; + pre_pos = pos; + } + printf("[%s]Finish loading all PEs in grad %d .\n",__FUNCTION__,peGrad); + printf("[%s]Calculating estimated gap size for all connections .\n",__FUNCTION__); + unsigned int i; + for(i=1;i<=num_ctg;i++){ + CONNECT *tmp=contig_array[i].downwardConnect; + while(tmp){ + if(tmp->weightNotInherit<=8&&tmp->weightNotInherit>2){//delete max and min value + int max=BIG_NEG,maxid=-1,min=BIG_POS,minid=-1; + int weight=tmp->weightNotInherit; + int ii; + for(ii=0;iiPE[ii]>max){ + max=tmp->PE[ii]; + maxid=ii; + } + if(tmp->PE[ii]<=min){ + min=tmp->PE[ii]; + minid=ii; + } + } + int sum=0; + for(ii=0;iiPE[ii]; + } + } + ignorePE4+=2; + tmp->gapLen=sum/(weight-2); + fprintf(stderr,"estimating 
contigs' gap by removing max&min PE ,with max&min %d %d\n", + tmp->PE[maxid],tmp->PE[minid]); + }else if(tmp->weightNotInherit>8){//delete values exceed 3*SD + long long int sum=0; + int weight=tmp->weightNotInherit; + int ii; + int counter=0; + for(ii=0;iiPE[ii]; + } + + long long int avg=sum/weight; + sum = 0; + for(ii=0;iiPE[ii]-avg)*(tmp->PE[ii]-avg)); + } + + double SD=(sqrt((double)sum/(weight-1)))*3;//just for fast + sum=0; + int num=0; + for(ii=0;iiPE[ii]-avg)<=SD){ + sum+=tmp->PE[ii]; + num++; + }else{ + ignorePE5++; + counter++; + } + } + if(num==0){ + fprintf(stderr,"[%s]num=0 in removing exceed 3*SD(%.1f) avg(%d)step",__FUNCTION__,SD,avg); + for(ii=0;iiPE[ii]); + } + } + tmp->gapLen=sum/num; + fprintf(stderr,"estimating contigs' gap by removing PE exceeding 3*SD ,removing %d PEs\n",counter); + }else if(tmp->weightNotInherit<=2){ + int weight=tmp->weightNotInherit; + int sum=0; + int ii; + for(ii=0;iiPE[ii]; + } + tmp->gapLen=sum/weight; + fprintf(stderr,"weight too small , directly estimate gap size.\n"); + } + //fprintf(stderr,"finish %d connection.\n",i); + free((void *)tmp->PE); + tmp=tmp->next; + } + } + //printf("%d PEs with insert size %d attached, %d + %d + %d ignored\n",count,PE,ignorePE1,ignorePE2,ignorePE3); + fprintf(stderr,"[%s]%d PEs of insert size %d loaded .\n",__FUNCTION__,count,PE); + fprintf(stderr,"[%s]PEs discarded:%d because of wrong orientation,%d too close,%d too far,\n",__FUNCTION__,ignorePE1,ignorePE2,ignorePE3); + fprintf(stderr,"[%s]%d deleted by removing max&min , %d not fall in 3*SD.\n",__FUNCTION__,ignorePE4,ignorePE5); + printf("[%s]%d PEs of insert size %d loaded .\n",__FUNCTION__,count,PE); + printf("[%s]PEs discarded :%d because of wrong orientation,%d too close,%d too far ,\n",__FUNCTION__,ignorePE1,ignorePE2,ignorePE3); + printf("[%s]%d deleted by removing max&min , %d not fall in 3*SD .\n",__FUNCTION__,ignorePE4,ignorePE5); + + if(onsameCtgPE>0){ + //printf("estimated PE size %lli, by %d 
pairs\n",peSUM/onsameCtgPE,onsameCtgPE); + int SD=0; + int avg=calcuIS(isStack,&SD); + printf("[%s]%d PE attached on same contig with estimated insert size %d SD %d .\n",__FUNCTION__,onsameCtgPE,avg,SD); + } + //printf("on contigs longer than %d, %d pairs found,",PE,isStack->item_c); + //printf("insert_size estimated: %d\n",calcuIS(isStack)); + freeStack(isStack); + return count; +} + + +int calcuIS(STACK *intStack,int *SD) +{ + long long sum=0; + int avg=0; + int *item; + int num = intStack->item_c; + + if(num<100) + return avg; + stackBackup(intStack); + while((item=(int *)stackPop(intStack))!=NULL) + sum += *item; + + stackRecover(intStack); + num = intStack->item_c; + avg = sum/num; + + sum = 0; + stackBackup(intStack); + while((item=(int *)stackPop(intStack))!=NULL) + sum += (*item-avg)*(*item-avg); + + *SD = sqrt(sum/(num-1)); + if(SD==0){ + //printf("SD=%d, ",SD); + return avg; + } + stackRecover(intStack); + sum = num = 0; + while((item=(int *)stackPop(intStack))!=NULL) + if(abs(*item-avg)<3**SD){ + sum += *item; + num++; + } + + avg = sum/num; + //printf("SD=%d, ",SD); + return avg; + +} + +unsigned int getTwinCtg(unsigned int ctg) +{ + return ctg + contig_array[ctg].bal_edge - 1; +} + +boolean isSmallerThanTwin(unsigned int ctg) +{ + return contig_array[ctg].bal_edge > 1; +} + +boolean isLargerThanTwin(unsigned int ctg) +{ + return contig_array[ctg].bal_edge < 1; +} + +boolean isSameAsTwin(unsigned int ctg) +{ + return contig_array[ctg].bal_edge == 1; +} diff --git a/fusion/bundle.c b/fusion/bundle.c new file mode 100755 index 0000000..4bc1efa --- /dev/null +++ b/fusion/bundle.c @@ -0,0 +1,455 @@ +#include "stdinc.h" +#include "newhash.h" +#include "extfunc.h" +#include "extvab.h" +#include "dfibHeap.h" +#include "fibHeap.h" +#include "darray.h" + + +#define CNBLOCKSIZE 10000 +#define GAPARRSIZE 256 +#define BIG_NEG -10000000 +#define BIG_POS 10000000 +static STACK * isStack; +static int onsameCtgPE; +extern int calcuIS(STACK *intStack,int *SD); +void 
outputBundle(FILE *fp, int insertS); + +static CONNECT *bun1AccuConnect(unsigned int e1, unsigned int e2, int gap, int weight) +{ + if(e1==e2||e1==getTwinCtg(e2)) + return NULL; + CONNECT *connect=NULL; + //long long sum; + if(weight>255) + weight = 255; + + connect = getCntBetween(e1, e2); + if(connect){ + if(!weight) + return connect; + //existCounter++; + //if(!inherit){ + //sum = connect->weightNotInherit*connect->gapLen + gap*weight; + //connect->gapLen = sum/(connect->weightNotInherit+weight); + int i=connect->weightNotInherit; + + if(connect->weightNotInherit+weight <=255) + connect->weightNotInherit += weight; + else if(connect->weightNotInherit<255) + connect->weightNotInherit = 255; + for(;iweightNotInherit;i++){ + //connect->PE[i]=gap; + //fprintf(stderr,"inputting a PE with estimated gap size %d\n",gap); + } + /*}else{ + //sum = connect->weight*connect->gapLen + gap*weight; + //connect->gapLen = sum/(connect->weight+weight); + if(!connect->inherit){ + connect->maxSingleWeight = connect->weightNotInherit; + } + connect->inherit = 1; + connect->maxSingleWeight = connect->maxSingleWeight>weight ? 
+ connect->maxSingleWeight:weight; + }*/ + if(connect->weight+weight <=255){ + connect->weight += weight; + }else if(connect->weight<255){ + connect->weight = 255; + } + + }else{ + newCntCounter++; + connect = allocateCN(e2,gap); + if(cntLookupTable) + putCnt2LookupTable(e1,connect); + connect->weight = weight; + //connect->PE=(int *)ckalloc(GAPARRSIZE*sizeof(int));//newly added + //fprintf(stderr,"creating array for PEs in a connection.\n"); + int i; + for(i=0;iPE[i]=gap; + //fprintf(stderr,"inputting a PE with estimated gap size %d\n",gap); + } + if(contig_array[e1].mask||contig_array[e2].mask){ + connect->mask = 1; + } + connect->next = contig_array[e1].downwardConnect; + contig_array[e1].downwardConnect = connect; + //if(!inherit){ + connect->weightNotInherit = weight; + /*}else{ + connect->weightNotInherit = 0; + connect->inherit = 1; + connect->maxSingleWeight = weight; + }*/ + } + + return connect; +} + +static int in1PE(unsigned int e1,int pre_pos,unsigned int bal_e2,int pos,int insert_size) +{ + int gap,realpeSize; + unsigned int bal_e1,e2; + if(e1==bal_e2){ + //ignorePE1++; + return -1; //orientation wrong + } + + bal_e1 = getTwinCtg(e1); + e2 = getTwinCtg(bal_e2); + if(e1==e2){ + realpeSize = contig_array[e1].length + overlaplen - pre_pos - pos; + if(realpeSize>0){ + //peSUM += realpeSize; + onsameCtgPE++; + if((int)contig_array[e1].length>insert_size){ + int *item = (int *)stackPush(isStack); + (*item) = realpeSize; + } + } + return 2; + } + + gap = insert_size - overlaplen + pre_pos + pos - contig_array[e1].length - contig_array[e2].length; + //fprintf(stderr,"[%s]\t%d\t%d\tgap\t%d\t%d\t%d\t%d\n",__FUNCTION__,e1,e2,gap,contig_array[e1].bal_edge,contig_array[e2].bal_edge,insert_size); + //if(gap<-(insert_size/10)){ + // //ignorePE2++; + // return 0; + //} + bun1AccuConnect(e1,e2,gap,1); + bun1AccuConnect(bal_e2,bal_e1,gap,1); + + return 1; +} + +static int inputPE(FILE *fp,int peGrad,char *line) +{ + long long pre_readno,readno,minno,maxno; + int 
pre_pos,pos,flag,PE,count=0; + unsigned int pre_contigno,contigno,newIndex; + + if(peGrad<0||peGrad>gradsCounter){ + printf("[%s]specified pe grad is out of bound .\n",__FUNCTION__); + return 0; + } + maxno = pes[peGrad].PE_bound; + if(peGrad==0) + minno = 0; + else + minno = pes[peGrad-1].PE_bound; + + //onsameCtgPE = peSUM = 0; + PE = pes[peGrad].insertS; + if(strlen(line)){ + sscanf(line,"%lld %d %d",&pre_readno,&pre_contigno,&pre_pos); + //printf("first record %d %d %d\n",pre_readno,pre_contigno,pre_pos); + if(pre_readno<=minno) + pre_readno = -1; + } + else + pre_readno = -1; + //ignorePE1 = ignorePE2 = ignorePE3 = ignorePE4 = ignorePE5 = 0; + //static_flag = 1; + isStack = (STACK *)createStack(CNBLOCKSIZE,sizeof(int)); + while(fgets(line,lineLen,fp)!=NULL){ + sscanf(line,"%lld %d %d",&readno,&contigno,&pos); + if(readno>maxno) + break; + if(readno<=minno) + continue; + + newIndex = index_array[contigno]; + //if(contig_array[newIndex].bal_edge==0) + if(isSameAsTwin(newIndex)) + continue; + if(PE&&(readno%2==0)&&(pre_readno==readno-1)){ // they are a pair of reads + flag = in1PE(pre_contigno,pre_pos,newIndex,pos,PE); + if(flag==1) + count++; + } + pre_readno = readno; + pre_contigno = newIndex; + pre_pos = pos; + } + printf("[%s]Finish loading all PEs in grad %d .\n",__FUNCTION__,peGrad); + printf("[%s]Calculating estimated gap size for all connections .\n",__FUNCTION__); + /*unsigned int i; + for(i=1;i<=num_ctg;i++){ + CONNECT *tmp=contig_array[i].downwardConnect; + while(tmp){ + if(tmp->weightNotInherit<=8&&tmp->weightNotInherit>2){//delete max and min value + int max=BIG_NEG,maxid=-1,min=BIG_POS,minid=-1; + int weight=tmp->weightNotInherit; + int ii; + for(ii=0;iiPE[ii]>max){ + max=tmp->PE[ii]; + maxid=ii; + } + if(tmp->PE[ii]<=min){ + min=tmp->PE[ii]; + minid=ii; + } + } + int sum=0; + for(ii=0;iiPE[ii]; + } + } + //ignorePE4+=2; + tmp->gapLen=sum/(weight-2); + //fprintf(stderr,"estimating contigs' gap by removing max&min PE ,with max&min %d %d\n", + 
//tmp->PE[maxid],tmp->PE[minid]); + }else if(tmp->weightNotInherit>8){//delete values exceed 3*SD + long long int sum=0; + int weight=tmp->weightNotInherit; + int ii; + int counter=0; + for(ii=0;iiPE[ii]; + } + + long long int avg=sum/weight; + sum = 0; + for(ii=0;iiPE[ii])*(avg-(long long int)tmp->PE[ii])); + } + + double SD=(sqrt((double)sum/(weight-1)))*3;//just for fast + sum=0; + int num=0; + for(ii=0;iiPE[ii]-avg)<=SD){ + sum+=tmp->PE[ii]; + num++; + }else{ + //ignorePE5++; + counter++; + } + } + if(num==0){ + //fprintf(stderr,"[%s]num=0 in removing exceed 3*SD(%.1f) avg(%lld)step",__FUNCTION__,SD,avg); + for(ii=0;iiPE[ii]); + } + } + tmp->gapLen=sum/num; + //fprintf(stderr,"estimating contigs' gap by removing PE exceeding 3*SD ,removing %d PEs\n",counter); + }else if(tmp->weightNotInherit<=2){ + int weight=tmp->weightNotInherit; + int sum=0; + int ii; + for(ii=0;iiPE[ii]; + } + tmp->gapLen=sum/weight; + //fprintf(stderr,"weight too small , directly estimate gap size.\n"); + //} + //fprintf(stderr,"finish %d connection.\n",i); + //free((void *)tmp->PE); + tmp=tmp->next; + } + }*/ + //printf("%d PEs with insert size %d attached, %d + %d + %d ignored\n",count,PE,ignorePE1,ignorePE2,ignorePE3); + fprintf(stderr,"[%s]%d PEs of insert size %d loaded .\n",__FUNCTION__,count,PE); + //fprintf(stderr,"[%s]PEs discarded:%d because of wrong orientation,%d too close,%d too far,\n",__FUNCTION__,ignorePE1,ignorePE2,ignorePE3); + //fprintf(stderr,"[%s]%d deleted by removing max&min , %d not fall in 3*SD.\n",__FUNCTION__,ignorePE4,ignorePE5); + //printf("[%s]%d PEs of insert size %d loaded .\n",__FUNCTION__,count,PE); + //printf("[%s]PEs discarded :%d because of wrong orientation,%d too close,%d too far ,\n",__FUNCTION__,ignorePE1,ignorePE2,ignorePE3); + //printf("[%s]%d deleted by removing max&min , %d not fall in 3*SD .\n",__FUNCTION__,ignorePE4,ignorePE5); + + /*if(onsameCtgPE>0){ + //printf("estimated PE size %lli, by %d pairs\n",peSUM/onsameCtgPE,onsameCtgPE); + int 
SD=0; + int avg=calcuIS(isStack,&SD); + printf("[%s]%d PE attached on same contig with estimated insert size %d SD %d .\n",__FUNCTION__,onsameCtgPE,avg,SD); + }*/ + //printf("on contigs longer than %d, %d pairs found,",PE,isStack->item_c); + //printf("insert_size estimated: %d\n",calcuIS(isStack)); + //freeStack(isStack); + return count; +} + +int call_bundle(){ + char name[256],*line; + FILE *fp,*linkF; + int i; + int flag=0; + unsigned int j; + + loadUpdatedEdges(graphfile); + + //sprintf(name,"%s.bundle",graphfile); + + linkF = ckopen(name,"w"); + + if(!pes) + loadPEgrads(graphfile); + + sprintf(name,"%s.readOnContig",graphfile); + fp = ckopen(name,"r"); + + lineLen = 1024; + line = (char *)ckalloc(lineLen*sizeof(char)); + + fgets(line,lineLen,fp); + line[0] = '\0'; + + //printf("\n"); + newCntCounter = 0; + //createCntMemManager(); + //createCntLookupTable(); + /*int *length_array = (unsigned int *)ckalloc((num_ctg+1)*sizeof(unsigned int)); + //use length_array to change info in index_array + for(i=1;i<=num_ctg;i++) + length_array[i] = 0; + + for(i=1;i<=num_ctg;i++){ + if(index_array[i]>0) + length_array[index_array[i]] = i; + } + for(i=1;i<=num_ctg;i++) + index_array[i] = length_array[i]; + */ + for(i=0;iPE); + tmp=tmp->next; + } + contig_array[j].downwardConnect = NULL; + } + //destroyConnectMem(); + //deleteCntLookupTable(); + + fclose(linkF); + } + + outputBundle(linkF,1); + destroyConnectMem(); + deleteCntLookupTable(); + + free((void *)line); + fclose(fp); + //fclose(linkF); + printf("[%s]all PEs attached\n",__FUNCTION__); + + return 0; +} + +void outputBundle(FILE *fp, int insertS) +{ + unsigned int i,bal_ctg,bal_toCtg; + CONNECT *cnts,*temp_cnt; + //printf("outputLinks, %d contigs\n",num_ctg); + for(i=1;i<=num_ctg;i++){ + cnts = contig_array[i].downwardConnect; + bal_ctg = getTwinCtg(i); + //fprintf(stderr,"contig %d.\n",i); + while(cnts){ + if(cnts->weightNotInherit<=bund_threshold){ + cnts = cnts->next; + continue; + } + //fprintf(stderr,"with contig 
%d.\n",cnts->contigID); + //fprintf(fp,"%-10d %-10d\t%d\t%d\t%d\n" + //,i,cnts->contigID,cnts->gapLen,cnts->weight,insertS); + /*int st1,st2,ed1,ed2,len1,len2,gap; + len1=contig_array[i].length+overlaplen; + len2=contig_array[cnts->contigID].length+overlaplen; + gap=-cnts->gapLen; + if(len1contigID];*/ + /*if((id1/2+1)==1194){ + int ii; + fprintf(stdout,"\n"); + for(ii=0;iiweightNotInherit;++ii){ + fprintf(stdout,"%d ",cnts->PE[ii]); + } + fprintf(stdout,"\n"); + }*/ + /*if(isSmallerThanTwin(id1)){ + if(isSmallerThanTwin(id2)){ + fprintf(fp,"%u\t%d\t%u\t%d\t%d\n",id1/2+1,len1,id2/2+1,len2,cnts->gapLen,cnts->weightNotInherit); + + }else{ + fprintf(fp,"%u\t%d\t%u\t%d\t%d\n",id1/2+1,len1,-id2/2,len2,cnts->gapLen,cnts->weightNotInherit); + } + }else{ + if(isSmallerThanTwin(id2)){ + fprintf(fp,"%u\t%d\t%u\t%d\t%d\n",-id1/2,len1,id2/2+1,len2,cnts->gapLen,cnts->weightNotInherit); + }else{ + fprintf(fp,"%u\t%d\t%u\t%d\t%d\n",-id1/2,len1,-id2/2,len2,cnts->gapLen,cnts->weightNotInherit); + } + }*/ + //int ii=0; + //int weight=cnts->weightNotInherit; + //for(;iigapLen); + //} + if(cnts->gapLen<0){ + fprintf(fp,"%d\t%d\t%d\n",i,cnts->contigID,cnts->gapLen); + } + + //fprintf(fp,"\n"); + cnts->weightNotInherit = 0; + + bal_toCtg = getTwinCtg(cnts->contigID); + temp_cnt = getCntBetween(bal_toCtg,bal_ctg); + if(temp_cnt) + temp_cnt->weightNotInherit = 0; + + cnts = cnts->next; + } + } +} + diff --git a/fusion/check.c b/fusion/check.c new file mode 100755 index 0000000..2af20d6 --- /dev/null +++ b/fusion/check.c @@ -0,0 +1,64 @@ +/*************************************************************************** + * Title: check.c + * Author: Haixu Tang + * Created: Jun. 2002 + * Last modified: May. 2004 + * + * Copyright (c) 2001-2004 The Regents of the University of California + * All Rights Reserved + * See file LICENSE for details. 
+ ***************************************************************************/ + +/* ckopen - open file; check for success */ + +#include +//#include + +void *ckalloc(unsigned long long amount); +FILE *ckopen(char *name, char *mode); + +FILE *ckopen(char *name, char *mode) +{ + FILE *fp; + + if ((fp = fopen(name, mode)) == NULL) { + printf("Cannot open file %s.\n", name); + exit(-1); + } + return(fp); +} + + +/* ckalloc - allocate space; check for success */ + +void *ckalloc(unsigned long long amount) +{ + void *p; + + if ((p = (void *) calloc( 1, (unsigned long long) amount)) == NULL && amount != 0) { + printf("not enought memory"); + fflush(stdout); + exit(-1); + } + return(p); +} + + +/* reallocate memory */ +void *ckrealloc(void *p, size_t new_size, size_t old_size) +{ + void *q; + + q = realloc((void *) p, new_size); + if (new_size == 0 || q != (void *) 0) + return q; + + /* manually reallocate space */ + q = ckalloc(new_size); + + /* move old memory to new space */ + bcopy(p, q, old_size); + free(p); + + return q; +} diff --git a/fusion/connect.c b/fusion/connect.c new file mode 100755 index 0000000..4a63a20 --- /dev/null +++ b/fusion/connect.c @@ -0,0 +1,173 @@ +#include "stdinc.h" +#include "newhash.h" +#include "extfunc.h" +#include "extvab.h" + +#define CNBLOCKSIZE 100000 + +void createCntMemManager() +{ + if(!cn_mem_manager) + cn_mem_manager = createMem_manager(CNBLOCKSIZE,sizeof(CONNECT)); + //else + //printf("cn_mem_manger was created\n"); +} + +void destroyConnectMem() +{ + freeMem_manager(cn_mem_manager); + cn_mem_manager = NULL; +} + +CONNECT *allocateCN(unsigned int contigId, int gap) +{ + CONNECT *newCN; + newCN = (CONNECT *)getItem(cn_mem_manager); + newCN->contigID = contigId; + newCN->gapLen = gap; + + newCN->minGap = 0; + newCN->maxGap = 0; + newCN->bySmall = 0; + newCN->weakPoint = 0; + + newCN->weight = 1; + newCN->weightNotInherit = 0; + newCN->mask = 0; + newCN->used = 0; + newCN->checking = 0; + newCN->deleted = 0; + newCN->prevInScaf = 
0; + newCN->inherit = 0; + newCN->singleInScaf = 0; + newCN->nextInScaf = NULL; + newCN->PE=NULL;//(int *)ckalloc(CNBLOCKSIZE*sizeof(int)); + + return newCN; +} + +void output_cntGVZ(char *outfile) +{ + char name[256]; + FILE *fp; + unsigned int i; + CONNECT *connect; + boolean flag; + + sprintf(name,"%s.scaffold.gvz",outfile); + fp = ckopen(name,"w"); + fprintf(fp,"digraph G{\n"); + fprintf(fp,"\tsize=\"512,512\";\n"); + + for(i=num_ctg;i>0;i--){ + //if(contig_array[i].mask||!contig_array[i].downwardConnect) + if(!contig_array[i].downwardConnect) + continue; + connect = contig_array[i].downwardConnect; + while(connect){ + //if(connect->mask||connect->deleted){ + if(connect->deleted){ + connect = connect->next; + continue; + } + if(connect->prevInScaf||connect->nextInScaf) + flag = 1; + else + flag = 0; + if(!connect->mask) + fprintf(fp,"\tC%d_%d -> C%d_%d [label = \"%d(%d_%d)\"];\n" + ,i,contig_array[i].length,connect->contigID,contig_array[connect->contigID].length, + connect->gapLen,flag,connect->weight); + else + fprintf(fp,"\tC%d_%d -> C%d_%d [label = \"%d(%d_%d)\", color = red];\n" + ,i,contig_array[i].length,connect->contigID,contig_array[connect->contigID].length, + connect->gapLen,flag,connect->weight); + connect = connect->next; + } + } + fprintf(fp,"}\n"); + fclose(fp); +} + +/***************** below this line all codes are about lookup table *****************/ + +void createCntLookupTable() +{ + if(!cntLookupTable) + cntLookupTable = (CONNECT **)ckalloc((3*num_ctg+1)*sizeof(CONNECT *)); +} + +void deleteCntLookupTable() +{ + if(cntLookupTable){ + free((void *)cntLookupTable); + cntLookupTable = NULL; + } +} + +void putCnt2LookupTable(unsigned int from_c,CONNECT *cnt) +{ + if(!cnt||!cntLookupTable) + return; + unsigned int index = 2*from_c + cnt->contigID; + cnt->nextInLookupTable = cntLookupTable[index]; + cntLookupTable[index] = cnt; +} + +static CONNECT *getCntInLookupTable(unsigned int from_c,unsigned int to_c) +{ + unsigned int index = 2*from_c + 
to_c; + CONNECT *ite_cnt = cntLookupTable[index]; + while(ite_cnt){ + if(ite_cnt->contigID==to_c) + return ite_cnt; + ite_cnt = ite_cnt->nextInLookupTable; + } + return NULL; +} + +CONNECT *getCntBetween(unsigned int from_c, unsigned int to_c) +{ + CONNECT *pcnt; + + if(cntLookupTable){ + pcnt = getCntInLookupTable(from_c,to_c); + return pcnt; + } + pcnt = contig_array[from_c].downwardConnect; + + while(pcnt){ + if(pcnt->contigID==to_c) + return pcnt; + pcnt = pcnt->next; + } + return pcnt; +} +/* +void removeCntInLookupTable(unsigned int from_c,unsigned int to_c) +{ + unsigned int index = 2*from_c + to_c; + CONNECT *ite_cnt = cntLookupTable[index]; + CONNECT *cnt; + + if(!ite_cnt){ + printf("removeCntInLookupTable: not found A\n"); + return; + } + if(ite_cnt->contigID==to_c){ + cntLookupTable[index] = ite_cnt->nextInLookupTable; + return; + } + + while(ite_cnt->nextInLookupTable&&ite_cnt->nextInLookupTable->contigID!=to_c) + ite_cnt = ite_cnt->nextInLookupTable; + + if(ite_cnt->nextInLookupTable){ + cnt = ite_cnt->nextInLookupTable; + ite_cnt->nextInLookupTable = cnt->nextInLookupTable; + return; + } + printf("removeCntInLookupTable: not found B\n"); + return; +} +*/ diff --git a/fusion/darray.c b/fusion/darray.c new file mode 100755 index 0000000..5d6a789 --- /dev/null +++ b/fusion/darray.c @@ -0,0 +1,56 @@ +#include "darray.h" +#include "check.h" + +DARRAY *createDarray(int num_items,size_t unit_size) +{ + DARRAY *newDarray = (DARRAY *)malloc(1*sizeof(DARRAY)); + + newDarray->array_size = num_items; + newDarray->item_size = unit_size; + newDarray->item_c = 0; + newDarray->array = (void *)ckalloc(num_items*unit_size); + return newDarray; +} + +void *darrayPut(DARRAY *darray,long long index) +{ + int i=2; + if(index+1>darray->item_c) + darray->item_c = index + 1; + if(indexarray_size) + return darray->array + darray->item_size*index; + while(index>i*darray->array_size) + i++; + + darray->array = (void 
*)ckrealloc(darray->array,i*darray->array_size*darray->item_size + ,darray->array_size*darray->item_size); + darray->array_size *=i; + return (void *)((void *)darray->array + darray->item_size*index); +} + +void *darrayGet(DARRAY *darray, long long index) +{ + if(indexarray_size) + return (void *)((void *)darray->array + darray->item_size*index); + printf("array read index %lld out of range %lld\n",index,darray->array_size); + return NULL; +} + + +void emptyDarray(DARRAY *darray) +{ + darray->item_c = 0; +} + +void freeDarray(DARRAY *darray) +{ + + if(!darray) + return; + + if(darray->array) + free((void *)darray->array); + + free((void *)darray); +} + diff --git a/fusion/fib.c b/fusion/fib.c new file mode 100755 index 0000000..33f36a2 --- /dev/null +++ b/fusion/fib.c @@ -0,0 +1,640 @@ +/* +Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk) + + This file is part of Velvet. + + Velvet is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + Velvet is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Velvet; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*/ +/*- + * Copyright 1997-2003 John-Mark Gurney. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: fib.c,v 1.10 2007/10/19 13:09:26 zerbino Exp $ + * + */ +#include +#include +#include "fib.h" +#include "fibpriv.h" +#include "extfunc2.h" + +#define HEAPBLOCKSIZE 10000 + +static int fh_comparedata(FibHeap * h, Coordinate key, unsigned int data, FibHeapNode * b); +unsigned int fh_replacekeydata(FibHeap * h, FibHeapNode * x,Coordinate key, unsigned int data); + +static FibHeapNode *allocateFibHeapEl(FibHeap * heap) +{ + return (FibHeapNode *)getItem(heap->nodeMemory); +}; + +static void deallocateFibHeapEl(FibHeapNode * a, FibHeap * heap) +{ + returnItem(heap->nodeMemory, a); +} + +#define swap(type, a, b) \ + do { \ + type c; \ + c = a; \ + a = b; \ + b = c; \ + } while (0) \ + +#define INT_BITS (sizeof(IDnum) * 8) + +static inline IDnum ceillog2(IDnum a) +{ + IDnum oa; + IDnum i; + IDnum b; + IDnum cons; + + oa = a; + b = INT_BITS / 2; + i = 0; + while (b) { + i = (i << 1); + cons = ((IDnum) 1) << b; + if (a >= cons) { + a /= cons; + i = i | 1; + } else + a &= cons - 1; + b 
/= 2; + } + if ((((IDnum) 1 << i)) == oa) + return i; + else + return i + 1; +} + +/* + * Private Heap Functions + */ +static void fh_initheap(FibHeap * new) +{ + new->fh_cmp_fnct = NULL; + new->nodeMemory = createMem_manager(sizeof(FibHeapNode), HEAPBLOCKSIZE); + new->fh_neginf = 0; + new->fh_n = 0; + new->fh_Dl = -1; + new->fh_cons = NULL; + new->fh_min = NULL; + new->fh_root = NULL; + new->fh_keys = 0; +} + +static void fh_destroyheap(FibHeap * h) +{ + h->fh_cmp_fnct = NULL; + h->fh_neginf = 0; + if (h->fh_cons != NULL) + free(h->fh_cons); + h->fh_cons = NULL; + free(h); +} + +/* + * Public Heap Functions + */ +FibHeap *fh_makekeyheap() +{ + FibHeap *n; + + if ((n = malloc(sizeof *n)) == NULL) + return NULL; + + fh_initheap(n); + n->fh_keys = 1; + + return n; +} + +FibHeap *fh_makeheap() +{ + FibHeap *n; + + if ((n = malloc(sizeof *n)) == NULL) + return NULL; + + fh_initheap(n); + + return n; +} + +voidcmp fh_setcmp(FibHeap * h, voidcmp fnct) +{ + voidcmp oldfnct; + + oldfnct = h->fh_cmp_fnct; + h->fh_cmp_fnct = fnct; + + return oldfnct; +} + +unsigned int fh_setneginf(FibHeap * h, unsigned int data) +{ + unsigned int old; + + old = h->fh_neginf; + h->fh_neginf = data; + + return old; +} + +FibHeap *fh_union(FibHeap * ha, FibHeap * hb) +{ + FibHeapNode *x; + + if (ha->fh_root == NULL || hb->fh_root == NULL) { + /* either one or both are empty */ + if (ha->fh_root == NULL) { + fh_destroyheap(ha); + return hb; + } else { + fh_destroyheap(hb); + return ha; + } + } + ha->fh_root->fhe_left->fhe_right = hb->fh_root; + hb->fh_root->fhe_left->fhe_right = ha->fh_root; + x = ha->fh_root->fhe_left; + ha->fh_root->fhe_left = hb->fh_root->fhe_left; + hb->fh_root->fhe_left = x; + ha->fh_n += hb->fh_n; + /* + * we probably should also keep stats on number of unions + */ + + /* set fh_min if necessary */ + if (fh_compare(ha, hb->fh_min, ha->fh_min) < 0) + ha->fh_min = hb->fh_min; + + fh_destroyheap(hb); + return ha; +} + +void fh_deleteheap(FibHeap * h) +{ + 
freeMem_manager(h->nodeMemory); + h->nodeMemory = NULL; + fh_destroyheap(h); +} + +/* + * Public Key Heap Functions + */ +FibHeapNode *fh_insertkey(FibHeap * h, Coordinate key, unsigned int data) +{ + FibHeapNode *x; + + if ((x = fhe_newelem(h)) == NULL) + return NULL; + + /* just insert on root list, and make sure it's not the new min */ + x->fhe_data = data; + x->fhe_key = key; + + fh_insertel(h, x); + + return x; +} + +boolean fh_isempty(FibHeap *h) +{ + + if (h->fh_min == NULL) + return 1; + else + return 0; + +} + +Coordinate fh_minkey(FibHeap * h) +{ + if (h->fh_min == NULL) + return INT_MIN; + return h->fh_min->fhe_key; +} + + +unsigned int fh_replacekeydata(FibHeap * h, FibHeapNode * x, + Coordinate key, unsigned int data) +{ + unsigned int odata; + Coordinate okey; + FibHeapNode *y; + int r; + + odata = x->fhe_data; + okey = x->fhe_key; + + /* + * we can increase a key by deleting and reinserting, that + * requires O(lgn) time. + */ + if ((r = fh_comparedata(h, key, data, x)) > 0) { + /* XXX - bad code! */ + abort(); + } + + x->fhe_data = data; + x->fhe_key = key; + + /* because they are equal, we don't have to do anything */ + if (r == 0) + return odata; + + y = x->fhe_p; + + if (h->fh_keys && okey == key) + return odata; + + if (y != NULL && fh_compare(h, x, y) <= 0) { + fh_cut(h, x, y); + fh_cascading_cut(h, y); + } + + /* + * the = is so that the call from fh_delete will delete the proper + * element. 
+ */ + if (fh_compare(h, x, h->fh_min) <= 0) + h->fh_min = x; + + return odata; +} + +Coordinate fh_replacekey(FibHeap * h, FibHeapNode * x, Coordinate key) +{ + Coordinate ret; + + ret = x->fhe_key; + (void) fh_replacekeydata(h, x, key, x->fhe_data); + + return ret; +} + +/* + * Public void * Heap Functions + */ +/* + * this will return these values: + * NULL failed for some reason + * ptr token to use for manipulation of data + */ +FibHeapNode *fh_insert(FibHeap * h, unsigned int data) +{ + FibHeapNode *x; + + if ((x = fhe_newelem(h)) == NULL) + return NULL; + + /* just insert on root list, and make sure it's not the new min */ + x->fhe_data = data; + + fh_insertel(h, x); + + return x; +} + +unsigned int fh_min(FibHeap * h) +{ + if (h->fh_min == NULL) + return 0; + return h->fh_min->fhe_data; +} + +unsigned int fh_extractmin(FibHeap * h) +{ + FibHeapNode *z; + unsigned int ret=0; + + + if (h->fh_min != NULL) { + z = fh_extractminel(h); + ret = z->fhe_data; +#ifndef NO_FREE + deallocateFibHeapEl(z, h); +#endif + + } + + return ret; +} + +unsigned int fh_replacedata(FibHeapNode * x, unsigned int data) +{ + unsigned int odata = x->fhe_data; + x->fhe_data = data; + return odata; +} + +unsigned int fh_delete(FibHeap * h, FibHeapNode * x) +{ + unsigned int k; + + k = x->fhe_data; + if (!h->fh_keys) + fh_replacedata(x, h->fh_neginf); + else + fh_replacekey(h, x, INT_MIN); + fh_extractmin(h); + + return k; +} + +/* + * begin of private element fuctions + */ +static FibHeapNode *fh_extractminel(FibHeap * h) +{ + FibHeapNode *ret; + FibHeapNode *x, *y, *orig; + + ret = h->fh_min; + + orig = NULL; + /* put all the children on the root list */ + /* for true consistancy, we should use fhe_remove */ + for (x = ret->fhe_child; x != orig && x != NULL;) { + if (orig == NULL) + orig = x; + y = x->fhe_right; + x->fhe_p = NULL; + fh_insertrootlist(h, x); + x = y; + } + /* remove minimum from root list */ + fh_removerootlist(h, ret); + h->fh_n--; + + /* if we aren't empty, 
consolidate the heap */ + if (h->fh_n == 0) + h->fh_min = NULL; + else { + h->fh_min = ret->fhe_right; + fh_consolidate(h); + } + + return ret; +} + +static void fh_insertrootlist(FibHeap * h, FibHeapNode * x) +{ + if (h->fh_root == NULL) { + h->fh_root = x; + x->fhe_left = x; + x->fhe_right = x; + return; + } + + fhe_insertafter(h->fh_root, x); +} + +static void fh_removerootlist(FibHeap * h, FibHeapNode * x) +{ + if (x->fhe_left == x) + h->fh_root = NULL; + else + h->fh_root = fhe_remove(x); +} + +static void fh_consolidate(FibHeap * h) +{ + FibHeapNode **a; + FibHeapNode *w; + FibHeapNode *y; + FibHeapNode *x; + IDnum i; + IDnum d; + IDnum D; + + fh_checkcons(h); + + /* assign a the value of h->fh_cons so I don't have to rewrite code */ + D = h->fh_Dl + 1; + a = h->fh_cons; + + for (i = 0; i < D; i++) + a[i] = NULL; + + while ((w = h->fh_root) != NULL) { + x = w; + fh_removerootlist(h, w); + d = x->fhe_degree; + /* XXX - assert that d < D */ + while (a[d] != NULL) { + y = a[d]; + if (fh_compare(h, x, y) > 0) + swap(FibHeapNode *, x, y); + fh_heaplink(h, y, x); + a[d] = NULL; + d++; + } + a[d] = x; + } + h->fh_min = NULL; + for (i = 0; i < D; i++) + if (a[i] != NULL) { + fh_insertrootlist(h, a[i]); + if (h->fh_min == NULL + || fh_compare(h, a[i], h->fh_min) < 0) + h->fh_min = a[i]; + } +} + +static void fh_heaplink(FibHeap * h, FibHeapNode * y, FibHeapNode * x) +{ + /* make y a child of x */ + if (x->fhe_child == NULL) + x->fhe_child = y; + else + fhe_insertbefore(x->fhe_child, y); + y->fhe_p = x; + x->fhe_degree++; + y->fhe_mark = 0; +} + +static void fh_cut(FibHeap * h, FibHeapNode * x, FibHeapNode * y) +{ + fhe_remove(x); + y->fhe_degree--; + fh_insertrootlist(h, x); + x->fhe_p = NULL; + x->fhe_mark = 0; +} + +static void fh_cascading_cut(FibHeap * h, FibHeapNode * y) +{ + FibHeapNode *z; + + while ((z = y->fhe_p) != NULL) { + if (y->fhe_mark == 0) { + y->fhe_mark = 1; + return; + } else { + fh_cut(h, y, z); + y = z; + } + } +} + +/* + * begining of handling 
elements of fibheap + */ +static FibHeapNode *fhe_newelem(FibHeap * h) +{ + FibHeapNode *e; + + if ((e = allocateFibHeapEl(h)) == NULL) + return NULL; + + fhe_initelem(e); + + return e; +} + +static void fhe_initelem(FibHeapNode * e) +{ + e->fhe_degree = 0; + e->fhe_mark = 0; + e->fhe_p = NULL; + e->fhe_child = NULL; + e->fhe_left = e; + e->fhe_right = e; + e->fhe_data = 0; +} + +static void fhe_insertafter(FibHeapNode * a, FibHeapNode * b) +{ + if (a == a->fhe_right) { + a->fhe_right = b; + a->fhe_left = b; + b->fhe_right = a; + b->fhe_left = a; + } else { + b->fhe_right = a->fhe_right; + a->fhe_right->fhe_left = b; + a->fhe_right = b; + b->fhe_left = a; + } +} + +static inline void fhe_insertbefore(FibHeapNode * a, FibHeapNode * b) +{ + fhe_insertafter(a->fhe_left, b); +} + +static FibHeapNode *fhe_remove(FibHeapNode * x) +{ + FibHeapNode *ret; + + if (x == x->fhe_left) + ret = NULL; + else + ret = x->fhe_left; + + /* fix the parent pointer */ + if (x->fhe_p != NULL && x->fhe_p->fhe_child == x) + x->fhe_p->fhe_child = ret; + + x->fhe_right->fhe_left = x->fhe_left; + x->fhe_left->fhe_right = x->fhe_right; + + /* clear out hanging pointers */ + x->fhe_p = NULL; + x->fhe_left = x; + x->fhe_right = x; + + return ret; +} + +static void fh_checkcons(FibHeap * h) +{ + IDnum oDl; + + /* make sure we have enough memory allocated to "reorganize" */ + if (h->fh_Dl == -1 || h->fh_n > (1 << h->fh_Dl)) { + oDl = h->fh_Dl; + if ((h->fh_Dl = ceillog2(h->fh_n) + 1) < 8) + h->fh_Dl = 8; + if (oDl != h->fh_Dl) + h->fh_cons = + (FibHeapNode **) realloc(h->fh_cons, + sizeof *h-> + fh_cons * + (h->fh_Dl + 1)); + if (h->fh_cons == NULL) + abort(); + } +} + +static int fh_compare(FibHeap * h, FibHeapNode * a, FibHeapNode * b) +{ + if (a->fhe_key < b->fhe_key) + return -1; + if (a->fhe_key == b->fhe_key) + return 0; + return 1; +} + +static int +fh_comparedata(FibHeap * h, Coordinate key, unsigned int data, FibHeapNode * b) +{ + FibHeapNode a; + + a.fhe_key = key; + a.fhe_data = data; + 
+ return fh_compare(h, &a, b); +} + +static void fh_insertel(FibHeap * h, FibHeapNode * x) +{ + fh_insertrootlist(h, x); + + if (h->fh_min == NULL + || (h->fh_keys ? x->fhe_key < + h->fh_min->fhe_key : h->fh_cmp_fnct(x->fhe_data, + h->fh_min->fhe_data) < + 0)) + h->fh_min = x; + + h->fh_n++; +} diff --git a/fusion/fibHeap.c b/fusion/fibHeap.c new file mode 100755 index 0000000..8235ee2 --- /dev/null +++ b/fusion/fibHeap.c @@ -0,0 +1,77 @@ +/* +Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk) + + This file is part of Velvet. + + Velvet is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + Velvet is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Velvet; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*/ +#include "fib.h" + +// Constructor +// Memory allocated +FibHeap *newFibHeap() +{ + return fh_makekeyheap(); +} + +// Add new node into heap with a key, and a pointer to the specified node +FibHeapNode *insertNodeIntoHeap(FibHeap * heap, Coordinate key, + unsigned int node) +{ + return fh_insertkey(heap, key, node); +} + +// Returns smallest key in heap +Coordinate minKeyOfHeap(FibHeap * heap) +{ + return fh_minkey(heap); +} + +// Replaces the key for a given node +Coordinate replaceKeyInHeap(FibHeap * heap, FibHeapNode * node, + Coordinate newKey) +{ + return fh_replacekey(heap, node, newKey); +} + +// Removes the node with the shortest key, then returns it. 
+unsigned int removeNextNodeFromHeap(FibHeap * heap) +{ + return (unsigned int) fh_extractmin(heap); +} + +boolean IsHeapEmpty(FibHeap *heap) +{ + return fh_isempty(heap); +} + +// Destructor +void destroyHeap(FibHeap * heap) +{ + fh_deleteheap(heap); +} + +// Replace the node pointed to by a heap node +void replaceValueInHeap(FibHeapNode * node, unsigned int newValue) +{ + fh_replacedata(node, newValue); +} + +// Remove unwanted node +void destroyNodeInHeap(FibHeapNode * node, FibHeap * heap) +{ + fh_delete(heap, node); +} diff --git a/fusion/finalFusion b/fusion/finalFusion new file mode 100755 index 0000000..c5c46b8 Binary files /dev/null and b/fusion/finalFusion differ diff --git a/fusion/hashFunction.c b/fusion/hashFunction.c new file mode 100755 index 0000000..f2424fd --- /dev/null +++ b/fusion/hashFunction.c @@ -0,0 +1,83 @@ +#include + + +#define KMER_HASH_MASK 0x0000000000ffffffL +#define KMER_HASH_BUCKETS 16777216 // 4^12 + +static int crc_table[256] = { + 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, + 0x706af48f, 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, + 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, + 0x90bf1d91, 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, + 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, 0x136c9856, + 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, + 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, + 0xa2677172, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, + 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, + 0x45df5c75, 0xdcd60dcf, 0xabd13d59, 0x26d930ac, 0x51de003a, + 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 0xcfba9599, + 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, + 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, + 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, + 0x9fbfe4a5, 0xe8b8d433, 0x7807c9a2, 0x0f00f934, 0x9609a88e, + 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01, + 0x6b6b51f4, 0x1c6c6162, 
0x856530d8, 0xf262004e, 0x6c0695ed, + 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, + 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, + 0xfbd44c65, 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, + 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, + 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5, + 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, 0xbe0b1010, + 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, + 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, + 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, + 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af, 0x04db2615, + 0x73dc1683, 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, + 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, 0xf00f9344, + 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, + 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, + 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, + 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, + 0xa6bc5767, 0x3fb506dd, 0x48b2364b, 0xd80d2bda, 0xaf0a1b4c, + 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, 0x316e8eef, + 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, + 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, + 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, + 0x2cd99e8b, 0x5bdeae1d, 0x9b64c2b0, 0xec63f226, 0x756aa39c, + 0x026d930a, 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, + 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, + 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242, + 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, + 0x18b74777, 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, + 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, 0xa00ae278, + 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7, + 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, 0x40df0b66, + 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, + 0xbdbdf21c, 0xcabac28a, 0x53b39330, 
0x24b4a3a6, 0xbad03605, + 0xcdd70693, 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, + 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, + 0x2d02ef8d +}; + +static int crc32(int crc, const char *buf, int len) +{ + if (buf == NULL) + return 0; + + crc = crc ^ 0xffffffff; + while (len--) { + crc = + crc_table[((int) crc ^ (*buf++)) & 0xff] ^ (crc >> 8); + } + + return crc ^ 0xffffffff; +} + +Kmer hash_kmer(Kmer kmer) +{ + Kmer hash; + hash = kmer; + hash = crc32(0, (char *) &kmer, sizeof(Kmer)); + hash &= KMER_HASH_MASK; + return hash; +} diff --git a/fusion/inc/check.h b/fusion/inc/check.h new file mode 100755 index 0000000..db8f1ca --- /dev/null +++ b/fusion/inc/check.h @@ -0,0 +1,5 @@ + +extern void *ckalloc(unsigned long long amount); +extern void *ckrealloc(void *p, size_t new_size, size_t old_size); +extern FILE *ckopen(char *name, char *mode); + diff --git a/fusion/inc/darray.h b/fusion/inc/darray.h new file mode 100755 index 0000000..9b8e4f4 --- /dev/null +++ b/fusion/inc/darray.h @@ -0,0 +1,23 @@ +#ifndef __DARRAY__ +#define __DARRAY__ + +#include +#include +#include + +typedef struct dynamic_array +{ + void *array; + long long array_size; + size_t item_size; + long long item_c; +}DARRAY; + +void *darrayPut(DARRAY *darray,long long index); +void *darrayGet(DARRAY *darray,long long index); +DARRAY *createDarray(int num_items,size_t unit_size); +void freeDarray(DARRAY *darray); +void emptyDarray(DARRAY *darray); + +#endif + diff --git a/fusion/inc/def.h b/fusion/inc/def.h new file mode 100755 index 0000000..9c4d5b9 --- /dev/null +++ b/fusion/inc/def.h @@ -0,0 +1,296 @@ +/* this file provides some datatype definition */ +#ifndef _DEF +#define _DEF + +#include "def2.h" +#include "types.h" +#include "stack.h" +#include "darray.h" + +#define EDGE_BIT_SIZE 6 +#define word_len 12 +#define taskMask 0xf //the last 7 bits + +#define MaxEdgeCov 16000 + +#define base2int(base) (char)(((base)&0x06)>>1) +#define int2base(seq) "ACTG"[seq] +#define 
int2compbase(seq) "TGAC"[seq] +#define int_comp(seq) (char)(seq^0x02) //(char)((0x4E>>((seq)<<1))&0x03) + +int b_ban; + +typedef unsigned long long Kmer; + +typedef struct edon +{ + Kmer kmer; + unsigned int ctgLen:1; + unsigned int twin:1; + unsigned int pos:30; + unsigned int ctgID; + struct edon *left; + struct edon *right; +}EDON; + +struct node_pt; + +typedef struct node +{ + Kmer kmer; + unsigned char links; + unsigned char linksB; + unsigned char cvg; + unsigned char linear:1; + unsigned char deleted:1; + unsigned char mark:1; + unsigned int to_end; // the edge no. it belongs to + struct node *left; + struct node *right; +}NODE; + +typedef struct node_pt +{ + NODE *node; + Kmer kmer; + boolean isSmaller; + struct node_pt *next; +}NODE_PT; + +typedef struct preedge +{ + Kmer from_node; + Kmer to_node; + char *seq; + int length; + unsigned short cvg; + unsigned short bal_edge:2; //indicate whether it's bal_edge is the previous edge, next edge or itself +}preEDGE; + +typedef struct readinterval +{ + int readid; + unsigned int edgeid; + int start; + struct readinterval *bal_rv; + struct readinterval *nextOnEdge; + struct readinterval *prevOnEdge; + struct readinterval *nextInRead; + struct readinterval *prevInRead; +}READINTERVAL; + +struct arc; +typedef struct edge +{ + unsigned int from_vt; + unsigned int to_vt; + int length; + unsigned short cvg:14; + unsigned short bal_edge:2; + unsigned short multi:14; + unsigned short deleted : 1; + unsigned short flag : 1; + char *seq; + READINTERVAL *rv; + struct arc *arcs; + long long *markers; +}EDGE; + +typedef struct edge_pt +{ + EDGE *edge; + struct edge_pt *next; +}EDGE_PT; + +typedef struct vertex +{ + Kmer kmer; +}VERTEX; + +typedef struct connection +{ + unsigned int contigID; + int gapLen; + + unsigned short maxGap; + unsigned char minGap; + unsigned char bySmall:1; + unsigned char weakPoint:1; + + unsigned char weightNotInherit; + unsigned char weight; + unsigned char maxSingleWeight; + unsigned char mask : 1; 
+ unsigned char used : 1; + unsigned char weak : 1; + unsigned char deleted : 1; + unsigned char prevInScaf : 1; + unsigned char inherit : 1; + unsigned char checking : 1; + unsigned char singleInScaf : 1; + struct connection *nextInScaf; + struct connection *next; + struct connection *nextInLookupTable; + int *PE; +}CONNECT; + +typedef struct prearc +{ + unsigned int to_ed; + unsigned int multiplicity; + struct prearc *next; +}preARC; + +typedef struct contig +{ + unsigned int from_vt; + unsigned int to_vt; + unsigned int length; + unsigned short indexInScaf; + unsigned char cvg; + unsigned char bal_edge:2; // 0, 1 or 2 + unsigned char mask : 1; + unsigned char flag : 1; + unsigned char multi: 1; + unsigned char inSubGraph: 1; + char *seq; + CONNECT *downwardConnect; + preARC *arcs; + STACK *closeReads; +}CONTIG; + +typedef struct read_nearby +{ + int len; + int dis; // dis to nearby contig or scaffold's start position + long long seqStarter; //sequence start position in dynamic array +}READNEARBY; + +typedef struct annotation +{ + unsigned long long readID; + unsigned int contigID; + int pos; +}ANNOTATION; + +typedef struct parameter +{ + unsigned char threadID; + void **hash_table; + unsigned char *mainSignal; + unsigned char *selfSignal; +}PARAMETER; + +typedef struct lightannot +{ + int contigID; + int pos; +}LIGHTANNOT; + +typedef struct edgepatch +{ + Kmer from_kmer,to_kmer; + unsigned int length; + char bal_edge; +}EDGEPATCH; + +typedef struct lightctg +{ + unsigned int index; + int length; + char *seq; +}LIGHTCTG; + + +typedef struct arc +{ + unsigned int to_ed; + unsigned int multiplicity; + struct arc *prev; + struct arc *next; + struct arc *bal_arc; + struct arc *nextInLookupTable; +}ARC; + +typedef struct arcexist +{ + Kmer kmer; + struct arcexist *left; + struct arcexist *right; +}ARCEXIST; + +typedef struct lib_info +{ + int min_ins; + int max_ins; + int avg_ins; + int rd_len_cutoff; + int reverse; + int asm_flag; + int map_len; + int pair_num_cut; + 
int rank; + //indicate which file is next to be read + int curr_type; + int curr_index; + + //file handlers to opened files + FILE *fp1; + FILE *fp2; + boolean f1_start; + boolean f2_start; + //whether last read is read1 in pair + int paired; // 0 -- single; 1 -- read1; 2 -- read2; + +//type1 + char **a1_fname; + char **a2_fname; + int num_a1_file; + int num_a2_file; + +//type2 + char **q1_fname; + char **q2_fname; + int num_q1_file; + int num_q2_file; + +//type3 + char **p_fname; + int num_p_file; //fasta only + +//type4 &5 + char **s_a_fname; + int num_s_a_file; + char **s_q_fname; + int num_s_q_file; + +}LIB_INFO; + +typedef struct ctg4heap{ + unsigned int ctgID; + int dis; + unsigned char ds_shut4dheap:1; // ignore downstream connections + unsigned char us_shut4dheap:1; // ignore upstream connections + unsigned char ds_shut4uheap:1; // ignore downstream connections + unsigned char us_shut4uheap:1; // ignore upstream connections +}CTGinHEAP; + +typedef struct ctg4scaf{ + unsigned int ctgID; + int start; + int end; //position in scaff + unsigned int cutHead : 8; // + unsigned int cutTail : 7; // + unsigned int scaftig_start : 1; //is it a scaftig starter + unsigned int mask : 1; // is it masked for further operations + unsigned int gapSeqLen:15; + int gapSeqOffset; +}CTGinSCAF; + +typedef struct pe_info{ + int insertS; + long long PE_bound; + int rank; + int pair_num_cut; +}PE_INFO; +#endif diff --git a/fusion/inc/def2.h b/fusion/inc/def2.h new file mode 100755 index 0000000..677002f --- /dev/null +++ b/fusion/inc/def2.h @@ -0,0 +1,43 @@ +#ifndef _DEF2 +#define _DEF2 +typedef char boolean; +typedef long long IDnum; +typedef double Time; +typedef long long Coordinate; +// Fibonacci heaps used mainly in Tour Bus +typedef struct fibheap FibHeap; +typedef struct fibheap_el FibHeapNode; +typedef struct dfibheap DFibHeap; +typedef struct dfibheap_el DFibHeapNode; +//Memory manager +typedef struct block_start +{ + struct block_start *next; +}BLOCK_START; + +typedef 
struct recycle_mark +{ + struct recycle_mark *next; +}RECYCLE_MARK; + +typedef struct mem_manager +{ + BLOCK_START *block_list; + int index_in_block; + int items_per_block; + size_t item_size; + RECYCLE_MARK *recycle_list; + unsigned long long counter; +}MEM_MANAGER; + +struct dfibheap_el { + int dfhe_degree; + boolean dfhe_mark; + DFibHeapNode *dfhe_p; + DFibHeapNode *dfhe_child; + DFibHeapNode *dfhe_left; + DFibHeapNode *dfhe_right; + Time dfhe_key; + unsigned int dfhe_data;//void *dfhe_data; +}; +#endif diff --git a/fusion/inc/dfib.h b/fusion/inc/dfib.h new file mode 100755 index 0000000..fa96304 --- /dev/null +++ b/fusion/inc/dfib.h @@ -0,0 +1,72 @@ +/* +Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk) + + This file is part of Velvet. + + Velvet is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + Velvet is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Velvet; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*/ +/*- + * Copyright 1997, 1998-2003 John-Mark Gurney. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: dfib.h,v 1.8 2007/04/24 12:16:41 zerbino Exp $ + * + */ + +#ifndef _DFIB_H_ +#define _DFIB_H_ + +#include +#include "def2.h" //#include "globals.h" + +/* functions for key heaps */ +DFibHeap *dfh_makekeyheap(void); +DFibHeapNode *dfh_insertkey(DFibHeap *, Time, unsigned int); +Time dfh_replacekey(DFibHeap *, DFibHeapNode *, Time); +unsigned int dfh_replacekeydata(DFibHeap *, DFibHeapNode *, Time, unsigned int); + +unsigned int dfh_extractmin(DFibHeap *); +unsigned int dfh_replacedata(DFibHeapNode *, unsigned int); +unsigned int dfh_delete(DFibHeap *, DFibHeapNode *); +void dfh_deleteheap(DFibHeap *); + +// DEBUG +IDnum dfibheap_getSize(DFibHeap *); +Time dfibheap_el_getKey(DFibHeapNode *); +// END DEBUG + +#endif /* _DFIB_H_ */ diff --git a/fusion/inc/dfibHeap.h b/fusion/inc/dfibHeap.h new file mode 100755 index 0000000..120252c --- /dev/null +++ b/fusion/inc/dfibHeap.h @@ -0,0 +1,43 @@ +/* +Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk) + + This file is part of 
Velvet. + + Velvet is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + Velvet is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Velvet; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*/ +#ifndef _DFIBHEAP_H_ +#define _DFIBHEAP_H_ + +DFibHeap *newDFibHeap(); + +DFibHeapNode *insertNodeIntoDHeap(DFibHeap * heap, Time key, unsigned int node); + +Time replaceKeyInDHeap(DFibHeap * heap, DFibHeapNode * node, Time newKey); + +unsigned int removeNextNodeFromDHeap(DFibHeap * heap); + +void destroyDHeap(DFibHeap * heap); + +boolean HasMin(DFibHeap *h); + +void replaceValueInDHeap(DFibHeapNode * node, unsigned int newValue); + +void *destroyNodeInDHeap(DFibHeapNode * node, DFibHeap * heap); + +IDnum getDFibHeapSize(DFibHeap * heap); + +Time getKey(DFibHeapNode * node); +#endif diff --git a/fusion/inc/dfibpriv.h b/fusion/inc/dfibpriv.h new file mode 100755 index 0000000..fb0d5b3 --- /dev/null +++ b/fusion/inc/dfibpriv.h @@ -0,0 +1,96 @@ +/* +Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk) + + This file is part of Velvet. + + Velvet is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + Velvet is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Velvet; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*/ +/*- + * Copyright 1997, 1999-2003 John-Mark Gurney. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $Id: dfibpriv.h,v 1.8 2007/10/09 09:56:46 zerbino Exp $ + * + */ + +#ifndef _DFIBPRIV_H_ +#define _DFIBPRIV_H_ + +//#include "globals.h" +#include "def2.h" + +/* + * specific node operations + */ + +static DFibHeapNode *dfhe_newelem(DFibHeap *); +static void dfhe_insertafter(DFibHeapNode * a, DFibHeapNode * b); +static inline void dfhe_insertbefore(DFibHeapNode * a, DFibHeapNode * b); +static DFibHeapNode *dfhe_remove(DFibHeapNode * a); + +/* + * global heap operations + */ +struct dfibheap { + MEM_MANAGER *nodeMemory; + IDnum dfh_n; + IDnum dfh_Dl; + DFibHeapNode **dfh_cons; + DFibHeapNode *dfh_min; + DFibHeapNode *dfh_root; +}; + +static void dfh_insertrootlist(DFibHeap *, DFibHeapNode *); +static void dfh_removerootlist(DFibHeap *, DFibHeapNode *); +static void dfh_consolidate(DFibHeap *); +static void dfh_heaplink(DFibHeap * h, DFibHeapNode * y, DFibHeapNode * x); +static void dfh_cut(DFibHeap *, DFibHeapNode *, DFibHeapNode *); +static void dfh_cascading_cut(DFibHeap *, DFibHeapNode *); +static DFibHeapNode *dfh_extractminel(DFibHeap *); +static void dfh_checkcons(DFibHeap * h); +static int dfh_compare(DFibHeap * h, DFibHeapNode * a, DFibHeapNode * b); +static int dfh_comparedata(DFibHeap * h, Time key, + unsigned int data, DFibHeapNode * b); +static void dfh_insertel(DFibHeap * h, DFibHeapNode * x); + + +/* + * general functions + */ +static inline IDnum ceillog2(IDnum a); + +#endif /* _DFIBPRIV_H_ */ diff --git a/fusion/inc/extfunc.h b/fusion/inc/extfunc.h new file mode 100755 index 0000000..72052f2 --- /dev/null +++ b/fusion/inc/extfunc.h @@ -0,0 +1,209 @@ +/*************************************************************************** + * Title: extfunc.h + * Author: Haixu Tang + * Created: Jun. 2002 + * Last modified: May. 2004 + * + * Copyright (c) 2001-2004 The Regents of the University of California + * All Rights Reserved + * See file LICENSE for details. 
+ ***************************************************************************/ +#include "check.h" +#include "extfunc2.h" +extern NODE **seq2nodes_with_pair(char *seqfile,char *outfile); +extern NODE **prlSeq2nodes_with_pair(char *seqfile,char *outfile); +extern void readseq1by1(char *src_seq, char *src_name,int *len_seq, FILE *fp,long long num_seq); +extern void readseqPbyP(char *src_seq, char *src_name,int *insertS,int *len_seq, FILE *fp,long long num_seq); +extern void nodes2edges_with_pair(NODE **hash_table,EDGE_PT **edge_list,char *outfile); +extern int findOrInsertOccurenceInNodeTree(Kmer kmer, NODE ** T); +extern NODE *SplayNodeTree(NODE * T,Kmer kmer); +extern Kmer reverseComplement(Kmer word,int overlap); +extern Kmer hash_kmer(Kmer kmer); +extern void link2next(NODE *node,char ch); +extern unsigned char check_link2next(NODE *node,char ch); +extern void unlink2next(NODE *node,char ch); +extern void link2prev(NODE *node,char ch); +extern unsigned char check_link2prev(NODE *node,char ch); +extern void unlink2prev(NODE *node,char ch); +extern int count_link2next(NODE *node); +extern int count_link2prev(NODE *node); +extern Kmer nextKmer(Kmer prev,char ch); +extern Kmer prevKmer(Kmer next,char ch); +extern long long readseqpar(int *max_len,int *min_leg,int *max_name_len,FILE *fp); +extern void destroyNodeHash(NODE **hash_table); +extern void free_edge_list(EDGE_PT *el); +extern void reverseComplementSeq(char *seq, int len,char *bal_seq); +extern void free_node_list(NODE_PT *np); +extern NODE *SplayNodeTree_FILTER(NODE *T,Kmer kmer); +extern NODE *allocateNode_cvg(Kmer kmer); +extern int findOrInsertOccurenceInNodeTree_cvg(Kmer kmer, NODE **T); +extern void free_edge_array(EDGE *ed_array,int ed_num); +extern void free_lightctg_array(LIGHTCTG *ed_array,int ed_num); +extern char getCharInTightString(char *tightSeq,int pos); +extern void writeChar2tightSting(char nt,char *tightSeq,int pos); +extern void short_reads_sum(); +extern void read_one_sequence(FILE 
*fp,long long *T,char **X); +extern void output_edges(preEDGE *ed_array,int ed_num,char *outfile); +extern void read2edge(char *seqfile,NODE **hash_table,char *outfile); +extern void loadVertex(char *graphfile); +extern int kmer2vt(Kmer kmer); +extern void loadEdge(char *graphfile); +extern boolean loadPath(char *graphfile); +extern READINTERVAL *allocateRV(int readid,int edgeid); +extern void createRVmemo(); +extern void dismissRV(READINTERVAL *rv); +extern void destroyReadIntervMem(); +extern void destroyConnectMem(); +extern void u2uConcatenate(); +extern void unlink2all(NODE *node,NODE **hash_table); +extern void cutTip(NODE **hash_table); +extern void output_contig(EDGE *ed_array,unsigned int ed_num,char *outfile,int cut_len); +extern void printTightString(char *tightSeq,int len); +extern int roughUniqueness(unsigned int edgeno,char ignore_cvg,char *ignored); +extern void outputReadPos(char *graphfile,int min_len); +extern NODE *reverseComplementNode(NODE *node1,NODE **hash_table); +extern void testSearch(); +extern void print_kmer(FILE *fp,Kmer kmer,char c); +extern void allpathConcatenate(); +extern void output_updated_edges(char *outfile); +extern void output_updated_vertex(char *outfile); +extern void loadUpdatedEdges(char *graphfile); +extern void loadUpdatedVertex(char *graphfile); +extern void connectByPE(char *infile); +extern void output_cntGVZ(char *outfile); +extern void output_graph(char *outfile); +extern void removeUnreliable(NODE **hash_talbe); +extern void testLinearC2C(); +extern void output_contig_graph(char *outfile); +extern void scaffolding(unsigned int cut_len,char *outfile); +extern int cmp_int(const void *a,const void *b); +extern CONNECT *allocateCN(unsigned int contigId, int gap); +extern int recoverRep(); +extern void loadPEgrads(char *infile); +extern int putInsertS(long long readid,int size,int *currGrads); +extern int getInsertS(long long readid,int *readlen); +extern int connectByPE_grad(FILE *fp,int peGrad,char *line); +extern 
void PEgradsScaf(char *infile); +extern void reorderAnnotation(char *infile,char *outfile); +extern int count_ends(NODE **hash_table); +extern void output_1edge(preEDGE *edge, FILE *fp); +extern void prlRead2edge(char *libfile,char *outfile); +extern int count_edges(NODE **hash_table); +extern int prlFindOrInsertOccurenceInNodeTree_cvg(Kmer kmer, NODE ** T,MEM_MANAGER *node_mem_manager); +extern void prlDestroyNodeHash(NODE **hash_table); +extern void annotFileTrans(char *infile,char *outfile); +extern void prlLoadPath(char *graphfile); +extern void misCheck(char *infile,char *outfile); +extern int uniqueLenSearch(unsigned int *len_array,unsigned int *flag_array,int num,unsigned int target); +extern int cmp_vertex(const void *a,const void *b); +extern void linkContig2Vts(); +extern int bisearch(VERTEX *vts,int num,Kmer target); +extern int connectByPE_gradPatch(FILE *fp1,FILE *fp2,int peGrad,char *line1,char *line2); +extern void scaftiging(char *graphfile,int len_cut); +extern void gapFilling(char *graphfile,int cut_len); +extern ARC *getArcBetween(unsigned int from_ed, unsigned int to_ed); +extern void bubblePinch(double simiCutoff,char *outfile,int M); +extern void linearConcatenate(); +extern unsigned char setArcMulti(unsigned int from_ed,unsigned int to_ed,unsigned char value); +extern ARC *allocateArc(unsigned int edgeid); +extern void cutTipsInGraph(int cutLen, boolean strict); +extern ARC *deleteArc(ARC *arc_list,ARC *arc); +extern void compactEdgeArray(); +extern void dismissArc(ARC *arc); +extern void createArcMemo(); +extern ARC *getArcBetween(unsigned int from_ed, unsigned int to_ed); +extern ARC *allocateArc(unsigned int edgeid); +extern void unlink2prevUncertain(NODE *node,char ch,boolean smaller); +extern char firstCharInKmer(Kmer kmer); +extern void writeChar2tightString(char nt,char *tightSeq,int pos); +extern Kmer reverseComplementVerbose(Kmer word,int overlap); +extern Kmer KmerPlus(Kmer prev,char ch); +extern void output_heavyArcs(char 
*outfile); +extern preARC *allocatePreArc(unsigned int edgeid); +extern void destroyPreArcMem(); +extern void traceAlongArc(unsigned int destE,unsigned int currE,int max_steps,int min,int max,int index,int len,int *num_route); +extern void freeContig_array(); +extern void output_scafSeq(char *graphfile,int len_cut); +extern void putArcInHash(unsigned int from_ed,unsigned int to_ed); +extern boolean DoesArcExist(unsigned int from_ed,unsigned int to_ed); +extern void recordArcInHash(); +extern void destroyArcHash(); +extern void removeWeakEdges(int lenCutoff,unsigned int multiCutoff); +extern void createArcLookupTable(); +extern void deleteArcLookupTable(); +extern void putArc2LookupTable(unsigned int from_ed,ARC *arc); +extern void removeArcInLookupTable(unsigned int from_ed,unsigned int to_ed); +extern ARC *arcCount(unsigned int edgeid,unsigned int *num); +extern void mapFileTrans(char *infile); +extern void solveReps(); +extern void removeDeadArcs(); +extern void destroyArcMem(); +extern int count_link2prevB(NODE *node); +extern int count_link2nextB(NODE *node); +extern void getCntsInFile(char *infile); +extern void scafByCntInfo(char *infile); +extern CONNECT *add1Connect(unsigned int e1, unsigned int e2, int gap, int weight,boolean inherit); +extern void getScaff(char *infile); +extern void traceAlongMaskedCnt(unsigned int destE,unsigned int currE,int max_steps,int min,int max,int index,int len,int *num_route); +extern void createPreArcMemManager(); +extern boolean loadPathBin(char *graphfile); +extern void analyzeTips(NODE **hash_table, char *graphfile); +extern void recordArcsInLookupTable(); +extern FILE *multiFileRead1seq(char *src_seq, char *src_name, int *len_seq, FILE *fp,FILE *freads); +extern void multiFileSeqpar(FILE *fp); +extern long long multiFileParse(int *max_leg, int *min_leg,int *max_name_leg, FILE *fp); +extern CONNECT *getCntBetween(unsigned int from_ed, unsigned int to_ed); +extern void createCntMemManager(); +extern void destroyConnectMem(); 
+extern void createCntLookupTable(); +extern void deleteCntLookupTable(); +extern void putCnt2LookupTable(unsigned int from_c,CONNECT *cnt); +extern int prlFindOrInsertOccurenceInEdonTree(Kmer kmer, EDON ** T,MEM_MANAGER *node_mem_manager); +extern EDON *SplayEdonTree(EDON * T,Kmer kmer); +extern void prlDestroyEdonHash(EDON **hash_table); +extern void prlRead2Ctg(char *seqfile,char *outfile); +extern void prlLongRead2Ctg(char *libfile,char *outfile); +extern boolean prlContig2nodes(char *grapfile,int len_cut); +extern void scan_libInfo(char *libfile); +extern int getMaxLongReadLen(int num_libs); +extern void free_libs(); +extern boolean read1seqInLib(char *src_seq, char *src_name, int *len_seq, + int *libNo,boolean pair,unsigned char purpose); +extern NODE **prlEdge2nodes(char *grapfile); +extern void prlRead2graph(char *libfile,NODE **hash_table,char *outfile); +extern void save4laterSolve(); +extern void solveRepsAfter(); +extern void free_pe_mem(); +extern void alloc_pe_mem(int gradsCounter); +extern NODE *searchNodeTree(NODE * T,Kmer kmer); +extern EDON *searchEdonTree(EDON * T,Kmer kmer); +extern void prlDestroyPreArcMem(); +extern preARC *prlAllocatePreArc(unsigned int edgeid,MEM_MANAGER *manager); +extern boolean prlRead2HashTable(char *libfile,char *outfile); +extern void free_allSets(); +extern void removeSingleTips(); +extern void removeMinorTips(); +extern void kmer2edges(char *outfile); +extern void output_vertex(char *outfile); +extern boolean prlRead2HashTable(char *libfile,char *outfile); +extern void Links2Scaf(char *infile); +extern void PE2Links(char *infile); +extern void basicContigInfo(char *infile); +extern unsigned int getTwinCtg(unsigned int ctg); +extern boolean isSmallerThanTwin(unsigned int ctg); +extern boolean isLargerThanTwin(unsigned int ctg); +extern boolean isSameAsTwin(unsigned int ctg); +extern boolean loadMarkerBin(char *graphfile); +extern void readsCloseGap(char *graphfile); +extern void prlReadsCloseGap(char *graphfile); 
+extern void locateReadOnScaf(char *graphfile); +extern unsigned int getTwinEdge(unsigned int edge); +extern boolean EdSmallerThanTwin(unsigned int edge); +extern boolean EdLargerThanTwin(unsigned int edge); +extern boolean EdSameAsTwin(unsigned int edge); +extern void removeLowCovEdges(int lenCutoff,unsigned short covCutoff); +extern int localGraph(READNEARBY *rdArray,int num,CTGinSCAF *ctg1,CTGinSCAF *ctg2, + int origOverlap,Kmer *kmerCtg1,Kmer *kmerCtg2, + int overlap,DARRAY *gapSeqArray,char *seqCtg1,char *seqCtg2,char *seqGap); + + diff --git a/fusion/inc/extfunc2.h b/fusion/inc/extfunc2.h new file mode 100755 index 0000000..cf64e20 --- /dev/null +++ b/fusion/inc/extfunc2.h @@ -0,0 +1,7 @@ +#ifndef _MEM_MANAGER +#define _MEM_MANAGER +extern MEM_MANAGER *createMem_manager(int num_items,size_t unit_size); +extern void *getItem(MEM_MANAGER *mem_Manager); +extern void returnItem(MEM_MANAGER *mem_Manager,void *); +extern void freeMem_manager(MEM_MANAGER *mem_Manager); +#endif diff --git a/fusion/inc/extvab.h b/fusion/inc/extvab.h new file mode 100755 index 0000000..a0baca8 --- /dev/null +++ b/fusion/inc/extvab.h @@ -0,0 +1,92 @@ +/*************************************************************************** + * Title: extvab.h + * Author: Hongmei Zhu + * Created: Jun. 2007 + * Last modified: May. 2009 + * + * All Rights Reserved + * See file LICENSE for details. 
+ ***************************************************************************/ +/*** global variables ****/ +extern int overlaplen; +extern int inGraph; +extern long long n_ban; +extern Kmer WORDFILTER; +extern boolean globalFlag; +extern int thrd_num; + +extern int verbosity; +extern char verboseStr[verboseBufSize]; + +/**** reads info *****/ +extern long long n_solexa; +extern long long prevNum; +extern int ins_size_var; +extern PE_INFO *pes; +extern int maxReadLen; +extern int maxReadLen4all; +extern int minReadLen; +extern int maxNameLen; +extern int num_libs; +extern LIB_INFO *lib_array; +extern int libNo; +extern long long readNumBack; +extern int gradsCounter; +/*** used for pregraph *****/ +extern MEM_MANAGER *prearc_mem_manager; //also used in scaffolding +extern MEM_MANAGER **preArc_mem_managers; +extern boolean deLowKmer; +extern boolean deLowEdge; +extern KmerSet **KmerSets; // also used in mapping +extern KmerSet **KmerSetsPatch; + +extern spcKmerSet *spcSet; + +/**** used for contiging ****/ +extern boolean repsTie; +extern long long arcCounter; +extern unsigned int num_ed; +extern unsigned int num_ed_limit; +extern unsigned int extraEdgeNum; +extern EDGE *edge_array; +extern VERTEX *vt_array; +extern MEM_MANAGER *rv_mem_manager; +extern MEM_MANAGER *arc_mem_manager; +extern unsigned int num_vt; +extern int len_bar; +extern ARC **arcLookupTable; +extern long long *markersArray; +/***** used for scaffolding *****/ +extern MEM_MANAGER *cn_mem_manager; +extern unsigned int num_ctg; +extern unsigned int *index_array; +extern CONTIG *contig_array; +extern int lineLen; +extern int weakPE; +extern long long newCntCounter; +extern CONNECT **cntLookupTable; +extern unsigned int ctg_short; +extern int cvgAvg; +extern boolean orig2new; +/**** used for gapFilling ****/ +extern DARRAY *readSeqInGap; +extern DARRAY *gapSeqDarray; +extern DARRAY **darrayBuf; +extern int fillGap; +/**** used for searchPath *****/ +extern int maxSteps; +extern int num_trace; +extern 
unsigned int**found_routes; +extern unsigned int*so_far; +extern int max_n_routes; +extern boolean maskRep; +extern int GLDiff; +extern int initKmerSetSize; +extern char *shortrdsfile; +extern char *graphfile; +extern double OverlapPercent ; +extern double ConflPercent ; +extern double close_threshold; +extern int bund_threshold; +extern char *ctg_file; +//extern boolean large_kmer; diff --git a/fusion/inc/fib.h b/fusion/inc/fib.h new file mode 100755 index 0000000..40ac9d3 --- /dev/null +++ b/fusion/inc/fib.h @@ -0,0 +1,81 @@ +/* +Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk) + + This file is part of Velvet. + + Velvet is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + Velvet is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Velvet; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*/ +/*- + * Copyright 1997, 1998-2003 John-Mark Gurney. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: fib.h,v 1.9 2007/04/24 12:16:41 zerbino Exp $ + * + */ + +#ifndef _FIB_H_ +#define _FIB_H_ + +//#include "globals.h" +#include +#include "def2.h" + +typedef Coordinate(*voidcmp) (unsigned int , unsigned int); + +/* functions for key heaps */ +boolean fh_isempty(FibHeap *); +FibHeap *fh_makekeyheap(void); +FibHeapNode *fh_insertkey(FibHeap *, Coordinate, unsigned int); +Coordinate fh_minkey(FibHeap *); +Coordinate fh_replacekey(FibHeap *, FibHeapNode *, Coordinate); +unsigned int fh_replacekeydata(FibHeap *, FibHeapNode *, Coordinate, unsigned int); + +/* functions for unsigned int * heaps */ +FibHeap *fh_makeheap(void); +voidcmp fh_setcmp(FibHeap *, voidcmp); +unsigned int fh_setneginf(FibHeap *, unsigned int); +FibHeapNode *fh_insert(FibHeap *, unsigned int); + +/* shared functions */ +unsigned int fh_extractmin(FibHeap *); +unsigned int fh_min(FibHeap *); +unsigned int fh_replacedata(FibHeapNode *, unsigned int); +unsigned int fh_delete(FibHeap *, FibHeapNode *); +void fh_deleteheap(FibHeap *); +FibHeap *fh_union(FibHeap *, FibHeap *); + +#endif /* _FIB_H_ */ diff --git a/fusion/inc/fibHeap.h b/fusion/inc/fibHeap.h new file mode 100755 index 0000000..e4adbb3 --- /dev/null +++ 
b/fusion/inc/fibHeap.h @@ -0,0 +1,43 @@ +/* +Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk) + + This file is part of Velvet. + + Velvet is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + Velvet is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Velvet; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*/ +#ifndef _FIBHEAP_H_ +#define _FIBHEAP_H_ + +FibHeap *newFibHeap(); + +FibHeapNode *insertNodeIntoHeap(FibHeap * heap, Coordinate key, + unsigned int node); + +Coordinate minKeyOfHeap(FibHeap * heap); + +Coordinate replaceKeyInHeap(FibHeap * heap, FibHeapNode * node, + Coordinate newKey); + +void replaceValueInHeap(FibHeapNode * node, unsigned int newValue); + +unsigned int removeNextNodeFromHeap(FibHeap * heap); + +void *destroyNodeInHeap(FibHeapNode * node, FibHeap * heap); + +void destroyHeap(FibHeap * heap); + +boolean IsHeapEmpty(FibHeap *heap); +#endif diff --git a/fusion/inc/fibpriv.h b/fusion/inc/fibpriv.h new file mode 100755 index 0000000..651a3da --- /dev/null +++ b/fusion/inc/fibpriv.h @@ -0,0 +1,110 @@ +/* +Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk) + + This file is part of Velvet. + + Velvet is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + Velvet is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Velvet; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*/ +/*- + * Copyright 1997, 1999-2003 John-Mark Gurney. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $Id: fibpriv.h,v 1.10 2007/10/09 09:56:46 zerbino Exp $ + * + */ + +#ifndef _FIBPRIV_H_ +#define _FIBPRIV_H_ + +#include "def2.h" + +/* + * specific node operations + */ +struct fibheap_el { + int fhe_degree; + boolean fhe_mark; + FibHeapNode *fhe_p; + FibHeapNode *fhe_child; + FibHeapNode *fhe_left; + FibHeapNode *fhe_right; + Coordinate fhe_key; + unsigned int fhe_data; +}; + +static FibHeapNode *fhe_newelem(struct fibheap *); +static void fhe_initelem(FibHeapNode *); +static void fhe_insertafter(FibHeapNode * a, FibHeapNode * b); +static inline void fhe_insertbefore(FibHeapNode * a, FibHeapNode * b); +static FibHeapNode *fhe_remove(FibHeapNode * a); + +/* + * global heap operations + */ +struct fibheap { + Coordinate(*fh_cmp_fnct) (unsigned int, unsigned int); + MEM_MANAGER *nodeMemory; + IDnum fh_n; + IDnum fh_Dl; + FibHeapNode **fh_cons; + FibHeapNode *fh_min; + FibHeapNode *fh_root; + unsigned int fh_neginf; + boolean fh_keys:1; +}; + +static void fh_initheap(FibHeap *); +static void fh_insertrootlist(FibHeap *, FibHeapNode *); +static void fh_removerootlist(FibHeap *, FibHeapNode *); +static void fh_consolidate(FibHeap *); +static void fh_heaplink(FibHeap * h, FibHeapNode * y, FibHeapNode * x); +static void fh_cut(FibHeap *, FibHeapNode *, FibHeapNode *); +static void fh_cascading_cut(FibHeap *, FibHeapNode *); +static FibHeapNode *fh_extractminel(FibHeap *); +static void fh_checkcons(FibHeap * h); +static void fh_destroyheap(FibHeap * h); +static int fh_compare(FibHeap * h, FibHeapNode * a, FibHeapNode * b); +static int fh_comparedata(FibHeap * h, Coordinate key, + unsigned int data, FibHeapNode * b); +static void fh_insertel(FibHeap * h, FibHeapNode * x); + +/* + * general functions + */ +static inline IDnum ceillog2(IDnum a); + +#endif /* _FIBPRIV_H_ */ diff --git a/fusion/inc/general.h b/fusion/inc/general.h new file mode 100755 index 0000000..ba52eb5 --- /dev/null +++ b/fusion/inc/general.h @@ -0,0 +1,89 @@ +/* + * Filename: general.h + * + * + 
* Description: + * Basic functions + * + * Created on: Feb 8, 2010 + * Author: Ruibang Luo, BGI + * + * History: + * 1. + */ + +#pragma once +#ifndef GENERAL_H_AQUA_ +#define GENERAL_H_AQUA_ + +#include + +//Useful Variables************************************************************* +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#define FN_SIZE 2048 +//***************************************************************************** + +//Types************************************************************************ +typedef unsigned int uint; +typedef unsigned char uchar; +typedef unsigned short ushort; +typedef unsigned long ulong; +typedef unsigned long long ulonglong; + +typedef unsigned char BYTE; +typedef unsigned short WORD; +typedef unsigned int DWORD; + +typedef unsigned char u8_t; +typedef unsigned short u16_t; +typedef unsigned int u32_t; +typedef unsigned long long u64_t; + +typedef char * chptr; + +//***************************************************************************** + +//Debugging******************************************************************** +//Verbose system: a message tagged with `level` is printed to stderr only when (verbosity >> level) is non-zero. NOTE(review): the macros below expand to bare { } blocks, so `if (x) die(...); else ...` does not compile. +//Verbosity should be separated into 4 levels: 0, 1, 2, 3 +#define VERBOSITY_BOTTOM 0 +#define VERBOSITY_TOP 4 +int ModifyVerbosity(const int); +#define verboseBufSize 16384 + +#define ModVerboseStrAndVerbose(level, ...) \ + {\ + if(verbosity >> level)\ + {\ + snprintf(verboseStr, verboseBufSize, ##__VA_ARGS__);\ + fprintf(stderr,"[%s]:%s\n",__FUNCTION__,verboseStr);\ + }\ + } +#define mvnv(level, ...) ModVerboseStrAndVerbose(level, ##__VA_ARGS__) /* shorthand for ModVerboseStrAndVerbose */ +#define die(...) \ + {\ + ModVerboseStrAndVerbose(0, ##__VA_ARGS__);\ + fprintf(stderr,"Program terminated.\n");\ + exit(EXIT_FAILURE);\ + } +#define sigdie(sig, ...) \ + {\ + ModVerboseStrAndVerbose(0, ##__VA_ARGS__);\ + fprintf(stderr,"Program terminated.\n");\ + exit(sig);\ + } +#define perrdie(...) 
\ + {\ + ModVerboseStrAndVerbose(0, ##__VA_ARGS__);\ + perror("");\ + fprintf(stderr,"Program terminated.\n");\ + exit(EXIT_FAILURE);\ + } +#define mk \ +{\ + fprintf(stderr, "DBG Marker @ %s:%d\n", __FUNCTION__, __LINE__);\ +} + +#endif diff --git a/fusion/inc/global.h b/fusion/inc/global.h new file mode 100755 index 0000000..5e7c71d --- /dev/null +++ b/fusion/inc/global.h @@ -0,0 +1,74 @@ +int overlaplen=25; /* NOTE(review): this header defines (and initialises) objects instead of declaring them extern, and has no include guard; including it from more than one translation unit gives multiple-definition errors */ +int verbosity=3; /* with the (verbosity >> level) test in ModVerboseStrAndVerbose, 3 lets levels 0 and 1 print */ +char verboseStr[verboseBufSize]; /* buffer that ModVerboseStrAndVerbose (general.h) formats into */ +int inGraph; +long long n_ban; +long long n_solexa=0; +long long prevNum=0; +int ins_size_var=20; +PE_INFO *pes=NULL; +MEM_MANAGER *rv_mem_manager=NULL; +MEM_MANAGER *cn_mem_manager=NULL; +MEM_MANAGER *arc_mem_manager=NULL; +unsigned int num_vt=0; +unsigned int **found_routes=NULL; +unsigned int *so_far=NULL; +int max_n_routes = 10; +int num_trace; +Kmer WORDFILTER; +unsigned int num_ed=0; +unsigned int num_ctg=0; +unsigned int num_ed_limit; +unsigned int extraEdgeNum; +EDGE *edge_array=NULL; +VERTEX *vt_array=NULL; +unsigned int *index_array=NULL; +CONTIG *contig_array=NULL; +int lineLen; +int len_bar=100; +int weakPE=3; +int fillGap=0; +boolean globalFlag; +long long arcCounter; +MEM_MANAGER *prearc_mem_manager=NULL; +MEM_MANAGER **preArc_mem_managers=NULL; +int maxReadLen=0; +int maxReadLen4all=0; +int minReadLen=0; +int maxNameLen=0; +ARC **arcLookupTable=NULL; +long long *markersArray=NULL; +boolean deLowKmer=0; +boolean deLowEdge=1; +long long newCntCounter; +boolean repsTie=0; +CONNECT **cntLookupTable=NULL; +int num_libs=0; +LIB_INFO *lib_array=NULL; +int libNo=0; +long long readNumBack; +int gradsCounter; +unsigned int ctg_short=0; +int thrd_num=8; +int cvgAvg=0; +KmerSet **KmerSets=NULL; +KmerSet **KmerSetsPatch=NULL; + +spcKmerSet *spcSet = NULL; + +DARRAY *readSeqInGap=NULL; +DARRAY *gapSeqDarray=NULL; +DARRAY **darrayBuf; +boolean orig2new; +int maxSteps; +boolean maskRep=1; +int GLDiff=50; +int initKmerSetSize = 0; +char *shortrdsfile; +char *graphfile; +double OverlapPercent = 0.05; +double 
ConflPercent = 0.05; +double close_threshold = 0.1; +int bund_threshold=5; +char *ctg_file=NULL; +//boolean large_kmer=0; diff --git a/fusion/inc/newhash.h b/fusion/inc/newhash.h new file mode 100644 index 0000000..6a1fd1d --- /dev/null +++ b/fusion/inc/newhash.h @@ -0,0 +1,122 @@ +#ifndef __NEW_HASH_RJ +#define __NEW_HASH_RJ + +#ifndef K_LOAD_FACTOR +#define K_LOAD_FACTOR 0.75 +#endif + +#define MAX_KMER_COV 63 +#define EDGE_BIT_SIZE 6 +#define EDGE_XOR_MASK 0x3FU +#define LINKS_BITS 0x00FFFFFFU + +#define get_kmer_seq(mer) ((mer).seq) +#define set_kmer_seq(mer, val) ((mer).seq = val) + +#define get_kmer_left_cov(mer, idx) (((mer).l_links>>((idx)*EDGE_BIT_SIZE))&EDGE_XOR_MASK) +#define set_kmer_left_cov(mer, idx, val) ((mer).l_links = ((mer).l_links&(~(EDGE_XOR_MASK<<((idx)*EDGE_BIT_SIZE)))) | (((val)&EDGE_XOR_MASK)<<((idx)*EDGE_BIT_SIZE)) ) +#define get_kmer_left_covs(mer) (get_kmer_left_cov(mer, 0) + get_kmer_left_cov(mer, 1) + get_kmer_left_cov(mer, 2) + get_kmer_left_cov(mer, 3)) + +#define get_kmer_right_cov(mer, idx) (((mer).r_links>>((idx)*EDGE_BIT_SIZE))&EDGE_XOR_MASK) +#define set_kmer_right_cov(mer, idx, val) ((mer).r_links = ((mer).r_links&(~(EDGE_XOR_MASK<<((idx)*EDGE_BIT_SIZE)))) | (((val)&EDGE_XOR_MASK)<<((idx)*EDGE_BIT_SIZE)) ) +#define get_kmer_right_covs(mer) (get_kmer_right_cov(mer, 0) + get_kmer_right_cov(mer, 1) + get_kmer_right_cov(mer, 2) + get_kmer_right_cov(mer, 3)) + + +#define is_kmer_entity_null(flags, idx) ((flags)[(idx)>>4]>>(((idx)&0x0f)<<1)&0x01) +#define is_kmer_entity_del(flags, idx) ((flags)[(idx)>>4]>>(((idx)&0x0f)<<1)&0x02) +#define set_kmer_entity_null(flags, idx) ((flags)[(idx)>>4] |= (0x01u<<(((idx)&0x0f)<<1))) +#define set_kmer_entity_del(flags, idx) ((flags)[(idx)>>4] |= (0x02u<<(((idx)&0x0f)<<1))) +#define clear_kmer_entity_null(flags, idx) ((flags)[(idx)>>4] &= ~(0x01u<<(((idx)&0x0f)<<1))) +#define clear_kmer_entity_del(flags, idx) ((flags)[(idx)>>4] &= ~(0x02u<<(((idx)&0x0f)<<1))) +#define exists_kmer_entity(flags, idx) 
(!((flags)[(idx)>>4]>>(((idx)&0x0f)<<1)&0x03)) + + +typedef struct kmer_st +{ + Kmer seq; + ubyte4 l_links; // serves as edgeID since make_edge; get_kmer_left_cov/set_kmer_left_cov access it as four EDGE_BIT_SIZE-bit counters + ubyte4 r_links:4*EDGE_BIT_SIZE; /* four EDGE_BIT_SIZE-bit counters, indexed 0..3 by get_kmer_right_cov/set_kmer_right_cov */ + ubyte4 linear:1; + ubyte4 deleted:1; + ubyte4 checked:1; + ubyte4 single:1; + ubyte4 twin:2; + ubyte4 inEdge:2; /* 2-bit tag; singleKmer() in localAsm.c stores the caller's flag here, or 3 once both flag 1 and flag 2 have been seen */ +} kmer_t; + +typedef struct kmerSet_st +{ + kmer_t *array; + ubyte4 *flags; /* two bits per slot, sixteen slots per ubyte4: 0x01 = null, 0x02 = deleted (see the *_kmer_entity_* macros) */ + ubyte8 size; + ubyte8 count; + ubyte8 max; + double load_factor; + ubyte8 iter_ptr; + + ubyte8 searchCnt; + ubyte8 foundCnt; + ubyte8 delCnt; + ubyte8 searchSpcSeedCnt; + ubyte8 getSpcSeedCnt; + ubyte8 levelGet[3]; + +} KmerSet; + +typedef struct kmer_pt +{ + kmer_t *node; + Kmer kmer; + boolean isSmaller; + struct kmer_pt *next; +}KMER_PT; + +//////////////////////////////////////////////////////////////// spaced seed + +typedef struct spaced_base +{ + ubyte2 spaced_bases:14; + //ubyte2 repeat:1; + //ubyte4 edgeID; + kmer_t *large_kmer; + struct spaced_base *next; +}spcBase; + +typedef struct spaced_kmer +{ + Kmer seq; + struct spaced_base *start; + ubyte4 spaced_base_num; +}spcKmer; + +typedef struct spaced_kmer_set +{ + spcKmer *array; + ubyte4 *flags; + ubyte8 size; + ubyte8 count; + ubyte8 max; + double load_factor; +} spcKmerSet; + +extern spcKmerSet* init_spckmerset(ubyte8 init_size, float load_factor); +extern void buildSpcKmerSet(KmerSet *set, spcKmerSet *spaced_kset); +extern int search_spckmerset(spcKmerSet *set, ubyte8 seq, spcKmer **rs); +extern int put_spckmerset(spcKmerSet *set, Kmer spc_kmer, ubyte2 spaced_bases, kmer_t *node); + +//////////////////////////////////////////////////////////////// spaced seed END + +extern KmerSet* init_kmerset(ubyte8 init_size, float load_factor); +extern int search_kmerset(KmerSet *set, ubyte8 seq, kmer_t **rs); +extern int put_kmerset(KmerSet *set, ubyte8 seq, ubyte left, ubyte right,kmer_t **kmer_p); +extern byte8 count_kmerset(KmerSet *set); +extern void free_Sets(KmerSet **KmerSets,int num); +extern void free_kmerset(KmerSet *set); +extern void 
dislink2nextUncertain(kmer_t *node,char ch,boolean smaller); +extern void dislink2prevUncertain(kmer_t *node,char ch,boolean smaller); + +extern int count_branch2prev(kmer_t *node); +extern int count_branch2next(kmer_t *node); +extern char firstCharInKmer(Kmer kmer); + +#endif diff --git a/fusion/inc/nuc.h b/fusion/inc/nuc.h new file mode 100755 index 0000000..fdfe10b --- /dev/null +++ b/fusion/inc/nuc.h @@ -0,0 +1,13 @@ +/*************************************************************************** + * Title: nuc.h + * Author: Haixu Tang + * Created: Jun. 2002 + * Last modified: May. 2004 + * + * Copyright (c) 2001-2004 The Regents of the University of California + * All Rights Reserved + * See file LICENSE for details. + ***************************************************************************/ +int total_nuc = 16; +char na_name[17] = {'g', 'a', 't', 'c', + 'n', 'r', 'y', 'w', 's', 'm', 'k', 'h', 'b', 'v', 'd', 'x'}; diff --git a/fusion/inc/stack.h b/fusion/inc/stack.h new file mode 100755 index 0000000..c09ed5f --- /dev/null +++ b/fusion/inc/stack.h @@ -0,0 +1,35 @@ +#ifndef __STACK__ +#define __STACK__ + +#include +#include +#include + +typedef struct block_starter +{ + struct block_starter *prev; + struct block_starter *next; +}BLOCK_STARTER; + +typedef struct stack +{ + BLOCK_STARTER *block_list; + int index_in_block; + int items_per_block; + int item_c; + size_t item_size; + BLOCK_STARTER *block_backup; + int index_backup; + int item_c_backup; +}STACK; + +void stackBackup(STACK *astack); +void stackRecover(STACK *astack); +void *stackPush(STACK *astack); +void *stackPop(STACK *astack); +void freeStack(STACK *astack); +void emptyStack(STACK *astack); +STACK *createStack(int num_items,size_t unit_size); + + +#endif diff --git a/fusion/inc/stdinc.h b/fusion/inc/stdinc.h new file mode 100755 index 0000000..9700d5d --- /dev/null +++ b/fusion/inc/stdinc.h @@ -0,0 +1,40 @@ +/*************************************************************************** + + * Title: 
stdinc.h + + * Author: Haixu Tang + + * Created: Jun. 2002 + + * Last modified: May. 2004 + + * + + * Copyright (c) 2001-2004 The Regents of the University of California + + * All Rights Reserved + + * See file LICENSE for details. + + ***************************************************************************/ + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include "def.h" + +#include "general.h" + diff --git a/fusion/inc/types.h b/fusion/inc/types.h new file mode 100755 index 0000000..fdeb4f6 --- /dev/null +++ b/fusion/inc/types.h @@ -0,0 +1,14 @@ +#ifndef __TYPES_RJ +#define __TYPES_RJ + +typedef unsigned long long ubyte8; +typedef unsigned int ubyte4; +typedef unsigned short ubyte2; +typedef unsigned char ubyte; + +typedef long long byte8; +typedef int byte4; +typedef short byte2; +typedef char byte; + +#endif diff --git a/fusion/kmer.c b/fusion/kmer.c new file mode 100755 index 0000000..37c7da1 --- /dev/null +++ b/fusion/kmer.c @@ -0,0 +1,135 @@ +#include "stdinc.h" +#include "newhash.h" +#include "extfunc.h" +#include "extvab.h" + +static unsigned char filter_array[8] = { (unsigned char) 1,((unsigned char) 1) << 1,((unsigned char) 1) << 2,((unsigned char) 1) << 3,((unsigned char) 1) << 4,((unsigned char) 1) << 5,((unsigned char) 1) << 6,((unsigned char) 1) << 7}; /* filter_array[i] == 1 << i: single-bit masks for the 8-bit links/linksB fields */ + + +void link2next(NODE *node,char ch) /* sets bit ch in node->links; if that bit is already set, sets it in node->linksB instead. ch indexes filter_array directly - assumes 0 <= ch <= 3, TODO confirm callers */ +{ + if(node->links & filter_array[(int)ch]) + node->linksB = node->linksB | filter_array[(int)ch]; + else + node->links = node->links | filter_array[(int)ch]; + +} + +unsigned char check_linkB2next(NODE *node,char ch) /* non-zero iff bit ch is set in node->linksB */ +{ + return filter_array[(int)ch]&node->linksB; +} + +unsigned char check_link2next(NODE *node,char ch) /* non-zero iff bit ch is set in node->links */ +{ + return filter_array[(int)ch]&node->links; +} + +void unlink2next(NODE *node,char ch) /* clears bit ch in node->links; node->linksB is not touched */ +{ + node->links = node->links & (~filter_array[(int)ch]); +} + + +void link2prev(NODE *node,char ch) /* same scheme as link2next, applied to bit ch+4 */ +{ + if(node->links & filter_array[ch+4]) + node->linksB = node->linksB | filter_array[ch+4]; + else + 
node->links = node->links | filter_array[ch+4]; +} + +unsigned char check_linkB2prev(NODE *node,char ch) /* non-zero iff bit ch+4 is set in node->linksB */ +{ + return filter_array[ch+4]&node->linksB; +} + +unsigned char check_link2prev(NODE *node,char ch) /* non-zero iff bit ch+4 is set in node->links */ +{ + return filter_array[ch+4]&node->links; +} + +void unlink2prev(NODE *node,char ch) /* clears bit ch+4 in node->links; node->linksB is not touched */ +{ + node->links = node->links & (~filter_array[ch+4]); +} + +int count_link2next(NODE *node) /* number of set bits among bits 0..3 of node->links */ +{ + int num = 0,i; + unsigned char ch = node->links; + + for(i=0;i<4;i++){ + num += ch&0x01; + ch >>= 1; + } + return num; +} + +int count_link2nextB(NODE *node) /* number of set bits among bits 0..3 of node->linksB */ +{ + int num = 0,i; + unsigned char ch = node->linksB; + + for(i=0;i<4;i++){ + num += ch&0x01; + ch >>= 1; + } + return num; +} + +int count_link2prevB(NODE *node) /* number of set bits among bits 4..7 of node->linksB */ +{ + int num = 0,i; + unsigned char ch = node->linksB; + + ch >>= 4; + for(i=0;i<4;i++){ + num += ch&0x01; + ch >>= 1; + } + return num; +} + +int count_link2prev(NODE *node) /* number of set bits among bits 4..7 of node->links */ +{ + int num = 0,i; + unsigned char ch = node->links; + + ch >>= 4; + for(i=0;i<4;i++){ + num += ch&0x01; + ch >>= 1; + } + return num; +} + +Kmer KmerPlus(Kmer prev,char ch) /* returns (prev << 2) + ch, with no masking of the shifted-out high bits */ +{ + Kmer word = prev; + word <<= 2; + word += ch; + return word; +} +Kmer nextKmer(Kmer prev,char ch) /* like KmerPlus, but the shifted word is masked with the global WORDFILTER before ch is added */ +{ + Kmer word = prev; + word <<= 2; + word &= WORDFILTER; + word += ch; + return word; +} + +Kmer prevKmer(Kmer next,char ch) /* drops the lowest 2 bits of next and adds ch shifted to bit position 2*(overlaplen-1) */ +{ + Kmer word = next; + word >>= 2; + word += ((Kmer)ch) << 2*(overlaplen-1); + return word; +} + +char firstCharInKmer(Kmer kmer) /* returns kmer >> 2*(overlaplen-1) converted to char; the trailing "& 3" is commented out, so bits above that 2-bit field are not cleared */ +{ + return (char) (kmer >> 2*(overlaplen-1));// & 3; +} + diff --git a/fusion/lib.c b/fusion/lib.c new file mode 100755 index 0000000..ee40348 --- /dev/null +++ b/fusion/lib.c @@ -0,0 +1,329 @@ +#include "stdinc.h" +#include "newhash.h" +#include "extfunc.h" +#include "extvab.h" + +static char tabs[2][1024]; /* two 1024-byte fields written by splitColumn() */ + +static boolean splitColumn(char *line) +{ + int len = strlen(line); + int i=0,j; + int tabs_n = 0; + + while(i=32&&line[i]<=126&&line[i]!='='){ + j=0; + while(i=32&&line[i]<=126&&line[i]!='='){ + tabs[tabs_n][j++] = line[i]; + i++; + } + tabs[tabs_n][j] = 
'\0'; + tabs_n++; + if(tabs_n==2) + return 1; + } + i++; + } + if(tabs_n==2) + return 1; + else + return 0; +} + +static int cmp_lib(const void *a,const void *b) +{ + LIB_INFO *A,*B; + A = (LIB_INFO *)a; + B = (LIB_INFO *)b; + + if(A->avg_ins>B->avg_ins) + return 1; + else if(A->avg_ins==B->avg_ins) + return 0; + else + return -1; +} + +void scan_libInfo(char *libfile) +{ + FILE *fp; + char line[1024],ch; + int i,j,index; + int libCounter; + boolean flag; + + fp = ckopen(libfile,"r"); + num_libs = 0; + while(fgets(line,1024,fp)){ + ch = line[5]; + line[5] = '\0'; + if(strcmp(line,"[LIB]")==0) + num_libs++; + if(!num_libs){ + line[5] = ch; + flag = splitColumn(line); + if(!flag) + continue; + if(strcmp(tabs[0],"max_rd_len")==0) + maxReadLen = atoi(tabs[1]); + } + } +//count file numbers of each type + lib_array = (LIB_INFO *)ckalloc(num_libs*sizeof(LIB_INFO)); + for(i=0;i0 ? maxLong:maxReadLen; +} + +void free_libs() +{ + + if(!lib_array) + return; + + int i,j; + for(i=0;i*B) + return 1; + else if(*A==*B) + return 0; + else + return -1; +} +int uniqueLenSearch(unsigned int *len_array,unsigned int *flag_array,int num,unsigned int target) +{ + int mid,low,high; + low = 1; + high = num; + + while(low<=high){ + mid = (low+high)/2; + if(len_array[mid]==target) + break; + else if(target>len_array[mid]) + low = mid+1; + else + high = mid-1; + } + if(low>high) + return -1; + //locate the first same length unflaged + return flag_array[mid]++; + +} + +int lengthSearch(unsigned int *len_array,unsigned int *flag_array,int num,unsigned int target) +{ + int mid,low,high,i; + low = 1; + high = num; + + while(low<=high){ + mid = (low+high)/2; + if(len_array[mid]==target) + break; + else if(target>len_array[mid]) + low = mid+1; + else + high = mid-1; + } + if(low>high) + return -1; + //locate the first same length unflaged + if(!flag_array[mid]){ + for(i=mid-1;i>0;i--){ + if(len_array[i]!=len_array[mid]||flag_array[i]) + break; + } + flag_array[i+1] = 1; + return i+1; + }else{ + 
for(i=mid+1;i<=num;i++){ + if(!flag_array[i]) + break; + } + flag_array[i] = 1; + return i; + } + +} + +void quick_sort_int(unsigned int *length_array, int low, int high) +{ + int i, j; + Kmer pivot; + if (low < high) + { + pivot=length_array[low]; + i=low; + j=high; + + while(i=pivot) + j--; + if(i'){ + sscanf(line+7,"%d",&length); + index_array[++index] = length; + length_array[++i] = length; + } + } + num_ctg = index; + orig2new = 1; + //quick_sort_int(length_array,1,num_ctg); + qsort(&(length_array[1]),num_ctg,sizeof(length_array[0]),cmp_int); + //extract unique length + diff_len = 0; + for(i=1;i<=num_ctg;i++){ + for(j=i+1;j<=num_ctg;j++) + if(length_array[j]!=length_array[i]) + break; + length_array[++diff_len] = length_array[i]; + flag_array[diff_len] = i; + i = j - 1; + } + /* + for(i=1;i<=num_ctg;i++) + flag_array[i] = 0; + */ + contig_array = (CONTIG *)ckalloc((num_ctg+1)*sizeof(CONTIG)); + + //load edges + index = 0; + rewind(fp); + while(fgets(line,sizeof(line),fp)!=NULL){ + if(line[0]=='>'){ +// if(overlaplen<=31) +// sscanf(line,">length %u,%llx,%llx,%d,%d",&length,&from_kmer,&to_kmer,&bal_ed,&cvg); +// else + sscanf(line,">length %u,%d,%d",&length,&bal_ed,&cvg); + newIndex = uniqueLenSearch(length_array,flag_array,diff_len,length); + index_array[++index]=newIndex; + + contig_array[newIndex].length = length; + contig_array[newIndex].bal_edge = bal_ed + 1; + contig_array[newIndex].downwardConnect = NULL; + contig_array[newIndex].mask = 0; + contig_array[newIndex].flag = 0; + contig_array[newIndex].arcs = NULL; + contig_array[newIndex].seq = NULL; + contig_array[newIndex].multi = 0; + contig_array[newIndex].inSubGraph = 0; + contig_array[newIndex].cvg = cvg/10; + if(cvg){ + counter += length; + cvgSum += cvg*length; + } + fprintf(out_fp,"%d %d %d\n",index,newIndex,contig_array[newIndex].bal_edge); + } + } + if(counter) + //cvgAvg = cvgSum/counter > 2 ? cvgSum/counter : 3; + cvgAvg = cvgSum/counter/10 > 2 ? 
cvgSum/counter/10 : 3; + + //mark repeats + int bal_i; + /*if(maskRep){ + counter = 0; + for(i=1;i<=num_ctg;i++){ + bal_i = getTwinCtg(i); + if((contig_array[i].cvg+contig_array[bal_i].cvg)>4*cvgAvg){ + contig_array[i].mask = 1; + contig_array[bal_i].mask = 1; + counter += 2; + } + if(isSmallerThanTwin(i)) + i++; + } + printf("average contig coverage : %d. Number of contig(s) masked because of high coverage: %llx\n", + cvgAvg,counter); + }*/ + + counter = 0; + for(i=1;i<=num_ctg;i++){ + if(contig_array[i].mask) + continue; + bal_i = getTwinCtg(i); + if(contig_array[i].lengthmultiplicity = weight; + parc->next = contig_array[from_c].arcs; + contig_array[from_c].arcs = parc; +}*/ + +/*void loadArcs(char *graphfile) +{ + FILE *fp; + char name[256],line[1024]; + unsigned int target,weight; + unsigned int from_ed; + char *seg; + + sprintf(name,"%s.Arc",graphfile); + fp = ckopen(name,"r"); + + createPreArcMemManager(); + arcCounter = 0; + while(fgets(line,sizeof(line),fp)!=NULL){ + seg = strtok(line," "); + from_ed = atoi(seg); + //printf("%d\n",from_ed); + while((seg=strtok(NULL," "))!=NULL){ + target = atoi(seg); + seg = strtok(NULL," "); + weight = atoi(seg); + add1Arc(from_ed,target,weight); + + } + } + printf("%lld arcs loaded\n",arcCounter); + fclose(fp); +}*/ + +void loadContig(char *graphfile) +{ + //fprintf(stderr,"[%s]entering this function\n",__FUNCTION__); + char c,name[256],line[1024],*tightSeq=NULL; + FILE *fp; + int n=0,length,index=-1,edgeno; + unsigned int i; + unsigned int newIndex; + + sprintf(name,"%s.contig",graphfile); + fp = ckopen(name,"r"); + + while(fgets(line,sizeof(line),fp)!=NULL){ + if(line[0]=='>'){ + if(index>=0){ + newIndex = index_array[edgeno]; + contig_array[newIndex].seq = tightSeq; + } + n=0; + index++; + sscanf(line+1,"%d %s %d",&edgeno,name,&length); + //printf("contig %d, length %d\n",edgeno,length); + tightSeq = (char *)ckalloc((length/4+1)*sizeof(char)); + //fprintf(stderr,"[%s]loaded %d.\n",__FUNCTION__,edgeno); + }else{ + int 
tmp_len=strlen(line); + for(i=0;i='a' && line[i]<='z'){ + c = base2int(line[i]-'a'+'A'); + writeChar2tightString(c,tightSeq,n++); + } + else if(line[i]>='A' && line[i]<='Z'){ + c = base2int(line[i]); + writeChar2tightString(c,tightSeq,n++); + } + } + } + + } + if(index>=0){ + newIndex = index_array[edgeno]; + contig_array[newIndex].seq = tightSeq; + } + printf("[%s]input %d contigs\n",__FUNCTION__,index+1); + fclose(fp); + + //printf("the %dth contig with index 107\n",index); +} +void freeContig_array() +{ + if(!contig_array) + return; + + unsigned int i; + + for(i=1;i<=num_ctg;i++){ + if(contig_array[i].seq) + free((void *)contig_array[i].seq); + if(contig_array[i].closeReads) + freeStack(contig_array[i].closeReads); + } + + free((void *)contig_array); + contig_array = NULL; +} +/* +void loadCvg(char *graphfile) +{ + char name[256],line[1024]; + FILE *fp; + int cvg; + unsigned int newIndex,edgeno,bal_ctg; + + sprintf(name,"%s.contigCVG",graphfile); + fp = fopen(name,"r"); + if(!fp){ + printf("contig coverage file %s is not found!\n",name); + return; + } + + while(fgets(line,sizeof(line),fp)!=NULL){ + if(line[0]=='>'){ + sscanf(line+1,"%d %d",&edgeno,&cvg); + newIndex = index_array[edgeno]; + cvg = cvg <= 255 ? 
cvg:255; + contig_array[newIndex].multi = cvg; + bal_ctg = getTwinCtg(newIndex); + contig_array[bal_ctg].multi= cvg; + } + } + fclose(fp); +} +*/ diff --git a/fusion/localAsm.c b/fusion/localAsm.c new file mode 100755 index 0000000..becf882 --- /dev/null +++ b/fusion/localAsm.c @@ -0,0 +1,1629 @@ +#include "stdinc.h" +#include "newhash.h" +#include "extfunc.h" +#include "extvab.h" + +#define CTGendLen 35 // shouldn't larger than max_read_len +#define UPlimit 5000 +#define MaxRouteNum 10 + +static Kmer pubKmer = 0x1b4d65165b; + +static void kmerSet_mark(KmerSet *set); +static void trace4Repeat(Kmer currW,int steps,int min,int max,int *num_route, + KmerSet *kset,Kmer kmerDest,int overlap,Kmer WORDF, + int *traceCounter,int maxRoute,kmer_t **soFarNode,short *multiOccu1,short *multiOccu2, + int *routeLens,char **foundRoutes,char *soFarSeq, + long long *soFarLinks,double *avgLinks); + +static Kmer prevKmerLocal(Kmer next,char ch,int overlap) +{ + Kmer word = next; + word >>= 2; + word += ((Kmer)ch) << 2*(overlap-1); + return word; +} +static Kmer nextKmerLocal(Kmer prev,char ch,Kmer WordFilter) +{ + Kmer word = prev; + word <<= 2; + word &= WordFilter; + word += ch; + return word; +} +static void singleKmer(int t, KmerSet *kset,int flag,Kmer *kmerBuffer,char *prevcBuffer,char *nextcBuffer) +{ + kmer_t *pos; + + put_kmerset(kset, kmerBuffer[t], prevcBuffer[t],nextcBuffer[t],&pos); + if(pos->inEdge==flag) + return; + else if(pos->inEdge==0) + pos->inEdge = flag; + else if(pos->inEdge==1&&flag==2) + pos->inEdge = 3; + else if(pos->inEdge==2&&flag==1) + pos->inEdge = 3; + +} + +static void putKmer2DBgraph(KmerSet *kset,int flag,int kmer_c,Kmer *kmerBuffer,char *prevcBuffer,char *nextcBuffer) +{ + int t; + for(t=0;t0) + prevcBuffer[index] = bal_seq[bal_j-1]; + else + prevcBuffer[index] = InvalidCh; + nextcBuffer[index++] = bal_seq[bal_j+overlap]; + //printf("%dth: %p with %p\n",kmer_c-1,bal_word,hashBanBuffer[kmer_c-1]); + } + } + *kmer_c = index; +} + +static void 
headTightStr(char *tightStr,int length,int start,int headLen,int revS,char *src_seq) +{ + int i,index=0; + + if(!revS){ + for(i=start;i=length-headLen-start;i--) + src_seq[index++] = int_comp(getCharInTightString(tightStr,i)); + } +} + +static int getSeqFromCtg(CTGinSCAF *ctg,boolean fromHead,unsigned int len,int originOverlap,char *src_seq) +{ + unsigned int ctgId = ctg->ctgID; + unsigned int bal_ctg = getTwinCtg(ctgId); + + if(contig_array[ctgId].length<1) + return 0; + unsigned int length = contig_array[ctgId].length + originOverlap; + + len = len < length ? len:length; + if(fromHead){ + if(contig_array[ctgId].seq) + headTightStr(contig_array[ctgId].seq,length,0,len,0,src_seq); + else + headTightStr(contig_array[bal_ctg].seq,length,0,len,1,src_seq); + }else{ + if(contig_array[ctgId].seq) + headTightStr(contig_array[ctgId].seq,length,length-len,len,0,src_seq); + else + headTightStr(contig_array[bal_ctg].seq,length,length-len,len,1,src_seq); + } + return len; +} + + +static KmerSet *readsInGap2DBgraph(READNEARBY *rdArray, int num, CTGinSCAF *ctg1,CTGinSCAF *ctg2,int originOverlap, + Kmer *kmerCtg1,Kmer *kmerCtg2,int overlap,Kmer WordFilter) +{ + int kmer_c; + Kmer *kmerBuffer; + char *nextcBuffer,*prevcBuffer; + int i; + int buffer_size=maxReadLen > CTGendLen ? 
maxReadLen:CTGendLen; + KmerSet *kmerS=NULL; + int lenCtg1; + int lenCtg2; + char *bal_seq; + char *src_seq; + + src_seq = (char *)ckalloc(buffer_size*sizeof(char)); + bal_seq = (char *)ckalloc(buffer_size*sizeof(char)); + + kmerBuffer = (Kmer *)ckalloc(buffer_size*sizeof(Kmer)); + prevcBuffer = (char *)ckalloc(buffer_size*sizeof(char)); + nextcBuffer = (char *)ckalloc(buffer_size*sizeof(char)); + + kmerS = init_kmerset(1024,0.77f); + + for(i=0;ictgID==3733&&ctg2->ctgID==3067){ + for(i=0;i=0;i--){ + ch = kmer&3; + kmer >>= 2; + kmerSeq[i] = ch; + } + for(i=0;iiter_ptr = 0; + while(set->iter_ptr < set->size){ + if(!is_kmer_entity_null(set->flags, set->iter_ptr)){ + in_num = out_num = 0; + rs = set->array + set->iter_ptr; + word = rs->seq; + for(i=0;i<4;i++){ + cvgSingle = get_kmer_left_cov(*rs,i); + if(cvgSingle>0){ + in_num++; + } + cvgSingle = get_kmer_right_cov(*rs,i); + if(cvgSingle>0){ + out_num++; + } + } + + if(rs->single){ + counter++; + } + if(in_num==1&&out_num==1){ + rs->linear = 1; + linear++; + } + } + set->iter_ptr ++; + } + //printf("Allocated %ld node, %ld single nodes, %ld linear\n",(long)count_kmerset(set),counter,linear); +} + +static kmer_t *searchNode(Kmer word,KmerSet *kset,int overlap) +{ + Kmer bal_word = reverseComplement(word,overlap); + kmer_t *node; + boolean found; + if(wordUPlimit){ + /* + if(overlap==19&&kmerDest[0]==pubKmer) + printf("UPlimit\n"); + */ + return; + } + if(steps>max||*num_route>=maxRoute){ + /* + if(overlap==19&&kmerDest[0]==pubKmer) + printf("max steps/maxRoute\n"); + */ + return; + } + Kmer word = reverseComplement(currW,overlap); + boolean isSmaller = currW < word; + int i; + char ch; + unsigned char links; + if(isSmaller) + word = currW; + + kmer_t *node; + boolean found = search_kmerset(kset,word,&node); + if(!found){ + printf("Trace: can't find kmer %llx (rc %llx, input %llx) at step %d\n",word, + reverseComplement(word,overlap),currW,steps); + return; + } + + if(node->twin>1) + return; + if(soFarNode) + 
soFarNode[steps] = node; + + if(steps>0) + soFarSeq[steps-1] = currW&0x03; + + int index,end; + int linkCounter = *soFarLinks; + if(steps>=min&&node->inEdge>1&&(end=searchKmerOnCtg(currW,kmerDest,num))>=0){ + index = *num_route; + if(steps>0) + avgLinks[index] = (double)linkCounter/steps; + else + avgLinks[index] = 0; + //find node that appears more than once in the path + multiOccu[index] = 0; + for(i=0;ideleted = 0; + for(i=0;ideleted){ + multiOccu[index] = 1; + break; + } + soFarNode[i]->deleted = 1; + } + + routeEndOnCtg2[index] = end; + routeLens[index] = steps; + char *array = foundRoutes[index]; + for(i=0;i0;i--){ + ch = nPick1(array,i); + links = get_kmer_right_cov(*node,ch); + if(!links) + continue; + *soFarLinks = linkCounter + links; + word = nextKmerLocal(currW,ch,WORDF); + traceAlongDBgraph(word,steps,min,max,num_route, + kset,kmerDest,num,overlap,WORDF, + foundRoutes,routeEndOnCtg2,routeLens,soFarSeq, + traceCounter,maxRoute,soFarNode,multiOccu, + soFarLinks,avgLinks); + } + }else{ + int array[] = {0,1,2,3}; + for(i=4;i>0;i--){ + ch = nPick1(array,i); + links = get_kmer_left_cov(*node,ch); + if(!links) + continue; + *soFarLinks = linkCounter + links; + word = nextKmerLocal(currW,int_comp(ch),WORDF); + traceAlongDBgraph(word,steps,min,max,num_route, + kset,kmerDest,num,overlap,WORDF, + foundRoutes,routeEndOnCtg2,routeLens,soFarSeq, + traceCounter,maxRoute,soFarNode,multiOccu, + soFarLinks,avgLinks); + } + } +} + +static int searchFgap(KmerSet *kset,CTGinSCAF *ctg1,CTGinSCAF *ctg2,Kmer *kmerCtg1, + Kmer *kmerCtg2,unsigned int origOverlap,int overlap,DARRAY *gapSeqArray, + int len1,int len2,Kmer WordFilter,int *offset1,int *offset2,char *seqGap,int *cut1,int *cut2) +{ + + int i; + int ret = 0; + kmer_t *node,**soFarNode; + int num_route; + int gapLen = ctg2->start - ctg1->end - origOverlap + overlap; + int min = gapLen-GLDiff>0 ? gapLen-GLDiff:0; //0531 + int max = gapLen + GLDiff < 10 ? 
10 : gapLen + GLDiff; + char **foundRoutes; + char *soFarSeq; + int traceCounter; + int *routeEndOnCtg2; + int *routeLens; + boolean *multiOccu; + long long soFarLinks; + double *avgLinks; + + //mask linear internal linear kmer on contig1 end + routeEndOnCtg2 = (int *)ckalloc(MaxRouteNum*sizeof(int)); + routeLens = (int *)ckalloc(MaxRouteNum*sizeof(int)); + multiOccu = (boolean *)ckalloc(MaxRouteNum*sizeof(boolean)); + short *MULTI1 = (short *)ckalloc(MaxRouteNum*sizeof(short)); + short *MULTI2 = (short *)ckalloc(MaxRouteNum*sizeof(short)); + soFarSeq = (char *)ckalloc(max*sizeof(char)); + soFarNode = (kmer_t **)ckalloc((max+1)*sizeof(kmer_t *)); + foundRoutes = (char **)ckalloc(MaxRouteNum*sizeof(char *));; + avgLinks = (double *)ckalloc(MaxRouteNum*sizeof(double));; + for(i=0;i=0;i--){ + + num_route = traceCounter = soFarLinks = 0; + int steps=0; + traceAlongDBgraph(kmerCtg1[i],steps,min,max,&num_route, + kset,kmerCtg2,len2,overlap,WordFilter, + foundRoutes,routeEndOnCtg2,routeLens,soFarSeq, + &traceCounter,MaxRouteNum,soFarNode,multiOccu, + &soFarLinks,avgLinks); + if(num_route>0){ + int m,minEnd=routeEndOnCtg2[0]; + for(m=0;m1){ + for(m=0;m3) + break; + printf("%c",int2base((int)foundRoutes[m][j])); + } + printf(": %4.2f\n",avgLinks[m]); + } + } */ + + num_route = traceCounter = soFarLinks = 0; + steps=0; + trace4Repeat(kmerCtg1[i],steps,min,max,&num_route, + kset,kmerCtg2[minEnd],overlap,WordFilter, + &traceCounter,MaxRouteNum,soFarNode,MULTI1,MULTI2, + routeLens,foundRoutes,soFarSeq,&soFarLinks,avgLinks); + int j,best=0; + int maxLen=routeLens[0]; + double maxLink = avgLinks[0]; + char *pt; + boolean repeat=0,sameLen=1; + int leftMost=max,rightMost=max; + if(num_route<1){ + fprintf(stderr,"After trace4Repeat: non route was found\n"); + continue; + } + if(num_route>1){ + // if multi paths are found, we check on the repeatative occurrences and links/length + for(m=0;m=0&&MULTI2[m]>=0){ + repeat = 1; + leftMost = leftMost>MULTI1[m] ? 
MULTI1[m]:leftMost; + rightMost = rightMost>MULTI2[m] ? MULTI2[m]:rightMost; + } + if(routeLens[m]!=maxLen) + sameLen = 0; + if(routeLens[m]maxLink){ + maxLink = avgLinks[m]; + best = m; + } + } + } + + if(repeat){ + *offset1 = *offset2 = *cut1 = *cut2 = 0; + int index=0; + char ch; + for(j=0;j0||*offset2>0){ + *cut1 = len1-i-1; + *cut2 = minEnd; + //fprintf(stderr,"\n"); + for(m=0;m3) + break; + //fprintf(stderr,"%c",int2base((int)foundRoutes[m][j])); + } + //fprintf(stderr,": %4.2f\n",avgLinks[m]); + } + /* + fprintf(stderr,">Gap (%d + %d) (%d + %d)\n",*offset1,*offset2,*cut1,*cut2); + for(index=0;index<*offset1+*offset2;index++) + fprintf(stderr,"%c",int2base(seqGap[index])); + fprintf(stderr,"\n"); */ + } + + ret = 3; + break; + } + + if(overlap+(len1-i-1)+minEnd-routeLens[best]>(int)origOverlap) + continue; + + ctg1->gapSeqOffset = gapSeqArray->item_c; + ctg1->gapSeqLen = routeLens[best]; + if(!darrayPut(gapSeqArray,ctg1->gapSeqOffset+maxLen/4)) + continue; + pt = (char *)darrayPut(gapSeqArray,ctg1->gapSeqOffset); + /* + printKmer(stderr,kmerCtg1[i],overlap); + fprintf(stderr,"-"); + */ + for(j=0;j3) + break; + writeChar2tightString(foundRoutes[best][j],pt,j); + //fprintf(stderr,"%c",int2base((int)foundRoutes[best][j])); + } + //fprintf(stderr,": GAPSEQ %d + %d, avglink %4.2f\n",len1-i-1,minEnd,avgLinks[best]); + ctg1->cutTail = len1-i-1; + ctg2->cutHead = overlap + minEnd; + ctg2->scaftig_start = 0; + + ret = 1; + break; + /* }if(num_route>1){ + ret = 2; + break; */ + }else{ //mark node which leads to dead end + node = searchNode(kmerCtg1[i],kset,overlap); + if(node) + node->twin = 2; + } + + } + for(i=0;iUPlimit) + return; + if(steps>max||*num_route>=maxRoute) + return; + Kmer word = reverseComplement(currW,overlap); + boolean isSmaller = currW < word; + char ch; + unsigned char links; + int index,i; + + if(isSmaller) + word = currW; + + kmer_t *node; + boolean found = search_kmerset(kset,word,&node); + if(!found){ + printf("Trace: can't find kmer %llx (rc 
%llx, input %llx) at step %d\n",word, + reverseComplement(word,overlap),currW,steps); + return; + } + if(soFarNode) + soFarNode[steps] = node; + if(soFarSeq&&steps>0) + soFarSeq[steps-1] = currW&0x03; + int linkCounter; + if(soFarLinks) + linkCounter = *soFarLinks; + if(steps>=min&&currW==kmerDest){ + index = *num_route; + if(avgLinks&&steps>0) + avgLinks[index] = (double)linkCounter/steps; + else if(avgLinks) + avgLinks[index] = 0; + //find node that appears more than once in the path + if(multiOccu1&&multiOccu2){ + for(i=0;ideleted = 0; + int rightMost=0; + boolean MULTI=0; + for(i=0;ideleted){ + rightMost = rightMostdeleted = 1; + } + if(!MULTI) + multiOccu1[index] = multiOccu2[index] = -1; + else{ + multiOccu2[index] = steps-2-rightMost<0 ? 0:steps-2-rightMost; //[0 steps-2] + for(i=0;ideleted = 0; + int leftMost=steps-2; + for(i=steps;i>=0;i--){ + if(soFarNode[i]->deleted) + leftMost = leftMost>i-1 ? i-1:leftMost; + soFarNode[i]->deleted = 1; + } + multiOccu1[index] = leftMost<0 ? 0:leftMost; //[0 steps-2] + } + } + if(routeLens) + routeLens[index] = steps; + if(soFarSeq){ + char *array = foundRoutes[index]; + for(i=0;i0;i--){ + ch = nPick1(array,i); + links = get_kmer_right_cov(*node,ch); + if(!links) + continue; + if(soFarLinks) + *soFarLinks = linkCounter + links; + word = nextKmerLocal(currW,ch,WORDF); + trace4Repeat(word,steps,min,max,num_route, + kset,kmerDest,overlap,WORDF,traceCounter,maxRoute,soFarNode, + multiOccu1,multiOccu2,routeLens,foundRoutes,soFarSeq, + soFarLinks,avgLinks); + } + }else{ + int array[] = {0,1,2,3}; + for(i=4;i>0;i--){ + ch = nPick1(array,i); + links = get_kmer_left_cov(*node,ch); + if(!links) + continue; + if(soFarLinks) + *soFarLinks = linkCounter + links; + word = nextKmerLocal(currW,int_comp(ch),WORDF); + trace4Repeat(word,steps,min,max,num_route, + kset,kmerDest,overlap,WORDF,traceCounter,maxRoute,soFarNode, + multiOccu1,multiOccu2,routeLens,foundRoutes,soFarSeq, + soFarLinks,avgLinks); + } + } +} + +//found repeat node on 
contig ends +static void maskRepeatNode(KmerSet *kset,Kmer *kmerCtg1, + Kmer *kmerCtg2,int overlap, + int len1,int len2,int max,Kmer WordFilter) +{ + int i; + int num_route,steps; + int min = 1,maxRoute=1; + int traceCounter; + Kmer word,bal_word; + kmer_t *node; + boolean found; + int counter=0; + for(i=0;ibal_word) + word=bal_word; + found = search_kmerset(kset,word,&node); + if(!found||node->linear){ + //printf("Found no node for kmer %llx\n",word); + continue; + } + num_route = traceCounter = 0; + steps=0; + trace4Repeat(word,steps,min,max,&num_route, + kset,word,overlap,WordFilter, + &traceCounter,maxRoute,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL); + if(num_route<1) + continue; + counter++; + node->checked = 1; + } + for(i=0;ibal_word) + word=bal_word; + found = search_kmerset(kset,word,&node); + if(!found||node->linear){ + //printf("Found no node for kmer %llx\n",word); + continue; + } + num_route = traceCounter = 0; + steps=0; + trace4Repeat(word,steps,min,max,&num_route, + kset,word,overlap,WordFilter, + &traceCounter,maxRoute,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL); + if(num_route<1) + continue; + counter++; + node->checked = 1; + } + //printf("MR: %d(%d)\n",counter,len1+len2); +} + +/* +static boolean chopReadFillGap(int len_seq,int overlap,char *src_seq, char *bal_seq, + KmerSet *kset,Kmer WORDF,int *start,int *end,boolean *bal, + Kmer *KmerCtg1,int len1,Kmer *KmerCtg2,int len2,int *index1,int *index2) +{ + int index,j=0,bal_j; + Kmer word,bal_word; + int flag=0,bal_flag=0; + int ctg1start,bal_ctg1start,ctg2end,bal_ctg2end; + int seqStart,bal_start,seqEnd,bal_end; + kmer_t *node; + boolean found; + + if(len_seqlinear&&!node->checked){ + if(!flag&&node->inEdge==1){ + ctg1start = searchKmerOnCtg(word,KmerCtg1,len1); + if(ctg1start>0){ + flag = 1; + seqStart = j + overlap-1; + } + } + if(!bal_flag&&node->inEdge==2){ + bal_ctg2end = searchKmerOnCtg(bal_word,KmerCtg2,len2); + if(bal_ctg2end>0){ + bal_flag = 2; + bal_end = bal_j+overlap-1; + } + } + } + + 
for(j = 1; j <= len_seq - overlap; j ++) { + word = nextKmerLocal(word,src_seq[j-1+overlap],WORDF); + bal_j = len_seq-j-overlap; // j; + bal_word = prevKmerLocal(bal_word,bal_seq[bal_j],overlap); + + if(wordlinear&&!node->checked){ + if(!flag&&node->inEdge==1){ + ctg1start = searchKmerOnCtg(word,KmerCtg1,len1); + if(ctg1start>0){ + flag = 1; + seqStart = j + overlap-1; + } + }else if(flag==1&&node->inEdge==1){ + index = searchKmerOnCtg(word,KmerCtg1,len1); + if(index>ctg1start){ // choose hit closer to gap + ctg1start = index; + seqStart = j + overlap-1; + } + }else if(flag==1&&node->inEdge==2){ + ctg2end = searchKmerOnCtg(word,KmerCtg2,len2); + if(ctg2end>0){ + flag = 3; + seqEnd = j+overlap-1; + break; + } + } + + if(!bal_flag&&node->inEdge==2){ + bal_ctg2end = searchKmerOnCtg(bal_word,KmerCtg2,len2); + if(bal_ctg2end>0){ + bal_flag = 2; + bal_end = bal_j+overlap-1; + } + }else if(bal_flag==2&&node->inEdge==2){ + index = searchKmerOnCtg(bal_word,KmerCtg2,len2); + if(indexinEdge==1){ + bal_ctg1start = searchKmerOnCtg(bal_word,KmerCtg1,len1); + if(bal_ctg1start>0){ + bal_flag = 3; + bal_start = bal_j+overlap-1; + break; + } + } + } + } + if(flag==3){ + *start = seqStart; + *end = seqEnd; + *bal = 0; + *index1 = ctg1start; + *index2 = ctg2end; + return 1; + }else if(bal_flag==3){ + *start = bal_start; + *end = bal_end; + *bal = 1; + *index1 = bal_ctg1start; + *index2 = bal_ctg2end; + return 1; + } + return 0; +} + +static boolean readsCrossGap(READNEARBY *rdArray, int num, int originOverlap,DARRAY *gapSeqArray, + Kmer *kmerCtg1,Kmer *kmerCtg2,int overlap,int len1,int len2, + CTGinSCAF *ctg1,CTGinSCAF *ctg2,KmerSet *kmerS,Kmer WordFilter,int min,int max) +{ + int i,j,start,end,startOnCtg1,endOnCtg2; + char *bal_seq; + char *src_seq; + char *pt; + boolean bal,ret=0,FILL; + + src_seq = (char *)ckalloc(maxReadLen*sizeof(char)); + bal_seq = (char *)ckalloc(maxReadLen*sizeof(char)); + + for(i=0;imax) + continue; + fprintf(stderr,"Read across\n"); + //printf("Filled: K %d, 
ctg1 %d ctg2 %d,start %d end %d\n",overlap,startOnCtg1,endOnCtg2,start,end); + if(overlap+(len1-startOnCtg1-1)+endOnCtg2-(end-start)>(int)originOverlap) + continue; // contig1 and contig2 could not overlap more than origOverlap bases + + ctg1->gapSeqOffset = gapSeqArray->item_c; + ctg1->gapSeqLen = end-start; + if(!darrayPut(gapSeqArray,ctg1->gapSeqOffset+(end-start)/4)) + continue; + pt = (char *)darrayPut(gapSeqArray,ctg1->gapSeqOffset); + for(j=start+1;j<=end;j++){ + if(bal) + writeChar2tightString(bal_seq[j],pt,j-start-1); + else + writeChar2tightString(src_seq[j],pt,j-start-1); + + } + ctg1->cutTail = len1-startOnCtg1-1; + ctg2->cutHead = overlap + endOnCtg2; + ctg2->scaftig_start = 0; + + ret = 1; + break; + } + + free((void*)src_seq); + free((void*)bal_seq); + return ret; +} +*/ +static void kmerSet_markTandem(KmerSet *set,Kmer WordFilter,int overlap); +static boolean readsCrossGap(READNEARBY *rdArray, int num, int originOverlap,DARRAY *gapSeqArray, + Kmer *kmerCtg1,Kmer *kmerCtg2,int overlap, + CTGinSCAF *ctg1,CTGinSCAF *ctg2,KmerSet *kmerS,Kmer WordFilter,int min,int max, + int offset1,int offset2,char *seqGap,char *seqCtg1,char *seqCtg2,int cut1,int cut2); + +int localGraph(READNEARBY *rdArray,int num,CTGinSCAF *ctg1,CTGinSCAF *ctg2, + int origOverlap,Kmer *kmerCtg1,Kmer *kmerCtg2, + int overlap,DARRAY *gapSeqArray,char *seqCtg1,char *seqCtg2,char *seqGap) +{ + /**************** put kmer in DBgraph ****************/ + KmerSet *kmerSet; + Kmer WordFilter = (((Kmer) 1) << (2*overlap)) - 1; +/* + if(ctg1->ctgID==56410&&ctg2->ctgID==61741) + printf("Extract %d reads for gap [%d %d]\n",num,ctg1->ctgID,ctg2->ctgID); +*/ + kmerSet = readsInGap2DBgraph(rdArray,num,ctg1,ctg2,origOverlap, + kmerCtg1,kmerCtg2,overlap,WordFilter); + time_t tt; + time(&tt); +// srand48((int)tt); +/* + int i,j; + for(i=0;i<2;i++){ + int array[] = {0,1,2,3}; + for(j=4;j>0;j--) + fprintf(stderr,"%d ", nPick1(array,j)); + } + fprintf(stderr,"\n"); +*/ + /***************** search path to 
connect contig ends ********/ + int gapLen = ctg2->start - ctg1->end - origOverlap + overlap; + int min = gapLen-GLDiff>0 ? gapLen-GLDiff:0; + int max = gapLen + GLDiff < 10 ? 10 : gapLen + GLDiff; + //count kmer number for contig1 and contig2 ends + int len1,len2; + len1 = CTGendLenctgID].length+origOverlap ? + CTGendLen:contig_array[ctg1->ctgID].length+origOverlap; + len2 = CTGendLenctgID].length+origOverlap ? + CTGendLen:contig_array[ctg2->ctgID].length+origOverlap; + len1 -= overlap-1; + len2 -= overlap-1; + + //int pathNum = 2; + int offset1=0,offset2=0,cut1=0,cut2=0; + int pathNum = searchFgap(kmerSet,ctg1,ctg2,kmerCtg1,kmerCtg2, + origOverlap,overlap,gapSeqArray, + len1,len2,WordFilter,&offset1,&offset2,seqGap,&cut1,&cut2); + + //printf("SF: %d K %d\n",pathNum,overlap); + if(pathNum==0){ + free_kmerset(kmerSet); + return 0; + }else if(pathNum==1){ + free_kmerset(kmerSet); + return 1; + }/* + else{ + printf("ret %d\n",pathNum); + free_kmerset(kmerSet); + return 0; + } */ + + /******************* cross the gap by single reads *********/ + //kmerSet_markTandem(kmerSet,WordFilter,overlap); + maskRepeatNode(kmerSet,kmerCtg1,kmerCtg2,overlap, + len1,len2,max,WordFilter); + boolean found = readsCrossGap(rdArray,num,origOverlap,gapSeqArray, + kmerCtg1,kmerCtg2,overlap,ctg1,ctg2,kmerSet,WordFilter,min,max, + offset1,offset2,seqGap,seqCtg1,seqCtg2,cut1,cut2); + if(found){ + //fprintf(stderr,"read across\n"); + free_kmerset(kmerSet); + return found; + } + else{ + free_kmerset(kmerSet); + return 0; + } + +} + +static void kmerSet_markTandem(KmerSet *set,Kmer WordFilter,int overlap) +{ + kmer_t *rs; + long long counter = 0; + int num_route,steps; + int min=1,max=overlap,maxRoute=1; + int traceCounter; + + set->iter_ptr = 0; + while(set->iter_ptr < set->size){ + if(!is_kmer_entity_null(set->flags, set->iter_ptr)){ + rs = set->array + set->iter_ptr; + if(rs->inEdge>0){ + set->iter_ptr ++; + continue; + } + num_route = traceCounter = 0; + steps=0; + 
trace4Repeat(rs->seq,steps,min,max,&num_route, + set,rs->seq,overlap,WordFilter, + &traceCounter,maxRoute,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL); + if(num_route<1){ + set->iter_ptr ++; + continue; + } + /* + printKmer(stderr,rs->seq,overlap); + fprintf(stderr, "\n"); + */ + rs->checked = 1; + counter++; + } + set->iter_ptr ++; + } +} +/******************* the following is for read-crossing gaps *************************/ + +#define MAXREADLENGTH 100 + +static const int INDEL = 0; +static const int SIM[4][4] = { + {1, 0, 0, 0}, + {0, 1, 0, 0}, + {0, 0, 1, 0}, + {0, 0, 0, 1} +}; +static char fastSequence[MAXREADLENGTH]; +static char slowSequence[MAXREADLENGTH]; + +static int Fmatrix[MAXREADLENGTH + 1][MAXREADLENGTH + 1]; +static int slowToFastMapping[MAXREADLENGTH + 1]; +static int fastToSlowMapping[MAXREADLENGTH + 1]; + +static int max(int A, int B, int C) +{ + A = A>=B ? A:B; + return (A>=C ? A:C); + +} + +static int compareSequences(char * sequence1, char * sequence2, int length1, int length2) +{ + if(length1<1||length2<1||length1>MAXREADLENGTH||length2>MAXREADLENGTH) + return 0; + int i, j; + int Choice1, Choice2, Choice3; + int maxScore; + + for (i = 0; i <= length1; i++) + Fmatrix[i][0] = 0; + for (j = 0; j <= length2; j++) + Fmatrix[0][j] = 0; + + for (i = 1; i <= length1; i++) { + for (j = 1; j <= length2; j++) { + Choice1 = + Fmatrix[i - 1][j - 1] + + SIM[(int) sequence1[i-1]] + [(int) sequence2[j-1]]; + Choice2 = Fmatrix[i - 1][j] + INDEL; + Choice3 = Fmatrix[i][j - 1] + INDEL; + Fmatrix[i][j] = max(Choice1, Choice2, Choice3); + } + } + + maxScore = Fmatrix[length1][length2]; + return maxScore; +} + +static void mapSlowOntoFast(int slowSeqLength,int fastSeqLength) +{ + int slowIndex = slowSeqLength; + int fastIndex = fastSeqLength; + int fastn, slown; + + if (slowIndex == 0) { + slowToFastMapping[0] = fastIndex; + + while (fastIndex >= 0) + fastToSlowMapping[fastIndex--] = 0; + + return; + } + + if (fastIndex == 0) { + while (slowIndex >= 0) + 
slowToFastMapping[slowIndex--] = 0; + + fastToSlowMapping[0] = slowIndex; + + return; + } + + while (slowIndex > 0 && fastIndex > 0) { + fastn = (int) fastSequence[fastIndex-1]; //getCharInTightString(fastSequence,fastIndex-1); + slown = (int) slowSequence[slowIndex-1]; //getCharInTightString(slowSequence,slowIndex-1); + + if (Fmatrix[fastIndex][slowIndex] == + Fmatrix[fastIndex - 1][slowIndex - 1] + + SIM[fastn][slown]) { + fastToSlowMapping[--fastIndex] = --slowIndex; + slowToFastMapping[slowIndex] = fastIndex; + } else if (Fmatrix[fastIndex][slowIndex] == + Fmatrix[fastIndex - 1][slowIndex] + INDEL) + fastToSlowMapping[--fastIndex] = slowIndex - 1; + + else if (Fmatrix[fastIndex][slowIndex] == + Fmatrix[fastIndex][slowIndex - 1] + INDEL) + slowToFastMapping[--slowIndex] = fastIndex - 1; + + else { + printf("compareSequence: Error trace\n"); + fflush(stdout); + abort(); + } + } + + while (slowIndex > 0) + slowToFastMapping[--slowIndex] = -1; + while (fastIndex > 0) + fastToSlowMapping[--fastIndex] = -1; + + slowToFastMapping[slowSeqLength] = + fastSeqLength; + fastToSlowMapping[fastSeqLength] = + slowSeqLength; +} + +static boolean chopReadFillGap(int len_seq,int overlap,char *src_seq, char *bal_seq, + KmerSet *kset,Kmer WORDF,int *start,int *end,boolean *bal, + Kmer *KmerCtg1,int len1,Kmer *KmerCtg2,int len2,int *index1,int *index2) +{ + int index,j=0,bal_j; + Kmer word,bal_word; + int flag=0,bal_flag=0; + int ctg1start,bal_ctg1start,ctg2end,bal_ctg2end; + int seqStart,bal_start,seqEnd,bal_end; + kmer_t *node; + boolean found; + + if(len_seqlinear&&!node->checked){ + if(!flag&&node->inEdge==1){ + ctg1start = searchKmerOnCtg(word,KmerCtg1,len1); + if(ctg1start>=0){ + flag = 1; + seqStart = j + overlap-1; + } + } + if(!bal_flag&&node->inEdge==2){ + bal_ctg2end = searchKmerOnCtg(bal_word,KmerCtg2,len2); + if(bal_ctg2end>=0){ + bal_flag = 2; + bal_end = bal_j+overlap-1; + } + } + } + + for(j = 1; j <= len_seq - overlap; j ++) { + word = 
nextKmerLocal(word,src_seq[j-1+overlap],WORDF); + bal_j = len_seq-j-overlap; // j; + bal_word = prevKmerLocal(bal_word,bal_seq[bal_j],overlap); + + if(wordlinear&&!node->checked){ + if(!flag&&node->inEdge==1){ + ctg1start = searchKmerOnCtg(word,KmerCtg1,len1); + if(ctg1start>=0){ + flag = 1; + seqStart = j + overlap-1; + } + }else if(flag==1&&node->inEdge==1){ + index = searchKmerOnCtg(word,KmerCtg1,len1); + if(index>=0&&index>ctg1start){ // choose hit closer to gap + ctg1start = index; + seqStart = j + overlap-1; + } + }else if(flag==1&&node->inEdge==2){ + ctg2end = searchKmerOnCtg(word,KmerCtg2,len2); + if(ctg2end>=0){ + flag = 3; + seqEnd = j+overlap-1; + break; + } + } + + if(!bal_flag&&node->inEdge==2){ + bal_ctg2end = searchKmerOnCtg(bal_word,KmerCtg2,len2); + if(bal_ctg2end>=0){ + bal_flag = 2; + bal_end = bal_j+overlap-1; + } + }else if(bal_flag==2&&node->inEdge==2){ + index = searchKmerOnCtg(bal_word,KmerCtg2,len2); + if(index>=0&&indexinEdge==1){ + bal_ctg1start = searchKmerOnCtg(bal_word,KmerCtg1,len1); + if(bal_ctg1start>=0){ + bal_flag = 3; + bal_start = bal_j+overlap-1; + break; + } + } + } + } + if(flag==3){ + *start = seqStart; + *end = seqEnd; + *bal = 0; + *index1 = ctg1start; + *index2 = ctg2end; + return 1; + }else if(bal_flag==3){ + *start = bal_start; + *end = bal_end; + *bal = 1; + *index1 = bal_ctg1start; + *index2 = bal_ctg2end; + return 1; + } + return 0; +} + + +static int cutSeqFromTightStr(char *tightStr,int length,int start,int end,int revS,char *src_seq) +{ + int i,index=0; + end = end < length ? end:length-1; + start = start>=0 ? 
start:0; + + if(!revS){ + for(i=start;i<=end;i++) + src_seq[index++] = getCharInTightString(tightStr,i); + } + else{ + for(i=length-1-start;i>=length-end-1;i--) + src_seq[index++] = int_comp(getCharInTightString(tightStr,i)); + } + return end-start+1; +} + +static int cutSeqFromCtg(unsigned int ctgID,int start,int end, char *sequence,int originOverlap) +{ + + unsigned int bal_ctg = getTwinCtg(ctgID); + if(contig_array[ctgID].length<1) + return 0; + int length = contig_array[ctgID].length+originOverlap; + if(contig_array[ctgID].seq) + return cutSeqFromTightStr(contig_array[ctgID].seq,length,start,end,0,sequence); + else + return cutSeqFromTightStr(contig_array[bal_ctg].seq,length,start,end,1,sequence); + +} + +static int cutSeqFromRead(char *src_seq,int length,int start,int end,char *sequence) +{ + if(end>=length) + printf("******: end %d length %d\n",end,length); + end = end=0 ? start:0; + int i; + for(i=start;i<=end;i++) + sequence[i-start] = src_seq[i]; + return end-start+1; +} + +static void printSeq(FILE *fo,char *seq,int len) +{ + int i; + for(i=0;i 100 ? maxReadLen:100; + int length = contig_array[ctg1->ctgID].length+originOverlap; + if(buffer_size>offset1){ + lenCtg1 = cutSeqFromCtg(ctg1->ctgID,length-cut1-(buffer_size-offset1),length-1-cut1,seqCtg1,originOverlap); + for(i=0;ictgID].length+originOverlap; + if(buffer_size>offset2){ + lenCtg2 = cutSeqFromCtg(ctg2->ctgID,cut2,buffer_size-offset2-1+cut2,&(seqCtg2[offset2]),originOverlap); + for(i=0;i0||offset2>0){ + for(i=0;imax) + continue; + if(overlap+(len1-startOnCtg1-1)+endOnCtg2-(end-start)>(int)originOverlap) + continue; // contig1 and contig2 could not overlap more than origOverlap bases + START[i] = start; + END[i] = end; + INDEX1[i] = startOnCtg1; + INDEX2[i] = endOnCtg2; + BAL[i] = bal; + + int matchLen = 2*overlap<(end-start+overlap) ? 
2*overlap:(end-start+overlap); + int match; + int alignLen = matchLen; + //compare the left of hit kmer on ctg1 + //int ctgLeft = (contig_array[ctg1->ctgID].length+originOverlap)-(len1+overlap-1)+startOnCtg1; + int ctgLeft = (lenCtg1)-(len1+overlap-1)+startOnCtg1; + int readLeft = start-overlap+1; + int cmpLen = ctgLeftctgID,ctgLeft-cmpLen,ctgLeft-1,fastSequence,originOverlap); + cutSeqFromRead(seqCtg1,lenCtg1,ctgLeft-cmpLen,ctgLeft-1,fastSequence); + if(!bal) + cutSeqFromRead(src_seq,rdArray[i].len,readLeft-cmpLen,readLeft-1,slowSequence); + else + cutSeqFromRead(bal_seq,rdArray[i].len,readLeft-cmpLen,readLeft-1,slowSequence); + match = compareSequences(fastSequence,slowSequence, cmpLen, cmpLen); + + alignLen += cmpLen; + matchLen += match; + + //compare the right of hit kmer on ctg1 + int ctgRight = len1-startOnCtg1-1; + + cmpLen = ctgRight<(rdArray[i].len-start-1) ? ctgRight:(rdArray[i].len-start-1); + cmpLen = cmpLen<=MAXREADLENGTH ? cmpLen:MAXREADLENGTH; + //cutSeqFromCtg(ctg1->ctgID,ctgLeft+overlap,ctgLeft+overlap+cmpLen-1,fastSequence,originOverlap); + cutSeqFromRead(seqCtg1,lenCtg1,ctgLeft+overlap,ctgLeft+overlap+cmpLen-1,fastSequence); + if(!bal) + cutSeqFromRead(src_seq,rdArray[i].len,start+1,start+cmpLen,slowSequence); + else + cutSeqFromRead(bal_seq,rdArray[i].len,start+1,start+cmpLen,slowSequence); + match = compareSequences(fastSequence,slowSequence, cmpLen, cmpLen); + //fprintf(stderr,"%d -- %d\n",match,cmpLen); + + alignLen += cmpLen; + matchLen += match; + + //compare the left of hit kmer on ctg2 + ctgLeft = endOnCtg2; + readLeft = end-overlap+1; + cmpLen = ctgLeftctgID,endOnCtg2-cmpLen,endOnCtg2-1,fastSequence,originOverlap); + cutSeqFromRead(seqCtg2,lenCtg2,endOnCtg2-cmpLen,endOnCtg2-1,fastSequence); + if(!bal) + cutSeqFromRead(src_seq,rdArray[i].len,readLeft-cmpLen,readLeft-1,slowSequence); + else + cutSeqFromRead(bal_seq,rdArray[i].len,readLeft-cmpLen,readLeft-1,slowSequence); + match = compareSequences(fastSequence,slowSequence, cmpLen, 
cmpLen); + alignLen += cmpLen; + matchLen += match; + + //compare the right of hit kmer on ctg2 + //ctgRight = contig_array[ctg2->ctgID].length+originOverlap-endOnCtg2-overlap; + ctgRight = lenCtg2-endOnCtg2-overlap; + cmpLen = ctgRight<(rdArray[i].len-end-1) ? ctgRight:(rdArray[i].len-end-1); + cmpLen = cmpLen<=MAXREADLENGTH ? cmpLen:MAXREADLENGTH; + //cutSeqFromCtg(ctg2->ctgID,endOnCtg2+overlap,endOnCtg2+overlap+cmpLen-1,fastSequence,originOverlap); + cutSeqFromRead(seqCtg2,lenCtg2,endOnCtg2+overlap,endOnCtg2+overlap+cmpLen-1,fastSequence); + if(!bal) + cutSeqFromRead(src_seq,rdArray[i].len,end+1,end+cmpLen,slowSequence); + else + cutSeqFromRead(bal_seq,rdArray[i].len,end+1,end+cmpLen,slowSequence); + match = compareSequences(fastSequence,slowSequence, cmpLen, cmpLen); + alignLen += cmpLen; + matchLen += match; + /* + if(cmpLen>0&&match!=cmpLen+overlap){ + printSeq(stderr,fastSequence,cmpLen+overlap); + printSeq(stderr,slowSequence,cmpLen+overlap); + printKmer(stderr,kmerCtg2[endOnCtg2],overlap); + fprintf(stderr,": %d(%d)\n",bal,endOnCtg2); + }else if(cmpLen>0&&match==cmpLen+overlap) + fprintf(stderr,"Perfect\n"); + */ + double score = (double)matchLen/alignLen; + if(maxScore0.0) + fprintf(stderr,"SCORE: %4.2f\n",maxScore); + */ + if(maxScore>0.9){ + /* + for(i=0;i0 ? offset1-(len1-INDEX1[maxIndex]-1):0; + int rightRemain = offset2-(overlap+INDEX2[maxIndex])>0 ? 
offset2-(overlap+INDEX2[maxIndex]):0; + + ctg1->gapSeqOffset = gapSeqArray->item_c; + ctg1->gapSeqLen = END[maxIndex]-START[maxIndex]+leftRemain+rightRemain; + if(darrayPut(gapSeqArray,ctg1->gapSeqOffset+(END[maxIndex]-START[maxIndex]+leftRemain+rightRemain)/4)){ + pt = (char *)darrayPut(gapSeqArray,ctg1->gapSeqOffset); + for(j=0;jcutTail=len1-INDEX1[maxIndex]-1-offset1+cut1>cut1 ?len1-INDEX1[maxIndex]-1-offset1+cut1:cut1; + ctg2->cutHead=overlap+INDEX2[maxIndex]-offset2+cut2>cut2 ?overlap+INDEX2[maxIndex]-offset2+cut2:cut2; + ctg2->scaftig_start = 0; + ret = 1; + } + } + free((void*)START); + free((void*)END); + free((void*)INDEX1); + free((void*)INDEX2); + free((void*)SCORE); + free((void*)BAL); + + free((void*)src_seq); + free((void*)bal_seq); + return ret; +} + diff --git a/fusion/main.c b/fusion/main.c new file mode 100755 index 0000000..ddd9ee6 --- /dev/null +++ b/fusion/main.c @@ -0,0 +1,163 @@ +#include "stdinc.h" +#include "newhash.h" +#include "extfunc.h" +#include "global.h" + + +extern int call_scaffold(); +extern int call_align(); +extern int call_bundle(); +extern int data_prepare(); + +#define MAPPING 0 +#define SCAFF 1 +#define BUNDLE 2 +#define PREPARE 3 +#define POTENT 4 +static void usage(); +int main(int argc, char **argv) +{ + printf("Mapping & Scaffolding module.\n"); + + if(argc==1){ + usage(); + return 0; + } + int c=0; + int inpseq, outseq; + //char optarg[256]; + int mode =-1; + + //char temp[100]; + while((c=getopt(argc,argv,"s:g:p:L:t:i:u:c:P:K:MSBDO"))!=EOF){ + switch(c){ + case 'M': + mode=MAPPING; + break; + case 'S': + mode=SCAFF; + break; + case 'B': + mode=BUNDLE; + break; + case 'D': + mode=PREPARE; + break; + case 'O': + mode=POTENT; + break; + case 's': + inpseq = 1; + shortrdsfile=(char *)ckalloc(256*sizeof(char)); + strcpy(shortrdsfile,optarg); + break; + case 'g': + outseq = 1; + graphfile=(char *)ckalloc(256*sizeof(char)); + strcpy(graphfile,optarg); + break; + case 'p': + thrd_num = atoi(optarg); + break; + case 'L': + 
ctg_short = atoi(optarg); + break; + case 'P': + OverlapPercent = atof (optarg); + break; + case 't': + close_threshold = atof (optarg); + break; + case 'i': + ins_size_var = atoi (optarg); + break; + case 'u': + bund_threshold = atoi (optarg); + break; + case 'c': + ctg_file = (char *)ckalloc(256*sizeof(char)); + strcpy(ctg_file,optarg); + break; + case 'K': + overlaplen = atoi(optarg); + break; + case 'h': + usage(); + break; + case '?': + usage(); + exit(1); + default: + usage(); + exit(1); + } + } + + if(mode==-1){ + usage(); + exit(1); + }else if(mode==MAPPING){ + printf("[%s]Mapping mode selected .\n",__FUNCTION__); + if(outseq==0||inpseq==0){ + usage(); + exit(1); + } + + call_align(); + }else if(mode==SCAFF){ + printf("[%s]Scaffolding mode selected .\n",__FUNCTION__); + if(outseq==0){ + usage(); + exit(1); + } + call_scaffold(); + }else if(mode==BUNDLE){ + printf("[%s]Bundling mode selected .\n",__FUNCTION__); + if(outseq==0){ + usage(); + exit(1); + } + call_bundle(); + }else if(mode==PREPARE){ + printf("[%s]Data prepare mode selected .\n",__FUNCTION__); + if(outseq==0||ctg_file==NULL){ + usage(); + exit(1); + } + data_prepare(); + }else if(mode==POTENT){ + printf("[%s]Potential analysis mode selected .\n",__FUNCTION__) ; + if(outseq==NULL){ + usage(); + exit(1); + } + potential(); + } + + return 0; +} + +static void usage(){ + printf("parameters:\n"); + printf("global:\n"); + printf("-s\tLibrary file.\n"); + printf("-g\tPrefix of input files.\n"); + printf("-p\tThreads.\n\n"); + printf("Data prepare mode:\n"); + printf("-D\tEnable this mode.\n"); + printf("-K\tKmer.\n"); + printf("-c\tInput contig file.(can't be name prefix.contig)\n\n"); + printf("Mapping mode:\n"); + printf("-M\tEnable this mode.\n\n"); + printf("Bundling mode.\n"); + printf("-B\tEnable this mode.\n"); + printf("-u\tWeight threshold for outputting bundle file.(default 3)\n\n"); + printf("Potential analysis mode.\n"); + printf("-O\tEnable this mode.\n"); + printf("Scaffolding mode:\n"); 
+ printf("-S\tEnable this mode.\n"); + printf("-L\tthreshold for minimum length of contig(default K+2).\n"); + printf("-P\tOverlap percent threshold for a subgraph(default 0.075).\n"); + printf("-t\tOverlap percent threshold for a PE(default 0.2).\n"); + printf("-i\tOverlap length threshold for remove transitive connect(default 20).\n"); +} diff --git a/fusion/map.c b/fusion/map.c new file mode 100755 index 0000000..8c46099 --- /dev/null +++ b/fusion/map.c @@ -0,0 +1,42 @@ +#include "stdinc.h" +#include "newhash.h" +#include "extfunc.h" +#include "extvab.h" + +//static void initenv(int argc, char **argv); + + +static void display_map_usage(); + +int call_align() +{ + time_t start_t,stop_t,time_bef,time_aft; + time(&start_t); + + + time(&time_bef); + ctg_short = overlaplen+2; + //printf("contig len cutoff: %d\n",ctg_short); + prlContig2nodes(graphfile,ctg_short); + time(&time_aft); + //printf("time spent on De bruijn graph construction: %ds\n\n", + // (int)(time_aft-time_bef)); + //map read to edge one by one + //printf("All contigs loaded"); + time(&time_bef); + prlLongRead2Ctg(shortrdsfile,graphfile); + time(&time_aft); + //printf("time spent on mapping long reads: %ds\n\n",(int)(time_aft-time_bef)); + + time(&time_bef); + prlRead2Ctg(shortrdsfile,graphfile); + time(&time_aft); + //printf("time spent on mapping reads: %ds\n\n",(int)(time_aft-time_bef)); + + free_Sets(KmerSets,thrd_num); + + time(&stop_t); + //printf("overall time for alignment: %dm\n\n",(int)(stop_t-start_t)/60); + printf("[%s]total time on mapping reads to contig :%dm\n",__FUNCTION__,(int)(stop_t-start_t)/60); + return 0; +} diff --git a/fusion/mem_manager.c b/fusion/mem_manager.c new file mode 100755 index 0000000..77f8024 --- /dev/null +++ b/fusion/mem_manager.c @@ -0,0 +1,89 @@ +#include "stdinc.h" +#include "newhash.h" +#include "extfunc.h" +#include "extvab.h" + +MEM_MANAGER *createMem_manager(int num_items,size_t unit_size) +{ + MEM_MANAGER *mem_Manager = (MEM_MANAGER 
*)ckalloc(1*sizeof(MEM_MANAGER)); + + mem_Manager->block_list = NULL; + mem_Manager->items_per_block = num_items; + mem_Manager->item_size = unit_size; + mem_Manager->recycle_list = NULL; + mem_Manager->counter = 0; + return mem_Manager; +} + +void freeMem_manager(MEM_MANAGER *mem_Manager) +{ + BLOCK_START *ite_block,*temp_block; + + if(!mem_Manager) + return; + + ite_block = mem_Manager->block_list; + while(ite_block){ + temp_block = ite_block; + ite_block = ite_block->next; + free((void *)temp_block); + } + + free((void *)mem_Manager); +} + +void *getItem(MEM_MANAGER *mem_Manager) +{ + RECYCLE_MARK *mark; //this is the type of return value + BLOCK_START *block; + + if(!mem_Manager) + return NULL; + + if(mem_Manager->recycle_list){ + mark = mem_Manager->recycle_list; + mem_Manager->recycle_list = mark->next; + return mark; + } + mem_Manager->counter++; + if(!mem_Manager->block_list||mem_Manager->index_in_block==mem_Manager->items_per_block){ + //pthread_mutex_lock(&gmutex); + block = ckalloc(sizeof(BLOCK_START)+mem_Manager->items_per_block*mem_Manager->item_size); + //mem_Manager->counter += sizeof(BLOCK_START)+mem_Manager->items_per_block*mem_Manager->item_size; + //pthread_mutex_unlock(&gmutex); + block->next = mem_Manager->block_list; + mem_Manager->block_list = block; + mem_Manager->index_in_block = 1; + return (RECYCLE_MARK *)((void *)block+sizeof(BLOCK_START)); + } + + block = mem_Manager->block_list; + return (RECYCLE_MARK *)((void *)block+sizeof(BLOCK_START)+mem_Manager->item_size*(mem_Manager->index_in_block++)); + +} + +void returnItem(MEM_MANAGER *mem_Manager,void *item) +{ + RECYCLE_MARK *mark; + + mark = item; + + mark->next = mem_Manager->recycle_list; + mem_Manager->recycle_list = mark; + +} + +/* +void test_mem_manager() +{ + MEM_MANAGER *test_manager; + NODE *temp_node; + + test_manager = createMem_manager(NODEBLOCKSIZE,sizeof(NODE)); + temp_node = (NODE *)getItem(test_manager); + returnItem(test_manager,temp_node); + + 
freeMem_manager(test_manager); +} +*/ + diff --git a/fusion/newhash.c b/fusion/newhash.c new file mode 100755 index 0000000..b568afd --- /dev/null +++ b/fusion/newhash.c @@ -0,0 +1,465 @@ +#include "stdinc.h" +#include "newhash.h" +#include "extfunc.h" +#include "extvab.h" + +#define PUBLIC_FUNC +#define PROTECTED_FUNC + +static const kmer_t empty_kmer = {0, 0, 0, 0, 0, 0, 1, 0,0}; + +static inline void update_kmer(kmer_t *mer, ubyte left, ubyte right){ + ubyte4 cov; + + if(left<4){ + cov = get_kmer_left_cov(*mer, left); + if(cov < MAX_KMER_COV){ + set_kmer_left_cov(*mer, left, cov + 1); + } + } + + if(right<4){ + cov = get_kmer_right_cov(*mer, right); + if(cov < MAX_KMER_COV){ + set_kmer_right_cov(*mer, right, cov + 1); + } + } +} + +static inline void set_new_kmer(kmer_t *mer, ubyte8 seq, ubyte left, ubyte right){ + *mer = empty_kmer; + set_kmer_seq(*mer, seq); + if(left<4) + set_kmer_left_cov(*mer, left, 1); + if(right<4) + set_kmer_right_cov(*mer, right, 1); +} + + +static inline int is_prime_kh(ubyte8 num){ + ubyte8 i, max; + if(num < 4) return 1; + if(num % 2 == 0) return 0; + max = (ubyte8)sqrt((float)num); + for(i=3;isize = init_size; + set->count = 0; + + set->searchCnt = 0; + set->foundCnt = 0; + set->delCnt = 0; + set->searchSpcSeedCnt = 0; + set->getSpcSeedCnt = 0; + set->levelGet[0] = 0; + set->levelGet[1] = 0; + set->levelGet[2] = 0; + + set->max = set->size * load_factor; + if(load_factor <= 0) load_factor = 0.25f; + else if(load_factor >= 1) load_factor = 0.75f; + set->load_factor = load_factor; + set->iter_ptr = 0; + set->array = calloc(set->size, sizeof(kmer_t)); + set->flags = malloc((set->size + 15)/16 * 4); + memset(set->flags, 0x55, (set->size + 15) / 16 * 4); + return set; +} + +PROTECTED_FUNC static inline ubyte8 get_kmerset(KmerSet *set, ubyte8 seq){ + ubyte8 hc; + hc = seq % set->size; + while(1){ + if(is_kmer_entity_null(set->flags, hc)){ + return hc; + } else { + if(get_kmer_seq(set->array[hc]) == seq) return hc; + } + hc ++; + if(hc == 
set->size) hc = 0; + } + return set->size; +} + +PUBLIC_FUNC int search_kmerset(KmerSet *set, ubyte8 seq, kmer_t **rs){ + ubyte8 hc; + hc = seq % set->size; + while(1){ + if(is_kmer_entity_null(set->flags, hc)){ + return 0; + } else { + if(get_kmer_seq(set->array[hc]) == seq){ + *rs = set->array + hc; + return 1; + } + } + hc ++; + if(hc == set->size) hc = 0; + } + return 0; +} + +PUBLIC_FUNC static inline int exists_kmerset(KmerSet *set, ubyte8 seq){ + ubyte8 idx; + idx = get_kmerset(set, seq); + return !is_kmer_entity_null(set->flags, idx); +} + +PROTECTED_FUNC static inline void encap_kmerset(KmerSet *set, ubyte8 num){ + ubyte4 *flags, *f; + ubyte8 i, n, size, hc; + kmer_t key, tmp; + if(set->count + num <= set->max) return; + n = set->size; + do{ + if(n < 0xFFFFFFFU) + n <<= 1; + else + n += 0xFFFFFFU; + n = find_next_prime_kh(n); + } while(n * set->load_factor < set->count + num); + + set->array = realloc(set->array, n * sizeof(kmer_t)); + if(set->array == NULL){ + fprintf(stderr, "-- Out of memory --\n"); + abort(); + } + flags = malloc((n+15)/16 * 4); + memset(flags, 0x55, (n+15)/16 * 4); + size = set->size; + set->size = n; + set->max = n * set->load_factor; + f = set->flags; + set->flags = flags; + flags = f; + for(i=0;iarray[i]; + set_kmer_entity_del(flags, i); + while(1){ + hc = get_kmer_seq(key) % set->size; + while(!is_kmer_entity_null(set->flags, hc)){ hc ++; if(hc == set->size) hc = 0; } + clear_kmer_entity_null(set->flags, hc); + if(hc < size && exists_kmer_entity(flags, hc)){ + tmp = key; + key = set->array[hc]; + set->array[hc] = tmp; + set_kmer_entity_del(flags, hc); + } else { + set->array[hc] = key; + break; + } + } + } + free(flags); +} + +PUBLIC_FUNC int put_kmerset(KmerSet *set, ubyte8 seq, ubyte left, ubyte right, kmer_t **kmer_p){ + ubyte8 hc; + encap_kmerset(set, 1); + hc = seq % set->size; + do{ + if(is_kmer_entity_null(set->flags, hc)){ + clear_kmer_entity_null(set->flags, hc); + set_new_kmer(set->array + hc, seq, left, right); + 
set->count ++; + *kmer_p = set->array + hc; + return 0; + } else { + if(get_kmer_seq(set->array[hc]) == seq){ + update_kmer(set->array + hc, left, right); + set->array[hc].single = 0; + *kmer_p = set->array + hc; + return 1; + } + } + hc ++; + if(hc == set->size) hc = 0; + } while(1); + *kmer_p = NULL; + return 0; +} + +PUBLIC_FUNC byte8 count_kmerset(KmerSet *set){ return set->count; } + +PUBLIC_FUNC static inline void reset_iter_kmerset(KmerSet *set){ set->iter_ptr = 0; } + +PUBLIC_FUNC static inline ubyte8 iter_kmerset(KmerSet *set, kmer_t **rs){ + while(set->iter_ptr < set->size){ + if(!is_kmer_entity_null(set->flags, set->iter_ptr)){ + *rs = set->array + set->iter_ptr; + set->iter_ptr ++; + return 1; + } + set->iter_ptr ++; + } + return 0; +} + +PUBLIC_FUNC void free_kmerset(KmerSet *set){ + free(set->array); + free(set->flags); + free(set); +} + +PUBLIC_FUNC void free_Sets(KmerSet **sets,int num){ + int i; + for(i=0;i0) + num++; + } + return num; +} + +int count_branch2next(kmer_t *node) +{ + int num = 0,i; + + for(i=0;i<4;i++){ + if(get_kmer_right_cov(*node,i)>0) + num++; + } + return num; +} + +void dislink2prevUncertain(kmer_t *node,char ch,boolean smaller) +{ + if(smaller) + set_kmer_left_cov(*node,ch,0); + else + set_kmer_right_cov(*node,int_comp(ch),0); + +} + +void dislink2nextUncertain(kmer_t *node,char ch,boolean smaller) +{ + if(smaller) + set_kmer_right_cov(*node,ch,0); + else + set_kmer_left_cov(*node,int_comp(ch),0); +} + + + + + + +////////////////// functions for spaced seed Kmer hash + +static const spcKmer empty_spckmer = {0, NULL, 1}; + +static inline int update_spckmer(spcKmer *mer, ubyte2 s_bases, kmer_t *node){ +// if(mer->start == NULL) +// fprintf(stderr, "start err at:\t%llu\n",mer->seq); + + spcBase *tmpBase=mer->start; + + spcBase *newSpcBase; + newSpcBase = (spcBase*)malloc(sizeof(spcBase)); + newSpcBase->spaced_bases = s_bases; +// newSpcBase->edgeID = edgeID; + newSpcBase->large_kmer = node; + newSpcBase->next = tmpBase->next; + 
tmpBase->next = newSpcBase; + + mer->spaced_base_num++; + +// mvnv(0,"update %llu :\t%hu\tnum: %u\n", mer->seq, tmpBase->next->spaced_bases, mer->spaced_base_num); + return 0; +} + +static inline void set_new_spckmer(spcKmer *mer, Kmer spc_kmer, ubyte2 s_bases, kmer_t *node){ + *mer = empty_spckmer; + set_kmer_seq(*mer, spc_kmer); + + spcBase *newSpcBase; + newSpcBase = (spcBase*)malloc(sizeof(spcBase)); + newSpcBase->spaced_bases = s_bases; +// newSpcBase->repeat = 0; +// newSpcBase->edgeID = edgeID; + newSpcBase->large_kmer = node; + newSpcBase->next = NULL; + + mer->start = newSpcBase; + +// mvnv(0,"new %llu :\t%hu\n", mer->seq, mer->start->spaced_bases) + +} + +PUBLIC_FUNC spcKmerSet* init_spckmerset(ubyte8 init_size, float load_factor){ + spcKmerSet *set; + if(init_size < 3) init_size = 3; + else init_size = find_next_prime_kh(init_size); + + set = (spcKmerSet*)malloc(sizeof(spcKmerSet)); + set->size = init_size; + set->count = 0; + set->max = set->size * load_factor; + if(load_factor <= 0) load_factor = 0.25f; + else if(load_factor >= 1) load_factor = 0.75f; + set->load_factor = load_factor; + //set->iter_ptr = 0; + set->array = calloc(set->size, sizeof(spcKmer)); + set->flags = malloc((set->size + 15)/16 * 4); + memset(set->flags, 0x55, (set->size + 15) / 16 * 4); + return set; +} + +PUBLIC_FUNC int search_spckmerset(spcKmerSet *set, ubyte8 seq, spcKmer **rs){ + ubyte8 hc; + hc = seq % set->size; + while(1){ + if(is_kmer_entity_null(set->flags, hc)){ + return 0; + } else { + if(get_kmer_seq(set->array[hc]) == seq){ + *rs = set->array + hc; + return 1; + } + } + hc ++; + if(hc == set->size) hc = 0; + } + return 0; +} + +PROTECTED_FUNC static inline void encap_spckmerset(spcKmerSet *set, ubyte8 num){ + ubyte4 *flags, *f; + ubyte8 i, n, size, hc; + spcKmer key, tmp; + if(set->count + num <= set->max) return; + + n = set->size; + do{ + if(n < 0xFFFFFFFU) + n <<= 1; + else + n += 0xFFFFFFU; + n = find_next_prime_kh(n); + } while(n * set->load_factor < set->count 
+ num); + + set->array = realloc(set->array, n * sizeof(spcKmer)); + if(set->array == NULL){ + fprintf(stderr, "-- Out of memory --\n"); + abort(); + } + flags = malloc((n+15)/16 * 4); + memset(flags, 0x55, (n+15)/16 * 4); + size = set->size; + set->size = n; + set->max = n * set->load_factor; + f = set->flags; + set->flags = flags; + flags = f; + for(i=0;iarray[i]; + set_kmer_entity_del(flags, i); + while(1){ + hc = get_kmer_seq(key) % set->size; + while(!is_kmer_entity_null(set->flags, hc)){ hc ++; if(hc == set->size) hc = 0; } + clear_kmer_entity_null(set->flags, hc); + if(hc < size && exists_kmer_entity(flags, hc)){ + tmp = key; + key = set->array[hc]; + set->array[hc] = tmp; + set_kmer_entity_del(flags, hc); + } else { + set->array[hc] = key; + break; + } + } + } + free(flags); +} + +PUBLIC_FUNC int put_spckmerset(spcKmerSet *set, Kmer spc_kmer, ubyte2 spaced_bases, kmer_t *node){ + ubyte8 hc; + encap_spckmerset(set, 1); + hc = spc_kmer % set->size; + do{ + if(is_kmer_entity_null(set->flags, hc)){ //new! repeat_flag==0 + clear_kmer_entity_null(set->flags, hc); + set_new_spckmer(set->array + hc, spc_kmer, spaced_bases, node); + set->count ++; + return 0; + } else { + if(get_kmer_seq(set->array[hc]) == spc_kmer){ //exists! repeat_flag==1 or 0 + return update_spckmer(set->array + hc, spaced_bases, node); + } + } + hc ++; + if(hc == set->size) hc = 0; + } while(1); + return 3; +} + +PUBLIC_FUNC void buildSpcKmerSet(KmerSet *set, spcKmerSet *spaced_kset) +{ + boolean spcFlag; + Kmer buff_kmer, spc_kmer; + ubyte2 spc_bases; + + ubyte8 i=0,j=0; + for(i=0;isize;i++) + { + if(is_kmer_entity_null(set->flags, i)) + continue; + else + { +// kmer_t **kmer_p; +// *kmer_p = set->array+i; + if(set->array[i].deleted != 1) //kmer not repeat + { + //spaced seed: 18 of 25, build masker and use >>,&,| for each part, only assign once + // 1 1111 1010 1100 1111 1101 0110 !!!OLD!!! + // 1 1111 1111 1111 1010 1100 1000 !!!NEW!!! 
+ // 11 11111111 11111111 11111111 11001100 11110000 11000000 !!!NEW!!! + + buff_kmer = get_kmer_seq(set->array[i]); + + spc_kmer = ((buff_kmer>>14)&0xFFFFFFF00) | ((buff_kmer>>12)&0xC0) | ((buff_kmer>>10)&0x3C) | ((buff_kmer>>6)&0x3); + //0xFFFFFFF00 = 1111 11111111 11111111 11111111 00000000 + // 0xC0 = 0000 00000000 00000000 00000000 11000000 + // 0x3C = 0000 00000000 00000000 00000000 00111100 + // 0x3 = 0000 00000000 00000000 00000000 00000011 + + spc_bases = ((buff_kmer>>8)&0x3000) | ((buff_kmer>>6)&0xC00) | ((buff_kmer>>2)&0x3C0) | (buff_kmer&0x3F); + // 0x3000 = 110000 00000000 + // 0xC00 = 001100 00000000 + // 0x3C0 = 000011 11000000 + // 0x3F = 000000 00111111 + + //build the 18mer and the spaced bases(7mer), put them in the spaced_kmer hash + spcFlag = put_spckmerset(spaced_kset, spc_kmer, spc_bases, set->array+i); + if(spcFlag!=0) + fprintf(stderr, "flag error: %c\tkmer exists: %llu %hu\n", spcFlag, spc_kmer, spc_bases); +// if((++j)%100000==0) +// fprintf(stderr,"--- %lluth spaced Kmer built\n",j); + } + + } + } + //fprintf(stderr,"--- total %llu spaced Kmer built in a KmerSet\n",j); +} diff --git a/fusion/orderContig.c b/fusion/orderContig.c new file mode 100755 index 0000000..b81fa18 --- /dev/null +++ b/fusion/orderContig.c @@ -0,0 +1,3485 @@ +#include "stdinc.h" +#include "newhash.h" +#include "extfunc.h" +#include "extvab.h" +#include "dfibHeap.h" +#include "fibHeap.h" +#include "darray.h" + +#define CNBLOCKSIZE 10000 +#define MAXC 10000 +#define MAXCinBetween 200 + +#define MaxNodeInSub 10000 +#define GapLowerBound -2000 +#define GapUpperBound 300000 + +//static boolean static_f=0; + + +static int gapCounter; +static int orienCounter; +static int throughCounter; + +static DARRAY *solidArray; +static DARRAY *tempArray; + +static int solidCounter; + +static CTGinHEAP ctg4heapArray[MaxNodeInSub+1]; // index in this array are put to heaps, start from 1 +static unsigned int nodesInSub[MaxNodeInSub]; +static int nodeDistance[MaxNodeInSub]; +static int 
nodeCounter; + +static unsigned int nodesInSubInOrder[MaxNodeInSub]; +static int nodeDistanceInOrder[MaxNodeInSub]; + +static DARRAY *scaf3,*scaf5; +static DARRAY *gap3,*gap5; + +static unsigned int downstreamCTG[MAXCinBetween]; +static unsigned int upstreamCTG[MAXCinBetween]; +static int dsCtgCounter; +static int usCtgCounter; + +static CONNECT *checkConnect(unsigned int from_c,unsigned int to_c); +static int maskPuzzle(int num_connect,unsigned int contigLen); +static void freezing(); +static boolean checkOverlapInBetween(double tolerance); +static int setConnectDelete(unsigned int from_c,unsigned int to_c,char flag,boolean cleanBinding); +static int setConnectWP(unsigned int from_c,unsigned int to_c,char flag); + +static void general_linearization(boolean strict); +static void debugging2(); +static void smallScaf(); +static void detectBreakScaf(); +static boolean checkSimple(DARRAY *ctgArray,int count); +static void checkCircle(); + +//find the only connection involved in connection binding +static CONNECT *getBindCnt(unsigned int ctg) +{ + CONNECT *ite_cnt; + CONNECT *bindCnt=NULL; + CONNECT *temp_cnt=NULL; + CONNECT *temp3_cnt=NULL; + int count = 0; + int count2 = 0; + int count3 = 0; + + ite_cnt = contig_array[ctg].downwardConnect; + while(ite_cnt){ + if(ite_cnt->nextInScaf){ + count++; + bindCnt = ite_cnt; + } + if(ite_cnt->prevInScaf){ + temp_cnt = ite_cnt; + count2++; + } + if(ite_cnt->singleInScaf){ + temp3_cnt = ite_cnt; + count3++; + } + ite_cnt = ite_cnt->next; + } + if(count==1) + return bindCnt; + + if(count==0&&count2==1) + return temp_cnt; + if(count==0&&count2==0&&count3==1) + return temp3_cnt; + return NULL; +} + +static void createAnalogousCnt(unsigned int sourceStart, + CONNECT *originCnt, int gap, + unsigned int targetStart,unsigned int targetStop) +{ + CONNECT *temp_cnt; + unsigned int balTargetStart=getTwinCtg(targetStart); + unsigned int balTargetStop=getTwinCtg(targetStop); + + unsigned int balSourceStart = getTwinCtg(sourceStart); + 
unsigned int balSourceStop = getTwinCtg(originCnt->contigID); + + originCnt->deleted = 1; + temp_cnt = getCntBetween(balSourceStop,balSourceStart); + temp_cnt->deleted = 1; + + if(gapweight,1); + if(temp_cnt) + temp_cnt->inherit = 1; + temp_cnt = add1Connect(balTargetStop,balTargetStart,gap,originCnt->weight,1); + if(temp_cnt) + temp_cnt->inherit = 1; +} +// increase #long_pe_support for a conncet by 1 +static void add1LongPEcov(unsigned int fromCtg,unsigned int toCtg,int weight) +{ + //check if they are on the same scaff + if(contig_array[fromCtg].from_vt!=contig_array[toCtg].from_vt || + contig_array[fromCtg].to_vt!=contig_array[toCtg].to_vt){ + printf("Warning from add1LongPEcov: contig %d and %d not on the same scaffold\n", + fromCtg,toCtg); + return; + } + if(contig_array[fromCtg].indexInScaf>=contig_array[toCtg].indexInScaf){ + printf("Warning from add1LongPEcov: wrong about order between contig %d and %d\n", + fromCtg,toCtg); + return; + } + CONNECT *bindCnt; + unsigned int prevCtg = fromCtg; + bindCnt = getBindCnt(fromCtg); + while(bindCnt){ + if(bindCnt->maxGap + weight<=1000) + bindCnt->maxGap += weight; + else + bindCnt->maxGap = 1000; + + if(fromCtg==0&&toCtg==0) + printf("link (%d %d ) covered by link (%d %d), wt %d\n", + prevCtg,bindCnt->contigID,fromCtg,toCtg,weight); + if(bindCnt->contigID==toCtg) + break; + prevCtg = bindCnt->contigID; + bindCnt = bindCnt->nextInScaf; + } + unsigned int bal_fc = getTwinCtg(fromCtg); + unsigned int bal_tc = getTwinCtg(toCtg); + bindCnt = getBindCnt(bal_tc); + prevCtg = bal_tc; + while(bindCnt){ + if(bindCnt->maxGap + weight<=1000) + bindCnt->maxGap += weight; + else + bindCnt->maxGap = 1000; + if(fromCtg==0&&toCtg==0) + printf("link (%d %d ) covered by link (%d %d), wt %d\n", + prevCtg,bindCnt->contigID,fromCtg,toCtg,weight); + if(bindCnt->contigID==bal_fc) + return; + prevCtg = bindCnt->contigID; + bindCnt = bindCnt->nextInScaf; + } + printf("Warning from add1LongPEcov: not reach the end (%d %d) 
(B)\n",bal_tc,bal_fc); +} + +// for long pair ends, move the connections along scaffolds established by shorter pair ends till reach the ends +static void downSlide() +{ + fprintf(stderr,"[%s]entering this function.\n",__FUNCTION__); + int len=0,gap; + unsigned int i; + CONNECT *ite_cnt,*bindCnt,*temp_cnt; + unsigned int bottomCtg,topCtg,bal_i; + unsigned int targetCtg,bal_target; + boolean getThrough,orienConflict; + int slideLen,slideLen2; + + orienCounter = throughCounter = 0; + for(i=1;i<=num_ctg;i++){ + if(contig_array[i].mask||!contig_array[i].downwardConnect) + continue; + bindCnt = getBindCnt(i); + if(!bindCnt) + continue; + bal_i = getTwinCtg(i); + len = slideLen = 0; + bottomCtg = i; + + //find the last unmasked contig in this binding + while(bindCnt->nextInScaf){ + len += bindCnt->gapLen + contig_array[bindCnt->contigID].length; + if(contig_array[bindCnt->contigID].mask==0){ + bottomCtg = bindCnt->contigID; + slideLen = len; + } + bindCnt = bindCnt->nextInScaf; + } + len += bindCnt->gapLen + contig_array[bindCnt->contigID].length; + + if(contig_array[bindCnt->contigID].mask==0||bottomCtg==0){ + bottomCtg = bindCnt->contigID; + slideLen = len; + } + //check each connetion from long pair ends + ite_cnt = contig_array[i].downwardConnect; + while(ite_cnt){ + if(ite_cnt->deleted||ite_cnt->mask||ite_cnt->singleInScaf + ||ite_cnt->nextInScaf||ite_cnt->prevInScaf||ite_cnt->inherit){ + ite_cnt = ite_cnt->next; + continue; + } + targetCtg = ite_cnt->contigID; + if(contig_array[i].from_vt==contig_array[targetCtg].from_vt){ // on the same scaff + if(contig_array[i].indexInScaf>contig_array[targetCtg].indexInScaf) + orienCounter++; + else + throughCounter++; + + setConnectDelete(i,ite_cnt->contigID,1,0); + ite_cnt = ite_cnt->next; + continue; + + } + //check if this connection conflicts with previous scaffold orientationally + temp_cnt = getBindCnt(targetCtg); + orienConflict = 0; + if(temp_cnt){ + while(temp_cnt->nextInScaf){ + if(temp_cnt->contigID==i){ + 
orienConflict = 1; + printf("Warning from downSlide: still on the same scaff: %d and %d\n" + ,i,targetCtg); + printf("on scaff %d and %d\n", + contig_array[i].from_vt,contig_array[targetCtg].from_vt); + printf("on bal_scaff %d and %d\n", + contig_array[bal_target].to_vt,contig_array[bal_i].to_vt); + break; + } + temp_cnt = temp_cnt->nextInScaf; + } + if(temp_cnt->contigID==i) + orienConflict = 1; + } + if(orienConflict){ + orienCounter++; + setConnectDelete(i,ite_cnt->contigID,1,0); + ite_cnt = ite_cnt->next; + continue; + } + //find the most top contig along previous scaffold starting with the target contig of this connection + bal_target = getTwinCtg(targetCtg); + slideLen2 = 0; + if(contig_array[targetCtg].mask==0){ + topCtg = bal_target; + }else{ + topCtg = 0; + } + + temp_cnt = getBindCnt(bal_target); + getThrough = len = 0; + if(temp_cnt){ + //find the last contig in this binding + while(temp_cnt->nextInScaf){ + //check if this route reaches bal_i + if(temp_cnt->contigID==bal_i){ + printf("Warning from downSlide: (B) still on the same scaff: %d and %d (%d and %d)\n", + i,targetCtg,bal_target,bal_i); + printf("on scaff %d and %d\n", + contig_array[i].from_vt,contig_array[targetCtg].from_vt); + printf("on bal_scaff %d and %d\n", + contig_array[bal_target].to_vt,contig_array[bal_i].to_vt); + getThrough = 1; + break; + } + len += temp_cnt->gapLen + contig_array[temp_cnt->contigID].length; + if(contig_array[temp_cnt->contigID].mask==0){ + topCtg = temp_cnt->contigID; + slideLen2 = len; + } + temp_cnt = temp_cnt->nextInScaf; + } + len += temp_cnt->gapLen + contig_array[temp_cnt->contigID].length; + if(contig_array[temp_cnt->contigID].mask==0||topCtg==0){ + topCtg = temp_cnt->contigID; + slideLen2 = len; + } + if(temp_cnt->contigID==bal_i) + getThrough = 1; + else + topCtg = getTwinCtg(topCtg); + }else + topCtg = targetCtg; + + if(getThrough){ + throughCounter++; + setConnectDelete(i,ite_cnt->contigID,1,0); + ite_cnt = ite_cnt->next; + continue; + } + //add a 
connection between bottomCtg and topCtg + gap = ite_cnt->gapLen - slideLen - slideLen2; + if(bottomCtg!=topCtg&&!(i==bottomCtg&&targetCtg==topCtg)){ + createAnalogousCnt(i,ite_cnt,gap,bottomCtg,topCtg); + if(contig_array[bottomCtg].mask||contig_array[topCtg].mask) + printf("downSlide to masked contig\n"); + } + ite_cnt = ite_cnt->next; + } //for each connect + } // for each contig + //printf("downSliding is done...orienConflict %d, fall inside %d\n", + // orienCounter,throughCounter); +} + +static boolean setNextInScaf(CONNECT *cnt, CONNECT *nextCnt) +{ + if(!cnt){ + printf("setNextInScaf: empty pointer\n"); + return 0; + } + if(!nextCnt){ + cnt->nextInScaf = nextCnt; + return 1; + } + if(cnt->mask||cnt->deleted){ + printf("setNextInScaf: cnt is masked or deleted\n"); + return 0; + } + if(nextCnt->deleted||nextCnt->mask){ + printf("setNextInScaf: nextCnt is masked or deleted\n"); + return 0; + } + cnt->nextInScaf = nextCnt; + return 1; +} + +static boolean setPrevInScaf(CONNECT *cnt, boolean flag) +{ + if(!cnt){ + printf("setPrevInScaf: empty pointer\n"); + return 0; + } + if(!flag){ + cnt->prevInScaf = flag; + return 1; + } + if(cnt->mask||cnt->deleted){ + printf("setPrevInScaf: cnt is masked or deleted\n"); + return 0; + } + cnt->prevInScaf = flag; + return 1; +} + +/* +connect A is upstream to B, replace A with C +from_c + > branch_c - to_c +from_c_new +*/ +static void substitueUSinScaf(CONNECT *origin, unsigned int from_c_new) +{ + if(!origin||!origin->nextInScaf) + return; + + unsigned int branch_c, to_c; + unsigned int bal_branch_c, bal_to_c; + unsigned int bal_from_c_new = getTwinCtg(from_c_new); + CONNECT *bal_origin,*bal_nextCNT,*prevCNT,*bal_prevCNT; + + + branch_c = origin->contigID; + to_c = origin->nextInScaf->contigID; + bal_branch_c = getTwinCtg(branch_c); + bal_to_c = getTwinCtg(to_c); + + prevCNT = checkConnect(from_c_new,branch_c); + bal_nextCNT = checkConnect(bal_to_c,bal_branch_c); + if(!bal_nextCNT){ + printf("substitueUSinScaf: no connect 
between %d and %d\n",bal_to_c,bal_branch_c); + return; + } + bal_origin = bal_nextCNT->nextInScaf; + bal_prevCNT = checkConnect(bal_branch_c,bal_from_c_new); + + setPrevInScaf(bal_nextCNT->nextInScaf,0); + setNextInScaf(prevCNT,origin->nextInScaf); + setNextInScaf(bal_nextCNT,bal_prevCNT); + setPrevInScaf(bal_prevCNT,1); + + setNextInScaf(origin,NULL); + setPrevInScaf(bal_origin,0); +} + +/* +connect B is downstream to C, replace B with A + to_c +from_c - branch_c < + to_c_new +*/ +static void substitueDSinScaf(CONNECT *origin, unsigned int branch_c, unsigned int to_c_new) +{ + if(!origin||!origin->prevInScaf) + return; + + unsigned int to_c; + unsigned int bal_branch_c, bal_to_c,bal_to_c_new; + unsigned int from_c,bal_from_c; + CONNECT *bal_origin,*prevCNT,*bal_prevCNT; + CONNECT *nextCNT,*bal_nextCNT; + + + to_c = origin->contigID; + bal_branch_c = getTwinCtg(branch_c); + bal_to_c = getTwinCtg(to_c); + bal_origin = getCntBetween(bal_to_c,bal_branch_c); + if(!bal_origin){ + printf("substitueDSinScaf: no connect between %d and %d\n",bal_to_c,bal_branch_c); + return; + } + bal_from_c = bal_origin->nextInScaf->contigID; + from_c = getTwinCtg(bal_from_c); + bal_to_c_new = getTwinCtg(to_c_new); + + prevCNT = checkConnect(from_c,branch_c); + nextCNT = checkConnect(branch_c,to_c_new); + setNextInScaf(prevCNT,nextCNT); + setPrevInScaf(nextCNT,1); + + bal_nextCNT = checkConnect(bal_to_c_new,bal_branch_c); + bal_prevCNT = checkConnect(bal_branch_c,bal_from_c); + + setNextInScaf(bal_nextCNT,bal_prevCNT); + setPrevInScaf(origin,0); + setNextInScaf(bal_origin,NULL); +} + +static int validConnect(unsigned int ctg, CONNECT *preCNT) +{ + if(preCNT&&preCNT->nextInScaf) + return 1; + + CONNECT *cn_temp; + int count=0; + if(!contig_array[ctg].downwardConnect) + return count; + cn_temp = contig_array[ctg].downwardConnect; + while(cn_temp){ + if(!cn_temp->deleted&&!cn_temp->mask) + count++; + cn_temp = cn_temp->next; + } + return count; +} + +static CONNECT *getNextContig(unsigned int 
ctg, CONNECT *preCNT, boolean *exception) +{ + CONNECT *cn_temp,*retCNT=NULL; + int count=0,valid_in; + unsigned int nextCtg,bal_ctg; + + *exception = 0; + if(preCNT&&preCNT->nextInScaf){ + if(preCNT->contigID!=ctg) + printf("pre cnt does not lead to %d\n",ctg); + nextCtg = preCNT->nextInScaf->contigID; + cn_temp = getCntBetween(ctg,nextCtg); + if(cn_temp&&(cn_temp->mask||cn_temp->deleted)){ + printf("getNextContig: arc(%d %d) twin (%d %d) with mask %d deleted %d\n" + ,ctg,nextCtg,getTwinCtg(nextCtg),getTwinCtg(ctg) + ,cn_temp->mask,cn_temp->deleted); + if(!cn_temp->prevInScaf) + printf("not even has a prevInScaf\n"); + cn_temp = getCntBetween(getTwinCtg(nextCtg), + getTwinCtg(ctg)); + if(!cn_temp->nextInScaf) + printf("its twin cnt not has a nextInScaf\n"); + fflush(stdout); + *exception = 1; + }else + return preCNT->nextInScaf; + } + + bal_ctg = getTwinCtg(ctg); + valid_in = validConnect(bal_ctg,NULL); + if(valid_in>1) + return NULL; + if(!contig_array[ctg].downwardConnect) + return NULL; + cn_temp = contig_array[ctg].downwardConnect; + while(cn_temp){ + if(cn_temp->mask||cn_temp->deleted){ + cn_temp = cn_temp->next; + continue; + } + count++; + if(count==1) + retCNT = cn_temp; + else if(count==2) + return NULL; + cn_temp = cn_temp->next; + } + return retCNT; +} + +// get the valid connect between 2 given ctgs +static CONNECT *checkConnect(unsigned int from_c,unsigned int to_c) +{ + CONNECT *cn_temp=getCntBetween(from_c,to_c); + if(!cn_temp) + return NULL; + if(!cn_temp->mask&&!cn_temp->deleted) + return cn_temp; + return NULL; +} + +static int setConnectMask(unsigned int from_c,unsigned int to_c,char mask) +{ + CONNECT *cn_temp,*cn_bal,*cn_ds,*cn_us; + unsigned int bal_fc = getTwinCtg(from_c); + unsigned int bal_tc = getTwinCtg(to_c); + unsigned int ctg3,bal_ctg3; + + cn_temp = getCntBetween(from_c,to_c); + cn_bal = getCntBetween(bal_tc,bal_fc); + if(!cn_temp||!cn_bal){ + return 0; + } + cn_temp->mask = mask; + cn_bal->mask = mask; + if(!mask) + return 1; + + 
if(cn_temp->nextInScaf){ //undo the binding + setPrevInScaf(cn_temp->nextInScaf,0); + ctg3 = cn_temp->nextInScaf->contigID; + setNextInScaf(cn_temp,NULL); + bal_ctg3 = getTwinCtg(ctg3); + cn_ds = getCntBetween(bal_ctg3,bal_tc); + setNextInScaf(cn_ds,NULL); + setPrevInScaf(cn_bal,0); + } + + // ctg3 -> from_c -> to_c + // bal_ctg3 <- bal_fc <- bal_tc + if(cn_bal->nextInScaf){ + setPrevInScaf(cn_bal->nextInScaf,0); + bal_ctg3 = cn_bal->nextInScaf->contigID; + setNextInScaf(cn_bal,NULL); + ctg3 = getTwinCtg(bal_ctg3); + cn_us = getCntBetween(ctg3,from_c); + setNextInScaf(cn_us,NULL); + setPrevInScaf(cn_temp,0); + } + + return 1; +} + + +static boolean setConnectUsed(unsigned int from_c,unsigned int to_c,char flag) +{ + CONNECT *cn_temp,*cn_bal; + unsigned int bal_fc = getTwinCtg(from_c); + unsigned int bal_tc = getTwinCtg(to_c); + + cn_temp = getCntBetween(from_c,to_c); + cn_bal = getCntBetween(bal_tc,bal_fc); + if(!cn_temp||!cn_bal){ + return 0; + } + cn_temp->used = flag; + cn_bal->used = flag; + + return 1; +} + +static int setConnectWP(unsigned int from_c,unsigned int to_c,char flag) +{ + CONNECT *cn_temp,*cn_bal; + unsigned int bal_fc = getTwinCtg(from_c); + unsigned int bal_tc = getTwinCtg(to_c); + + cn_temp = getCntBetween(from_c,to_c); + cn_bal = getCntBetween(bal_tc,bal_fc); + if(!cn_temp||!cn_bal){ + return 0; + } + cn_temp->weakPoint = flag; + cn_bal->weakPoint = flag; + //fprintf(stderr,"contig %d and %d, weakPoint %d\n",from_c,to_c,cn_temp->weakPoint); + //fprintf(stderr,"contig %d and %d, weakPoint %d\n",bal_tc,bal_fc,cn_bal->weakPoint); + return 1; +} + +static int setConnectDelete(unsigned int from_c,unsigned int to_c,char flag,boolean cleanBinding) +{ + CONNECT *cn_temp,*cn_bal; + unsigned int bal_fc = getTwinCtg(from_c); + unsigned int bal_tc = getTwinCtg(to_c); + + cn_temp = getCntBetween(from_c,to_c); + cn_bal = getCntBetween(bal_tc,bal_fc); + + if(!cn_temp||!cn_bal){ + return 0; + } + cn_temp->deleted = flag; + cn_bal->deleted = flag; + if(!flag) 
+ return 1; + if(cleanBinding){ + cn_temp->prevInScaf = 0; + cn_temp->nextInScaf = NULL; + cn_bal->prevInScaf = 0; + cn_bal->nextInScaf = NULL; + } + return 1; +} + +static void maskContig(unsigned int ctg,boolean flag) +{ + unsigned int bal_ctg,ctg2,bal_ctg2; + CONNECT *cn_temp; + + bal_ctg = getTwinCtg(ctg); + cn_temp = contig_array[ctg].downwardConnect; + while(cn_temp){ + if(cn_temp->mask||cn_temp->prevInScaf||cn_temp->nextInScaf||cn_temp->singleInScaf){ + cn_temp = cn_temp->next; + continue; + } + ctg2 = cn_temp->contigID; + setConnectMask(ctg,ctg2,flag); + cn_temp = cn_temp->next; + } + // bal_ctg2 <- bal_ctg + cn_temp = contig_array[bal_ctg].downwardConnect; + while(cn_temp){ + if(cn_temp->mask||cn_temp->prevInScaf||cn_temp->nextInScaf||cn_temp->singleInScaf){ + cn_temp = cn_temp->next; + continue; + } + bal_ctg2 = cn_temp->contigID; + setConnectMask(bal_ctg,bal_ctg2,flag); + cn_temp = cn_temp->next; + } + + contig_array[ctg].mask = flag; + contig_array[bal_ctg].mask = flag; +} + +static int maskPuzzle(int num_connect,unsigned int contigLen) +{ + int in_num,out_num,flag=0,puzzleCounter=0; + unsigned int i,bal_i; + + for(i=1;i<=num_ctg;i++){ + if(contigLen&&contig_array[i].length>contigLen) + break; + if(contig_array[i].mask) + continue; + bal_i = getTwinCtg(i); + in_num = validConnect(bal_i,NULL); + out_num = validConnect(i,NULL); + if((in_num>1||out_num>1)&&(in_num+out_num>=num_connect)){ + flag++; + maskContig(i,1); + } + in_num = validConnect(bal_i,NULL); + out_num = validConnect(i,NULL); + if(in_num>1||out_num>1){ + puzzleCounter++; + //debugging2(i); + } + + if(isSmallerThanTwin(i)) + i++; + } + //printf("Masked %d contigs, %d puzzle left\n",flag,puzzleCounter); + return flag; +} + +static void deleteWeakCnt(int cut_off) +{ + unsigned int i; + CONNECT *cn_temp1; + int weaks=0,counter=0; + //fprintf(stderr,"[%s]entering this function. 
num_ctg=%d\n",__FUNCTION__,num_ctg); + for(i=1;i<=num_ctg;i++){ + //fprintf(stderr,"[%s]iterating %d.\n",__FUNCTION__,i); + cn_temp1 = contig_array[i].downwardConnect; + while(cn_temp1){ + if(!cn_temp1->mask&&!cn_temp1->deleted&&!cn_temp1->nextInScaf + &&!cn_temp1->singleInScaf&&!cn_temp1->prevInScaf){ + counter++; + } + if(cn_temp1->weak&&cn_temp1->deleted&&cn_temp1->weight>=cut_off){ + cn_temp1->deleted = 0; + cn_temp1->weak = 0; + } + else if(!cn_temp1->deleted&&cn_temp1->weight>0&&cn_temp1->weightnextInScaf&&!cn_temp1->prevInScaf){ + cn_temp1->deleted = 1; + cn_temp1->weak = 1; + if(cn_temp1->singleInScaf) + cn_temp1->singleInScaf = 0; + if(!cn_temp1->mask) + weaks++; + } + cn_temp1 = cn_temp1->next; + } + + } + fprintf(stderr,"[%s]%d connects doesn't meet weight threshold .\n",__FUNCTION__,weaks); + checkCircle(); +} + +//check if one contig is linearly connected to the other ->C1->C2... +static int linearC2C(unsigned int starter,CONNECT *cnt2c1,unsigned int c2,int min_dis,int max_dis) +{ + int out_num,in_num; + CONNECT *prevCNT,*cnt,*cn_temp; + unsigned int c1,bal_c1,ctg,bal_c2; + int len=0; + unsigned int bal_start = getTwinCtg(starter); + boolean excep; + + c1 = cnt2c1->contigID; + + if(c1==c2){ + printf("linearC2C: c1(%d) and c2(%d) are the same contig\n",c1,c2); + return -1; + } + + bal_c1 = getTwinCtg(c1); + in_num = validConnect(bal_c1,NULL); + if(in_num>1) + return 0; + + dsCtgCounter = 1; + usCtgCounter = 0; + downstreamCTG[dsCtgCounter++] = c1; + bal_c2 = getTwinCtg(c2); + upstreamCTG[usCtgCounter++] = bal_c2; + // check if c1 is linearly connected to c2 by pe connections + cnt = prevCNT = cnt2c1; + while((cnt=getNextContig(c1,prevCNT,&excep))!=NULL){ + c1 = cnt->contigID; + len += cnt->gapLen+contig_array[c1].length; + if(c1==c2) + return 1; + + if(len>max_dis||c1==starter||c1==bal_start) + return 0; + downstreamCTG[dsCtgCounter++] = c1; + if(dsCtgCounter>=MAXCinBetween){ + printf("%d downstream contigs, start at %d, max_dis %d, current dis %d\n" + 
,dsCtgCounter,starter,max_dis,len); + return 0; + } + prevCNT = cnt; + } + out_num = validConnect(c1,NULL); + if(out_num) + return 0; + + + //find the most upstream contig to c2 + cnt = prevCNT = NULL; + ctg = bal_c2; + while((cnt=getNextContig(ctg,prevCNT,&excep))!=NULL){ + ctg = cnt->contigID; + len += cnt->gapLen+contig_array[ctg].length; + if(len>max_dis||ctg==starter||ctg==bal_start) + return 0; + + prevCNT = cnt; + upstreamCTG[usCtgCounter++] = ctg; + if(usCtgCounter>=MAXCinBetween){ + printf("%d upstream contigs, start at %d, max_dis %d, current dis %d\n" + ,usCtgCounter,starter,max_dis,len); + return 0; + } + } + if(dsCtgCounter+usCtgCounter>MAXCinBetween){ + printf("%d downstream and %d upstream contigs\n",dsCtgCounter,usCtgCounter); + return 0; + } + out_num = validConnect(ctg,NULL); + if(out_num){ + return 0; + } + + c2 = getTwinCtg(ctg); + min_dis -= len; + max_dis -= len; + if(c1==c2||c1==ctg||max_dis<0) + return 0; + + cn_temp = getCntBetween(c1,c2); + if(cn_temp){ + setConnectMask(c1,c2,0); + setConnectDelete(c1,c2,0,0); + return 1; + } + len = (min_dis+max_dis)/2 >= 0 ? 
(min_dis+max_dis)/2 : 0; + cn_temp = allocateCN(c2,len); + if(cntLookupTable) + putCnt2LookupTable(c1,cn_temp); + cn_temp->weight = 0; // special connect from the original graph + cn_temp->next = contig_array[c1].downwardConnect; + contig_array[c1].downwardConnect = cn_temp; + + bal_c1 = getTwinCtg(c1); + bal_c2 = getTwinCtg(c2); + + cn_temp = allocateCN(bal_c1,len); + if(cntLookupTable) + putCnt2LookupTable(bal_c2,cn_temp); + cn_temp->weight = 0; // special connect from the original graph + cn_temp->next = contig_array[bal_c2].downwardConnect; + contig_array[bal_c2].downwardConnect = cn_temp; + return 1; +} +//catenate upstream contig array and downstream contig array to solidArray +static void catUsDsContig() +{ + int i; + + for(i=0;i=0;i--){ + *(unsigned int *)darrayPut(solidArray,dsCtgCounter++) = getTwinCtg(upstreamCTG[i]); + } + + solidCounter = dsCtgCounter; +} + +//binding the connections between contigs in solidArray +static void consolidate() +{ + int i,j; + CONNECT *prevCNT=NULL; + CONNECT *cnt; + unsigned int to_ctg; + unsigned int from_ctg = *(unsigned int *)darrayGet(solidArray,0); + + for(i=1;i",*(unsigned int *)darrayGet(solidArray,j)); + printf("\n"); + return; + } + cnt->singleInScaf = solidCounter==2 ? 1:0; + if(prevCNT){ + setNextInScaf(prevCNT,cnt); + setPrevInScaf(cnt,1); + } + prevCNT = cnt; + from_ctg = to_ctg; + } + + //the reverse complementary path + from_ctg = getTwinCtg(*(unsigned int*)darrayGet(solidArray,solidCounter-1)); + prevCNT = NULL; + for(i=solidCounter-2;i>=0;i--){ + to_ctg = getTwinCtg(*(unsigned int *)darrayGet(solidArray,i)); + cnt = checkConnect(from_ctg,to_ctg); + if(!cnt){ + printf("consolidate B: no connect from %d to %d\n",from_ctg,to_ctg); + return; + } + cnt->singleInScaf = solidCounter==2 ? 
1:0; + if(prevCNT){ + setNextInScaf(prevCNT,cnt); + setPrevInScaf(cnt,1); + } + prevCNT = cnt; + from_ctg = to_ctg; + } + +} + +static void debugging1(unsigned int ctg1,unsigned int ctg2) +{ + CONNECT *cn1; + cn1 = getCntBetween(ctg1,ctg2); + if(cn1){ + printf("(%d,%d) mask %d deleted %d w %d,singleInScaf %d\n", + ctg1,ctg2,cn1->mask,cn1->deleted,cn1->weight,cn1->singleInScaf); + if(cn1->nextInScaf) + printf("%d->%d->%d\n",ctg1,ctg2,cn1->nextInScaf->contigID); + if(cn1->prevInScaf) + printf("*->%d->%d\n",ctg1,ctg2); + else if(!cn1->nextInScaf) + printf("NULL->%d->%d->NULL\n",ctg1,ctg2); + }else + printf("%d -X- %d\n",ctg1,ctg2); +} +//remove transitive connections which cross linear paths (these paths may be broken) +//if a->b->c and a->c, mask a->c +static void removeTransitive() +{ + unsigned int i,bal_ctg; + int flag=1,out_num,in_num,count,min,max,linear; + CONNECT *cn_temp,*cn1=NULL,*cn2=NULL; + + while(flag){ + flag = 0; + for(i=1;i<=num_ctg;i++){ + if(contig_array[i].mask) + continue; + out_num = validConnect(i,NULL); + if(out_num!=2) + continue; + cn_temp = contig_array[i].downwardConnect; + count = 0; + while(cn_temp){ + if(cn_temp->deleted||cn_temp->mask){ + cn_temp = cn_temp->next; + continue; + } + count++; + if(count==1) + cn1 = cn_temp; + else if(count==2){ + cn2 = cn_temp; + }else // count > 2 + break; + + cn_temp = cn_temp->next; + } + if(count>2){ + printf("%d valid connections from ctg %d\n",count,i); + continue; + } + if(cn1->gapLen>cn2->gapLen){ + cn_temp = cn1; + cn1 = cn2; + cn2 = cn_temp; + } //make sure cn1 is closer to contig i than cn2 + if(cn1->prevInScaf&&cn2->prevInScaf) + continue; + bal_ctg = getTwinCtg(cn2->contigID); + in_num = validConnect(bal_ctg,NULL); + if(in_num>2) + continue; + min = cn2->gapLen - cn1->gapLen - contig_array[cn1->contigID].length - ins_size_var/2; + max = cn2->gapLen - cn1->gapLen - contig_array[cn1->contigID].length + ins_size_var/2; + + if(max<0) + continue; + //temprarily delete cn2 + 
setConnectDelete(i,cn2->contigID,1,0); + linear = linearC2C(i,cn1,cn2->contigID,min,max); + if(linear!=1){ + setConnectDelete(i,cn2->contigID,0,0); + continue; + }else{ + downstreamCTG[0] = i; + catUsDsContig(); + if(!checkSimple(solidArray,solidCounter)) + continue; + cn1 = getCntBetween(*(unsigned int *)darrayGet(solidArray,solidCounter-2), + *(unsigned int *)darrayGet(solidArray,solidCounter-1)); + if(cn1&&cn1->nextInScaf&&cn2->nextInScaf){ + setConnectDelete(i,cn2->contigID,0,0); + continue; + } + consolidate(); + if(cn2->prevInScaf) + substitueDSinScaf(cn2,*(unsigned int *)darrayGet(solidArray,0), + *(unsigned int *)darrayGet(solidArray,1)); + if(cn2->nextInScaf) + substitueUSinScaf(cn2,*(unsigned int *)darrayGet(solidArray,solidCounter-2)); + flag++; + } + } //for each contig + //printf("a remove transitive lag, %d connections removed\n",flag); + } + +} + +//get repeat contigs back into the scaffold according to connected unique contigs on both sides +/* + A ------ D + > [i] < + B E +*/ +static void debugging2(unsigned int ctg) +{ + CONNECT *cn1 = contig_array[ctg].downwardConnect; + while(cn1){ + if(cn1->nextInScaf) + fprintf(stderr,"with nextInScaf,"); + if(cn1->prevInScaf) + fprintf(stderr,"with prevInScaf,"); + fprintf(stderr,"%u >> %d, mask %d deleted %d, inherit %d, singleInScaf %d\n", + ctg,cn1->contigID,cn1->mask,cn1->deleted,cn1->inherit,cn1->singleInScaf); + cn1 = cn1->next; + } +} +static void debugging() +{ +/* + debugging1(1777,1468); + debugging2(8065); + debugging2(8066); +*/ +} + +static void simplifyCnt() +{ + removeTransitive(); + debugging(); + general_linearization(1); + debugging(); +} + +static int getIndexInArray(unsigned int node) +{ + int index; + for(index=0;index0){ + //printf("exists\n"); + return 0; + } + if(index>=MaxNodeInSub) + return -1; + insertNodeIntoHeap(heap,distance,node); + nodesInSub[index] = node; + nodeDistance[index] = distance; + return 1; +} + +static boolean putChainIntoSubgraph(FibHeap *heap,int 
distance,unsigned int node,int *index,CONNECT *prevC) +{ + unsigned int ctg = node; + CONNECT *nextCnt; + boolean excep,flag; + int counter = *index; + + while(1){ + nextCnt=getNextContig(ctg,prevC,&excep); + if(excep||!nextCnt){ + *index = counter; + return 1; + } + ctg = nextCnt->contigID; + distance += nextCnt->gapLen + ctg; + flag = putNodeIntoSubgraph(heap,distance,ctg,counter); + if(flag<0) + return 0; + if(flag>0) + counter++; + prevC = nextCnt; + } +} +//check if nodes in subgraph have a potential heter form +static boolean check_het_overlap(double tolerance){ + + int i,gap,overlap_point; + unsigned int node; + int len_sum,over3_len,over3_sum; + boolean flag=0; + len_sum=0; + over3_len=0; + over3_sum=0; + for(i=1;i<=nodeCounter;i++){ + node = ctg4heapArray[i].ctgID; + len_sum += contig_array[node].length; + } + if(len_sum<1) + return 2; + for(i=1;i0){ + flag=0; + } + else{ + if(flag){ + over3_len=ctg4heapArray[i+1].dis - overlap_point + - contig_array[ctg4heapArray[i+1].ctgID].length; + over3_sum+=over3_len; + if((double)over3_sum/len_sum>tolerance) + return 0; + } + flag=1; + overlap_point=ctg4heapArray[i].dis; + } + } + + return 2; +} + +// check if a contig is unique by trying to line its downstream/upstream nodes together +static boolean checkUnique(unsigned int node,double tolerance) +{ + CONNECT *ite_cnt; + unsigned int currNode; + int distance; + int popCounter = 0; + boolean flag; + + currNode = node; + FibHeap *heap = newFibHeap(); + + putNodeIntoSubgraph(heap, 0, currNode, 0); + nodeCounter = 1; + ite_cnt = contig_array[currNode].downwardConnect; + while(ite_cnt){ + if(ite_cnt->deleted||ite_cnt->mask){ + ite_cnt = ite_cnt->next; + continue; + } + currNode = ite_cnt->contigID; + distance = ite_cnt->gapLen + contig_array[currNode].length; + flag = putNodeIntoSubgraph(heap, distance, currNode, nodeCounter); + if(flag<0){ + destroyHeap(heap); + return 0; + } + if(flag>0) + nodeCounter++; + + flag = 
putChainIntoSubgraph(heap,distance,currNode,&nodeCounter,ite_cnt); + if(!flag){ + destroyHeap(heap); + return 0; + } + + ite_cnt = ite_cnt->next; + } + if(nodeCounter<=2){ // no more than 2 valid connections + destroyHeap(heap); + return 1; + } + + while((currNode=removeNextNodeFromHeap(heap))!=0) + nodesInSubInOrder[popCounter++] = currNode; + + destroyHeap(heap); + + flag = checkOverlapInBetween(tolerance); + if(flag==1){ + return 1; + }else{ + flag = check_het_overlap(0.02);//check the heter form + } + return flag; +} + +//find longest path and break the other +static void process_ds_contig(unsigned int ctg){ + unsigned int target=ctg4heapArray[nodeCounter].ctgID; + //int boarder = ctg4heapArray[nodeCounter].dis; + boolean excep; + CONNECT *route=contig_array[ctg].downwardConnect; + CONNECT *max_route=route; + + int max_dis=0; + + boolean end_flag=0; + while(route){ + + int dis=0; + CONNECT *tmp_cnt=route; + while(tmp_cnt){ + dis+=route->gapLen+contig_array[route->contigID].length; + if(route->contigID==target){ + end_flag=1; + break; + } + tmp_cnt=getNextContig(route->contigID,tmp_cnt,&excep); + } + if(dis>max_dis){ + max_dis=dis; + max_route=route; + } + if(end_flag){ + max_route=route; + break; + } + route=route->next; + } + //delete connect except max_route + route=contig_array[ctg].downwardConnect; + while(route){ + if(route!=max_route){ + setConnectMask(ctg,route->contigID,1); + } + route=route->next; + } + +} +static void process_us_contig(unsigned int ctg){ + unsigned int target=ctg4heapArray[1].ctgID; + //int boarder = ctg4heapArray[1].dis; + boolean excep; + CONNECT *route=contig_array[ctg].downwardConnect; + CONNECT *min_route=route; + + int min_dis=0; + + boolean end_flag=0; + while(route){ + + int dis=0; + CONNECT *tmp_cnt=route; + while(tmp_cnt){ + dis-=route->gapLen+contig_array[route->contigID].length; + if(route->contigID==target){ + end_flag=1; + break; + } + tmp_cnt=getNextContig(route->contigID,tmp_cnt,&excep); + } + if(disnext; + } + 
//delete connect except min_route + route=contig_array[ctg].downwardConnect; + while(route){ + if(route!=min_route){ + setConnectMask(ctg,route->contigID,1); + } + route=route->next; + } + +} + +//mask contigs with downstream and/or upstream can not be lined +static void maskRepeat() +{ + int in_num,out_num,flagA,flagB; + int counter = 0; + int puzzleCounter = 0; + unsigned int i,bal_i; + int het_counter = 0; + for(i=1;i<=num_ctg;i++){ + if(contig_array[i].mask) + continue; + bal_i = getTwinCtg(i); + in_num = validConnect(bal_i,NULL); + out_num = validConnect(i,NULL); + if(in_num>1||out_num>1) + puzzleCounter++; + else{ + if(isSmallerThanTwin(i)) + i++; + continue; + + } + + if(contig_array[i].cvg>2*cvgAvg){ + counter++; + maskContig(i,1); + //printf("thick mask contig %d and %d\n",i,bal_i); + if(isSmallerThanTwin(i)) + i++; + continue; + } + + if(in_num>1) + flagA = checkUnique(bal_i,OverlapPercent); + else + flagA = 1; + if(out_num>1) + flagB = checkUnique(i,OverlapPercent); + else + flagB = 1; + + if(flagA==0||flagB==0){ + counter++; + maskContig(i,1); + }else{ + if(flagA==2){//us find longest path + process_us_contig(bal_i); + } + if(flagB==2){//ds find longest path + process_ds_contig(i); + } + } + if(flagA==2||flagB==2) + het_counter++; + + if(isSmallerThanTwin(i)) + i++; + } + printf("[%s]%d contigs masked from %d puzzles\n",__FUNCTION__,counter,puzzleCounter); + printf("[%s]%d processed as heterozygous .\n",__FUNCTION__,het_counter); +} + + +static void ordering(boolean deWeak,boolean downS, boolean nonlinear, char *infile) +{ + //debugging(); + if(downS){ + downSlide(); + //debugging(); + if(deWeak) + deleteWeakCnt(weakPE); + }else{ + if(deWeak) + deleteWeakCnt(weakPE); + } + //output_scaf(infile); + //debugging(); + //printf("variance for insert size %d\n",ins_size_var); + simplifyCnt(); + //debugging(); + + maskRepeat(); + //debugging(); + simplifyCnt(); + + if(nonlinear){ + //printf("non-strict linearization\n"); + general_linearization(0); + 
//linearization(0,0); + } + //maskRepeat();//??? + + maskPuzzle(2,0); + //debugging(); + freezing(); + //debugging(); + +} + +//check if contigs next to each other have reasonable overlap +boolean checkOverlapInBetween(double tolerance) +{ + int i,gap; + int index; + unsigned int node; + int lenSum,lenOlp; + lenSum = lenOlp = 0; + for(i=0;i0) + lenOlp += -gap; + //if(-gap>ins_size_var) + if((double)lenOlp/lenSum>tolerance) + return 0; + } + return 1; +} + + +/********* the following codes are for freezing current scaffolds ****************/ +//set connections between contigs in a array to used or not +//meanwhile set mask to the opposite value +static boolean setUsed(unsigned int start,unsigned int *array,int max_steps,boolean flag) +{ + unsigned int prevCtg = start; + unsigned int twinA,twinB; + int j; + CONNECT *cnt; + boolean usedFlag=0; + // save 'used' to 'checking' + prevCtg = start; + for(j=0;jused==flag||cnt->nextInScaf||cnt->prevInScaf||cnt->singleInScaf){ + return 1; + } + cnt->checking = cnt->used; + twinA = getTwinCtg(prevCtg); + twinB = getTwinCtg(array[j]); + cnt = getCntBetween(twinB,twinA); + if(cnt) + cnt->checking = cnt->used; + prevCtg = array[j]; + } + // set used to flag + prevCtg = start; + for(j=0;jused==flag){ + usedFlag = 1; + break; + } + cnt->used = flag; + twinA = getTwinCtg(prevCtg); + twinB = getTwinCtg(array[j]); + cnt = getCntBetween(twinB,twinA); + if(cnt) + cnt->used = flag; + prevCtg = array[j]; + } + // set mask to 'NOT flag' or set used to original value + prevCtg = start; + for(j=0;jmask = 1-flag; + else + cnt->used = cnt->checking; + twinA = getTwinCtg(prevCtg); + twinB = getTwinCtg(array[j]); + cnt = getCntBetween(twinB,twinA); + cnt->used = 1-flag; + if(!usedFlag) + cnt->mask = 1-flag; + else + cnt->used = cnt->checking; + prevCtg = array[j]; + } + return usedFlag; +} +// break down scaffolds poorly supported by longer PE +static void recoverMask() +{ + unsigned int i,ctg,bal_ctg,start,finish; + int num3,num5,j,t; + CONNECT 
*bindCnt,*cnt; + int min,max,max_steps=5,num_route,length; + int tempCounter,recoverCounter=0; + boolean multiUSE,change; + + for(i=1;i<=num_ctg;i++) + contig_array[i].flag = 0; + + so_far = (unsigned int *)ckalloc(max_n_routes*sizeof(unsigned int)); + found_routes = (unsigned int **)ckalloc(max_n_routes*sizeof(unsigned int *)); + for(j=0;jused) + break; + setConnectUsed(ctg,bindCnt->contigID,1); + ctg = bindCnt->contigID; + *(unsigned int *)darrayPut(scaf5,num5++) = ctg; + bal_ctg = getTwinCtg(ctg); + contig_array[ctg].flag = 1; + contig_array[bal_ctg].flag = 1; + bindCnt = bindCnt->nextInScaf; + } + + ctg = getTwinCtg(i); + bindCnt = getBindCnt(ctg); + while(bindCnt){ + if(bindCnt->used) + break; + setConnectUsed(ctg,bindCnt->contigID,1); + ctg = bindCnt->contigID; + bal_ctg = getTwinCtg(ctg); + contig_array[ctg].flag = 1; + contig_array[bal_ctg].flag = 1; + *(unsigned int *)darrayPut(scaf3,num3++) = bal_ctg; + bindCnt = bindCnt->nextInScaf; + } + if(num5+num3<2) + continue; + tempCounter = solidCounter = 0; + for(j=num3-1;j>=0;j--) + *(unsigned int *)darrayPut(tempArray,tempCounter++) = + *(unsigned int *)darrayGet(scaf3,j); + for(j=0;jgapLen + contig_array[finish].length; + min = length - 1.5*ins_size_var; + max = length + 1.5*ins_size_var; + traceAlongMaskedCnt(finish,start,max_steps,min,max,0,0,&num_route); + if(finish==start){ + for(j=0;j%d",*(unsigned int *)darrayGet(tempArray,j)); + printf(": start at %d\n",i); + } + + if(num_route==1){ + for(j=0;jused = 0; + cnt->checking = 0; + cnt = cnt->next; + } + } + + for(j=0;j B -> C -> D un-bind link B->C to link A->B and B->C +// A' <- B' <- C' <- D' +static void unBindLink(unsigned int CB,unsigned int CC) +{ + //fprintf(stderr,"Unbind link (%d %d) to others...\n",CB,CC); + CONNECT *cnt1 = getCntBetween(CB,CC); + if(!cnt1) + return; + if(cnt1->singleInScaf) + cnt1->singleInScaf = 0; + CONNECT *cnt2 = getCntBetween(getTwinCtg(CC),getTwinCtg(CB)); + if(!cnt2) + return; + if(cnt2->singleInScaf) + cnt2->singleInScaf 
= 0; + if(cnt1->nextInScaf){ + unsigned int CD = cnt1->nextInScaf->contigID; + cnt1->nextInScaf->prevInScaf = 0; + cnt1->nextInScaf = NULL; + CONNECT *cnt3 = getCntBetween(getTwinCtg(CD),getTwinCtg(CC)); + if(cnt3) + cnt3->nextInScaf = NULL; + cnt2->prevInScaf = 0; + } + if(cnt2->nextInScaf){ + unsigned int bal_CA = cnt2->nextInScaf->contigID; + cnt2->nextInScaf->prevInScaf = 0; + cnt2->nextInScaf = NULL; + CONNECT *cnt4 = getCntBetween(getTwinCtg(bal_CA),CB); + if(cnt4) + cnt4->nextInScaf = NULL; + cnt1->prevInScaf = 0; + } +} + +static void freezing() +{ + int num5,num3; + unsigned int ctg,bal_ctg; + unsigned int i; + int j,t; + CONNECT *cnt,*prevCNT,*nextCnt; + boolean excep; + + for(i=1;i<=num_ctg;i++){ + contig_array[i].flag = 0; + contig_array[i].from_vt = 0; + contig_array[i].to_vt = 0; + cnt = contig_array[i].downwardConnect; + while(cnt){ + cnt->used = 0; + cnt->checking = 0; + cnt->singleInScaf = 0; + cnt = cnt->next; + } + } + + for(i=1;i<=num_ctg;i++){ + if(contig_array[i].flag||contig_array[i].mask) + continue; + + if(!contig_array[i].downwardConnect||!validConnect(i,NULL)){ + continue; + } + + num5 = num3 = 0; + ctg = i; + *(unsigned int *)darrayPut(scaf5,num5++) = i; + contig_array[i].flag = 1; + contig_array[getTwinCtg(i)].flag = 1; + prevCNT = NULL; + cnt = getNextContig(ctg,prevCNT,&excep); + while(cnt){ + if(contig_array[cnt->contigID].flag){ + unBindLink(ctg,cnt->contigID); + break; + } + nextCnt=getNextContig(cnt->contigID,cnt,&excep); + setConnectUsed(ctg,cnt->contigID,1); + ctg = cnt->contigID; + *(unsigned int *)darrayPut(scaf5,num5++) = ctg; + bal_ctg = getTwinCtg(ctg); + contig_array[ctg].flag = 1; + contig_array[bal_ctg].flag = 1; + prevCNT = cnt; + cnt = nextCnt; + } + + ctg = getTwinCtg(i); + if(num5>=2) + prevCNT = checkConnect(getTwinCtg(*(unsigned int *)darrayGet(scaf5,1)),ctg); + else + prevCNT = NULL; + cnt = getNextContig(ctg,prevCNT,&excep); + while(cnt){ + if(contig_array[cnt->contigID].flag){ + unBindLink(ctg,cnt->contigID); + 
break; + } + nextCnt=getNextContig(cnt->contigID,cnt,&excep); + setConnectUsed(ctg,cnt->contigID,1); + ctg = cnt->contigID; + bal_ctg = getTwinCtg(ctg); + contig_array[ctg].flag = 1; + contig_array[bal_ctg].flag = 1; + *(unsigned int *)darrayPut(scaf3,num3++) = bal_ctg; + prevCNT = cnt; + cnt = nextCnt; + } + if(num5+num3<2) + continue; + solidCounter = 0; + for(j=num3-1;j>=0;j--) + *(unsigned int *)darrayPut(solidArray,solidCounter++) = + *(unsigned int *)darrayGet(scaf3,j); + for(j=0;j=0;t--) + if(!contig_array[*(unsigned int *)darrayGet(solidArray,t)].mask){ + lastCtg = *(unsigned int *)darrayGet(solidArray,t); + break; + } + if(firstCtg==0||lastCtg==0){ + printf("scaffold start at %d, stop at %d, freezing began with %d\n",firstCtg,lastCtg,i); + for(j=0;j%d(%d %d)",*(unsigned int *)darrayGet(solidArray,j) + ,contig_array[*(unsigned int *)darrayGet(solidArray,j)].mask + ,contig_array[*(unsigned int *)darrayGet(solidArray,j)].flag); + printf("\n"); + }else{ + firstTwin = getTwinCtg(firstCtg); + lastTwin = getTwinCtg(lastCtg); + } + for(t=0;t0){ + contig_array[ctg].mask = 1; + contig_array[getTwinCtg(ctg)].mask = 1; + printf("Repeat: contig %d (%d) appears more than once\n",ctg,getTwinCtg(ctg)); + }else{ + contig_array[ctg].from_vt = firstCtg; + contig_array[ctg].to_vt = lastCtg; + contig_array[ctg].indexInScaf = t+1; + contig_array[getTwinCtg(ctg)].from_vt = lastTwin; + contig_array[getTwinCtg(ctg)].to_vt = firstTwin; + contig_array[getTwinCtg(ctg)].indexInScaf = solidCounter-t; + } + } + consolidate(); + } + + //printf("Freezing is done....\n"); + fflush(stdout); + + for(i=1;i<=num_ctg;i++){ + if(contig_array[i].flag) + contig_array[i].flag = 0; + + if(contig_array[i].from_vt==0){ + contig_array[i].from_vt = i; + contig_array[i].to_vt = i; + } + cnt = contig_array[i].downwardConnect; + while(cnt){ + cnt->used = 0; + cnt->checking = 0; + cnt = cnt->next; + } + } + +} + +/************** codes below this line are for pulling the scaffolds out ************/ +void 
output1gap(FILE *fo,int max_steps) +{ + int i,len,seg; + len = seg = 0; + + for(i=0;ibySmall&&bindCnt->weakPoint){ + weakCounter++; + fprintf(fp,"\tWP"); + ret = 1; + } + + while(cnt){ + if(cnt->weight&&!cnt->inherit){ + if(!flag){ + flag = 1; + fprintf(fp,"\t#DOWN "); + } + linkCtg = cnt->contigID; + if(isLargerThanTwin(linkCtg)) + linkCtg = getTwinCtg(linkCtg); + + fprintf(fp,"%d:%d:%d ",index_array[linkCtg],cnt->weight,cnt->gapLen); + } + cnt = cnt->next; + } + flag = 0; + cnt = contig_array[bal_ctg].downwardConnect; + while(cnt){ + if(cnt->weight&&!cnt->inherit){ + if(!flag){ + flag = 1; + fprintf(fp,"\t#UP "); + } + linkCtg = cnt->contigID; + if(isLargerThanTwin(linkCtg)) + linkCtg = getTwinCtg(linkCtg); + + fprintf(fp,"%d:%d:%d ",index_array[linkCtg],cnt->weight,cnt->gapLen); + } + cnt = cnt->next; + } + fprintf(fp,"\n"); + return ret; +} + +void scaffolding(unsigned int len_cut,char *outfile) +{ + unsigned int prev_ctg,ctg,bal_ctg,*length_array,count=0,num_lctg=0; + unsigned int i,max_steps=5; + int num5,num3,j,len,flag,num_route,gap_c=0; + short gap=0; + long long sum=0,N50,N90; + FILE *fp,*fo=NULL; + char name[256]; + CONNECT *cnt,*prevCNT,*nextCnt; + boolean excep,weak; + weakCounter = 0; + + so_far = (unsigned int *)ckalloc(max_n_routes*sizeof(unsigned int)); + found_routes = (unsigned int **)ckalloc(max_n_routes*sizeof(unsigned int*)); + for(j=0;j0) + length_array[index_array[i]] = i; + } + for(i=1;i<=num_ctg;i++) + index_array[i] = length_array[i]; //contig i with original index: index_array[i] + + orig2new = 0; + + sprintf(name,"%s.scaf",outfile); + fp = ckopen(name,"w"); + sprintf(name,"%s.scaf_gap",outfile); + fo = ckopen(name,"w"); + + scaf3 = (DARRAY *)createDarray(1000,sizeof(unsigned int)); + scaf5 = (DARRAY *)createDarray(1000,sizeof(unsigned int)); + gap3 = (DARRAY *)createDarray(1000,sizeof(int)); + gap5 = (DARRAY *)createDarray(1000,sizeof(int)); + + for(i=1;i<=num_ctg;i++) + contig_array[i].flag = 0; + for(i=1;i<=num_ctg;i++){ + 
if(contig_array[i].length+(unsigned int)overlaplen>=len_cut) + num_lctg++; + else + continue; + if(contig_array[i].flag||contig_array[i].mask||!contig_array[i].downwardConnect||!validConnect(i,NULL)) + continue; + + num5 = num3 = 0; + ctg = i; + //printf("%d",i); + *(unsigned int *)darrayPut(scaf5,num5++) = i; + contig_array[i].flag = 1; + bal_ctg = getTwinCtg(ctg); + contig_array[bal_ctg].flag = 1; + len = contig_array[i].length; + prevCNT = NULL; + cnt = getNextContig(ctg,prevCNT,&excep); + while(cnt){ + nextCnt = getNextContig(cnt->contigID,cnt,&excep); + if(excep&&prevCNT) + printf("scaffolding: exception --- prev cnt from %u\n",prevCNT->contigID); + if(nextCnt&&nextCnt->used) + break; + setConnectUsed(ctg,cnt->contigID,1); + *(int *)darrayPut(gap5,num5-1) = cnt->gapLen; + ctg = cnt->contigID; + *(unsigned int *)darrayPut(scaf5,num5++) = ctg; + len += cnt->gapLen+contig_array[ctg].length; + bal_ctg = getTwinCtg(ctg); + contig_array[ctg].flag = 1; + contig_array[bal_ctg].flag = 1; + prevCNT = cnt; + cnt = nextCnt; + //printf("->%d",ctg); + } + //printf("\n"); + + ctg = getTwinCtg(i); + if(num5>=2) + prevCNT = checkConnect(getTwinCtg(*(unsigned int *)darrayGet(scaf5,1)),ctg); + else + prevCNT = NULL; + //printf("%d",i); + //fflush(stdout); + cnt = getNextContig(ctg,prevCNT,&excep); + while(cnt){ + nextCnt=getNextContig(cnt->contigID,cnt,&excep); + if(excep&&prevCNT) + printf("scaffolding: exception -- prev cnt from %u\n",prevCNT->contigID); + if(nextCnt&&nextCnt->used) + break; + setConnectUsed(ctg,cnt->contigID,1); + ctg = cnt->contigID; + len += cnt->gapLen+contig_array[ctg].length; + bal_ctg = getTwinCtg(ctg); + contig_array[ctg].flag = 1; + contig_array[bal_ctg].flag = 1; + //printf("<-%d",bal_ctg); + //fflush(stdout); + *(int *)darrayPut(gap3,num3) = cnt->gapLen; + *(unsigned int *)darrayPut(scaf3,num3++) = bal_ctg; + prevCNT = cnt; + cnt = nextCnt; + } + //printf("\n"); + len += overlaplen; + sum += len; + length_array[count++] = len; + if(num5+num3<1){ + 
//printf("no scaffold created for contig %d\n",i); + continue; + } + fprintf(fp,">scaffold%d %d %d\n",count,num5+num3,len); + fprintf(fo,">scaffold%d %d %d\n",count,num5+num3,len); + len = prev_ctg = 0; + for(j=num3-1;j>=0;j--){ + if(!isLargerThanTwin(*(unsigned int *)darrayGet(scaf3,j))){ + fprintf(fp,"%-10d %-10d + %d " + ,index_array[*(unsigned int *)darrayGet(scaf3,j)],len, + contig_array[*(unsigned int *)darrayGet(scaf3,j)].length+overlaplen); + weak = printCnts(fp,*(unsigned int *)darrayGet(scaf3,j)); + /* + if(weak) + fprintf(stderr,"scaffold%d\n",count); + */ + }else{ + fprintf(fp,"%-10d %-10d - %d " + ,index_array[getTwinCtg(*(unsigned int *)darrayGet(scaf3,j))],len + ,contig_array[*(unsigned int *)darrayGet(scaf3,j)].length+overlaplen); + weak = printCnts(fp,*(unsigned int *)darrayGet(scaf3,j)); + /* + if(weak) + fprintf(stderr,"scaffold%d\n",count); + */ + } + if(prev_ctg){ + num_route = num_trace = 0; + traceAlongArc(*(unsigned int *)darrayGet(scaf3,j),prev_ctg,max_steps + ,gap-ins_size_var,gap+ins_size_var,0,0,&num_route); + if(num_route==1){ + output1gap(fo,max_steps); + gap_c++; + } + } + fprintf(fo,"%-10d %-10d\n",*(unsigned int *)darrayGet(scaf3,j),len); + len += contig_array[*(unsigned int *)darrayGet(scaf3,j)].length + *(int *)darrayGet(gap3,j); + prev_ctg = *(unsigned int *)darrayGet(scaf3,j); + gap = *(int *)darrayGet(gap3,j)>0 ? *(int *)darrayGet(gap3,j):0; + } + for(j=0;j0 ? 
*(int *)darrayGet(gap5,j):0; + } + } + + } + + freeDarray(scaf3); + freeDarray(scaf5); + freeDarray(gap3); + freeDarray(gap5); + + fclose(fp); + fclose(fo); + //printf("\n%d scaffolds from %d contigs sum up %lldbp, with average length %lld, %d gaps filled\n" + // ,count,num_lctg/2,sum,sum/count,gap_c); + printf("[%s]scaffold(s) created : %d , total length : %lld.\n",__FUNCTION__,count ,sum); + //output singleton + for(i=1;i<=num_ctg;i++){ + if(contig_array[i].length+(unsigned int)overlaplen=0;j--){ + sum += length_array[j]; + if(!flag&&sum>=N50){ + printf("[%s]N50 : %d bp, ",__FUNCTION__,length_array[j]); + flag++; + } + if(sum>=N90){ + printf(" N90 : %d bp\n",length_array[j]); + break; + } + } + //printf("Found %d weak points in scaffolds\n",weakCounter); + fflush(stdout); + free((void *)length_array); + for(j=0;jweight<1){ + cnts = cnts->next; + continue; + } + fprintf(fp,"%-10d %-10d\t%d\t%d\t%d\n" + ,i,cnts->contigID,cnts->gapLen,cnts->weight,insertS); + cnts->weight = 0; + + bal_toCtg = getTwinCtg(cnts->contigID); + temp_cnt = getCntBetween(bal_toCtg,bal_ctg); + if(temp_cnt) + temp_cnt->weight = 0; + + cnts = cnts->next; + } + } +} + +//use pe info in ascent order +void PE2Links(char *infile) +{ + fprintf(stderr,"[%s]entering this function.\n",__FUNCTION__); + char name[256],*line; + FILE *fp,*linkF; + int i; + int flag=0; + unsigned int j; + + + sprintf(name,"%s.links",infile); + /*linkF = fopen(name,"r"); + if(linkF){ + printf("file %s exists, skip creating the links...\n",name); + fclose(linkF); + return; + }*/ + + linkF = ckopen(name,"w"); + + if(!pes) + loadPEgrads(infile); + + sprintf(name,"%s.readOnContig",infile); + fp = ckopen(name,"r"); + + lineLen = 1024; + line = (char *)ckalloc(lineLen*sizeof(char)); + + fgets(line,lineLen,fp); + line[0] = '\0'; + + //printf("\n"); + for(i=0;i=ctg_short&&contig_array[toCtg].length>=ctg_short){ + if(1){ + bal_ctg = getTwinCtg(ctg); + bal_toCtg = getTwinCtg(toCtg); + add1Connect(ctg,toCtg,gap,wt,0); + 
add1Connect(bal_toCtg,bal_ctg,gap,wt,0); + counter++; + if(contig_array[ctg].mask||contig_array[toCtg].mask) + maskCounter++; + + if(insertS>1000&& + contig_array[ctg].from_vt==contig_array[toCtg].from_vt&& // on the same scaff + contig_array[ctg].indexInScafinsertS) + break; + /* + if(contig_array[ctg].length1000&& + contig_array[ctg].from_vt==contig_array[toCtg].from_vt&& // on the same scaff + contig_array[ctg].indexInScaf1000&&isPrevSmall){ + smallScaf(); + isPrevSmall = 0; + }*/ + flag2 = inputLinks(fp,pes[i].insertS,line); + //printf("Insert size %d: %d links input\n",pes[i].insertS,flag2); + if(flag2){ + lib_n++; + cutoff_sum += pes[i].pair_num_cut; + weakPE=cutoff_sum; + } + flag += flag2; + if(!flag){ + //printf("\n"); + continue; + } + if(i==gradsCounter-1|| pes[i+1].rank!=pes[i].rank){ + flag = nonLinear = downS = markSmall = 0; + + if(pes[i].insertS>1000&&pes[i].rank>1) + downS = 1; + if(pes[i].insertS<=1000) + smallPE = 1; + + if(pes[i].insertS>=1000){ + ins_size_var = 50; + //OverlapPercent = 0.05; + }else if(pes[i].insertS>=300){ + ins_size_var = 30; + //OverlapPercent = 0.05; + }else{ + ins_size_var = 20; + //OverlapPercent = 0.05; + } + //if(pes[i].insertS>1000) + //weakPE = 5; + //static_f = 1; + //if(lib_n>0){ + //weakPE = weakPEmaxNodes) + return 0; + if(contig_array[getTwinCtg(node)].inSubGraph) + return 0; + ctg4heapArray[index].ctgID = node; + ctg4heapArray[index].dis = dis; + contig_array[node].inSubGraph = 1; + + ctg4heapArray[index].ds_shut4dheap = 0; + ctg4heapArray[index].us_shut4dheap = 0; + ctg4heapArray[index].ds_shut4uheap = 0; + ctg4heapArray[index].us_shut4uheap = 0; + + return 1; +} + +static void setInGraph(boolean flag) +{ + int i; + int node; + nodeCounter = nodeCounter>MaxNodeInSub ? 
MaxNodeInSub:nodeCounter; + for(i=1;i<=nodeCounter;i++){ + node = ctg4heapArray[i].ctgID; + if(node>0) + contig_array[node].inSubGraph = flag; + } +} + +static boolean dispatch1node(int dis,unsigned int tempNode,int maxNodes, + FibHeap *dheap,FibHeap *uheap,int *DmaxDis,int *UmaxDis) +{ + boolean ret; + if(dis>=0){ // put it to Dheap + nodeCounter++; + ret = putNodeInArray(tempNode,maxNodes,dis); + if(!ret) + return 0; + insertNodeIntoHeap(dheap,dis,nodeCounter); + if(dis>*DmaxDis) + *DmaxDis = dis; + return 1; + }else{ // put it to Uheap + nodeCounter++; + ret = putNodeInArray(tempNode,maxNodes,dis); + if(!ret) + return 0; + insertNodeIntoHeap(uheap,-dis,nodeCounter); + int temp_len = contig_array[tempNode].length; + if(-dis+temp_len>*UmaxDis) + *UmaxDis = -dis+contig_array[tempNode].length; + return -1; + } + return 0; +} + +static boolean canDheapWait(unsigned int currNode,int dis, int DmaxDis) +{ + if(disctgID; + dis0 = ctgInHeap->dis; + + isEmpty = IsHeapEmpty(dheap); + + twin = getTwinCtg(currNode); + us_cnt = ctgInHeap->us_shut4dheap? NULL:contig_array[twin].downwardConnect; + while(us_cnt){ + if(us_cnt->deleted||us_cnt->mask|| + contig_array[getTwinCtg(us_cnt->contigID)].inSubGraph){ + us_cnt = us_cnt->next; + continue; + } + + tempNode = getTwinCtg(us_cnt->contigID); + if(contig_array[tempNode].inSubGraph){ + us_cnt = us_cnt->next; + continue; + } + dis = dis0 - us_cnt->gapLen - (int)contig_array[twin].length; + + ret = dispatch1node(dis,tempNode,maxNodes,dheap,uheap,DmaxDis,UmaxDis); + if(ret==0) + return 0; + else if(ret<0) + *Uwait = 0; + + us_cnt = us_cnt->next; + } + + if(nodeCounter>1&&isEmpty){ + *Dwait = canDheapWait(currNode,dis0,*DmaxDis); + if(*Dwait){ + isEmpty = IsHeapEmpty(dheap); + insertNodeIntoHeap(dheap,dis0,indexInArray); + ctg4heapArray[indexInArray].us_shut4dheap = 1; + if(isEmpty) + return 1; + else + continue; + } + } + ds_cnt = ctgInHeap->ds_shut4dheap? 
NULL:contig_array[currNode].downwardConnect; + while(ds_cnt){ + if(ds_cnt->deleted||ds_cnt->mask||contig_array[ds_cnt->contigID].inSubGraph){ + ds_cnt = ds_cnt->next; + continue; + } + tempNode = ds_cnt->contigID; + dis = dis0 + ds_cnt->gapLen + (int)contig_array[tempNode].length; + ret = dispatch1node(dis,tempNode,maxNodes,dheap,uheap,DmaxDis,UmaxDis); + if(ret==0) + return 0; + else if(ret<0) + *Uwait = 0; + } // for each downstream connections + } // for each node comes off the heap + + *Dwait = 1; + return 1; +} + +static boolean canUheapWait(unsigned int currNode,int dis, int UmaxDis) +{ + int temp_len = contig_array[currNode].length; + if(-dis+temp_lenctgID; + dis0 = ctgInHeap->dis; + + isEmpty = IsHeapEmpty(uheap); + ds_cnt = ctgInHeap->ds_shut4uheap? NULL:contig_array[currNode].downwardConnect; + while(ds_cnt){ + if(ds_cnt->deleted||ds_cnt->mask||contig_array[ds_cnt->contigID].inSubGraph){ + ds_cnt = ds_cnt->next; + continue; + } + tempNode = ds_cnt->contigID; + dis = dis0 + ds_cnt->gapLen + contig_array[tempNode].length; + ret = dispatch1node(dis,tempNode,maxNodes,dheap,uheap,DmaxDis,UmaxDis); + if(ret==0) + return 0; + else if(ret>0) + *Dwait = 0; + + } // for each downstream connections + + if(nodeCounter>1&&isEmpty){ + *Uwait = canUheapWait(currNode,dis0,*UmaxDis); + if(*Uwait){ + isEmpty = IsHeapEmpty(uheap); + insertNodeIntoHeap(uheap,dis0,indexInArray); + ctg4heapArray[indexInArray].ds_shut4uheap = 1; + if(isEmpty) + return 1; + else + continue; + } + } + + twin = getTwinCtg(currNode); + us_cnt = ctgInHeap->us_shut4uheap? 
NULL:contig_array[twin].downwardConnect; + while(us_cnt){ + if(us_cnt->deleted||us_cnt->mask|| + contig_array[getTwinCtg(us_cnt->contigID)].inSubGraph){ + us_cnt = us_cnt->next; + continue; + } + + tempNode = getTwinCtg(us_cnt->contigID); + if(contig_array[tempNode].inSubGraph){ + us_cnt = us_cnt->next; + continue; + } + dis = dis0 - us_cnt->gapLen - contig_array[twin].length; + + ret = dispatch1node(dis,tempNode,maxNodes,dheap,uheap,DmaxDis,UmaxDis); + if(ret==0) + return 0; + else if(ret>0) + *Dwait = 1; + + us_cnt = us_cnt->next; + } + + } // for each node comes off the heap + + *Uwait = 1; + return 1; +} + +static boolean pickUpGeneralSubgraph(unsigned int node1,int maxNodes) +{ + FibHeap *Uheap = newFibHeap(); // heap for upstream contigs to node1 + FibHeap *Dheap = newFibHeap(); + int UmaxDis; // max distance upstream to node1 + int DmaxDis; + boolean Uwait; // wait signal for Uheap + boolean Dwait; + int dis; + boolean ret; + + //initiate: node1 is put to array once, and to both Dheap and Uheap + dis = 0; + nodeCounter = 1; + putNodeInArray(node1,maxNodes,dis); + insertNodeIntoHeap(Dheap,dis,nodeCounter); + ctg4heapArray[nodeCounter].us_shut4dheap = 1; + Dwait = 0; + DmaxDis = 0; + + insertNodeIntoHeap(Uheap,dis,nodeCounter); + ctg4heapArray[nodeCounter].ds_shut4uheap = 1; + Uwait = 1; + UmaxDis = contig_array[node1].length; + + while(1){ + ret = workOnDheap(Dheap,Uheap,&Dwait,&Uwait,&DmaxDis,&UmaxDis,maxNodes); + if(!ret){ + setInGraph(0); + destroyHeap(Dheap); + destroyHeap(Uheap); + return 0; + } + ret = workOnUheap(Dheap,Uheap,&Dwait,&Uwait,&DmaxDis,&UmaxDis,maxNodes); + if(!ret){ + setInGraph(0); + destroyHeap(Dheap); + destroyHeap(Uheap); + return 0; + } + if(Uwait&&Dwait){ + destroyHeap(Dheap); + destroyHeap(Uheap); + return 1; + } + } + +} + +static int cmp_ctg(const void *a,const void *b) +{ + CTGinHEAP *A,*B; + A = (CTGinHEAP *)a; + B = (CTGinHEAP *)b; + + if(A->dis>B->dis) + return 1; + else if(A->dis==B->dis) + return 0; + else + return -1; +} + 
+static boolean checkEligible() +{ + unsigned int firstNode = ctg4heapArray[1].ctgID; + unsigned int twin; + int i; + boolean flag = 0; + + //check if the first node has incoming link from twin of any node in subgraph + // or it has multi outgoing links bound to incoming links + twin = getTwinCtg(firstNode); + CONNECT *ite_cnt = contig_array[twin].downwardConnect; + while(ite_cnt){ + if(ite_cnt->deleted||ite_cnt->mask){ + ite_cnt = ite_cnt->next; + continue; + } + if(contig_array[ite_cnt->contigID].inSubGraph){ +/* + if(firstNode==3693) + printf("eligible link %d -> %d\n",twin,ite_cnt->contigID); +*/ + return 0; + } + if(ite_cnt->prevInScaf){ + if(flag) + return 0; + flag = 1; + } + ite_cnt = ite_cnt->next; + } + + //check if the last node has outgoing link to twin of any node in subgraph + // or it has multi outgoing links bound to incoming links + unsigned int lastNode = ctg4heapArray[nodeCounter].ctgID; + ite_cnt = contig_array[lastNode].downwardConnect; + flag = 0; + while(ite_cnt){ + if(ite_cnt->deleted||ite_cnt->mask){ + ite_cnt = ite_cnt->next; + continue; + } + twin = getTwinCtg(ite_cnt->contigID); + if(contig_array[twin].inSubGraph){ +/* + if(firstNode==3693) + printf("eligible link %d -> %d\n",lastNode,ite_cnt->contigID); +*/ + return 0; + } + if(ite_cnt->prevInScaf){ + if(flag) + return 0; + flag = 1; + } + ite_cnt = ite_cnt->next; + } + //check if any node has outgoing link to node outside the subgraph + for(i=1;ideleted||ite_cnt->mask){ + ite_cnt = ite_cnt->next; + continue; + } + if(!contig_array[ite_cnt->contigID].inSubGraph){ + /* + printf("eligible check: ctg %d links to ctg %d\n", + ctg4heapArray[i].ctgID,ite_cnt->contigID); + */ + return 0; + } + ite_cnt = ite_cnt->next; + } + } + //check if any node has incoming link from node outside the subgraph + for(i=2;i<=nodeCounter;i++){ + twin = getTwinCtg(ctg4heapArray[i].ctgID); + ite_cnt = contig_array[twin].downwardConnect; + while(ite_cnt){ + if(ite_cnt->deleted||ite_cnt->mask){ + ite_cnt = 
ite_cnt->next; + continue; + } + if(!contig_array[getTwinCtg(ite_cnt->contigID)].inSubGraph){ + /* + printf("eligible check: ctg %d links to ctg %d\n", + ctg4heapArray[i].ctgID,ite_cnt->contigID); + */ + return 0; + } + ite_cnt = ite_cnt->next; + } + } + + return 1; +} + +//put nodes in sub-graph in a line +static void arrangeNodes_general() +{ + int i,gap; + CONNECT *ite_cnt,*temp_cnt,*bal_cnt,*prev_cnt,*next_cnt; + unsigned int node1,node2; + unsigned int bal_nd1,bal_nd2; + //delete original connections + for(i=1;i<=nodeCounter;i++){ + node1 = ctg4heapArray[i].ctgID; + ite_cnt = contig_array[node1].downwardConnect; + while(ite_cnt){ + if(ite_cnt->mask||ite_cnt->deleted||!contig_array[ite_cnt->contigID].inSubGraph){ + ite_cnt = ite_cnt->next; + continue; + } + ite_cnt->deleted = 1; + setNextInScaf(ite_cnt,NULL); + setPrevInScaf(ite_cnt,0); + ite_cnt = ite_cnt->next; + } + + bal_nd1 = getTwinCtg(node1); + ite_cnt = contig_array[bal_nd1].downwardConnect; + while(ite_cnt){ + if(ite_cnt->mask||ite_cnt->deleted||!contig_array[getTwinCtg(ite_cnt->contigID)].inSubGraph){ + ite_cnt = ite_cnt->next; + continue; + } + ite_cnt->deleted = 1; + setNextInScaf(ite_cnt,NULL); + setPrevInScaf(ite_cnt,0); + ite_cnt = ite_cnt->next; + } + } + //create new connections + prev_cnt = next_cnt = NULL; + for(i=1;ideleted = 0; + temp_cnt->mask = 0; + //temp_cnt->gapLen = gap; + bal_cnt = getCntBetween(bal_nd2,bal_nd1); + bal_cnt->deleted = 0; + bal_cnt->mask = 0; + //bal_cnt->gapLen = gap; + } + else{ + temp_cnt = allocateCN(node2,gap); + if(cntLookupTable) + putCnt2LookupTable(node1,temp_cnt); + temp_cnt->next = contig_array[node1].downwardConnect; + contig_array[node1].downwardConnect = temp_cnt; + bal_cnt = allocateCN(bal_nd1,gap); + if(cntLookupTable) + putCnt2LookupTable(bal_nd2,bal_cnt); + bal_cnt->next = contig_array[bal_nd2].downwardConnect; + contig_array[bal_nd2].downwardConnect = bal_cnt; + } + if(prev_cnt){ + setNextInScaf(prev_cnt,temp_cnt); + setPrevInScaf(temp_cnt,1); + } + 
if(next_cnt){ + setNextInScaf(bal_cnt,next_cnt); + setPrevInScaf(next_cnt,1); + } + prev_cnt = temp_cnt; + next_cnt = bal_cnt; + } + + //re-binding connection at both ends + bal_nd2 = getTwinCtg(ctg4heapArray[1].ctgID); + ite_cnt = contig_array[bal_nd2].downwardConnect; + while(ite_cnt){ + if(ite_cnt->deleted||ite_cnt->mask){ + ite_cnt = ite_cnt->next; + continue; + } + if(ite_cnt->prevInScaf) + break; + ite_cnt = ite_cnt->next; + } + if(ite_cnt){ + bal_nd1 = ite_cnt->contigID; + node1 = getTwinCtg(bal_nd1); + node2 = ctg4heapArray[1].ctgID; + temp_cnt = checkConnect(node1,node2); + bal_cnt = ite_cnt; + next_cnt = checkConnect(ctg4heapArray[1].ctgID,ctg4heapArray[2].ctgID); + prev_cnt = checkConnect(getTwinCtg(ctg4heapArray[2].ctgID), getTwinCtg(ctg4heapArray[1].ctgID)); + if(temp_cnt){ + setNextInScaf(temp_cnt,next_cnt); + setPrevInScaf(temp_cnt->nextInScaf,0); + setPrevInScaf(next_cnt,1); + setNextInScaf(prev_cnt,bal_cnt); + } + } + + node1 = ctg4heapArray[nodeCounter].ctgID; + ite_cnt = contig_array[node1].downwardConnect; + while(ite_cnt){ + if(ite_cnt->deleted||ite_cnt->mask){ + ite_cnt = ite_cnt->next; + continue; + } + if(ite_cnt->prevInScaf) + break; + ite_cnt = ite_cnt->next; + } + if(ite_cnt){ + node2 = ite_cnt->contigID; + bal_nd1 = getTwinCtg(node1); + bal_nd2 = getTwinCtg(node2); + temp_cnt = ite_cnt; + bal_cnt = checkConnect(bal_nd2,bal_nd1); + next_cnt = checkConnect(getTwinCtg(ctg4heapArray[nodeCounter].ctgID), + getTwinCtg(ctg4heapArray[nodeCounter-1].ctgID)); + prev_cnt = checkConnect(ctg4heapArray[nodeCounter-1].ctgID,ctg4heapArray[nodeCounter].ctgID); + setNextInScaf(prev_cnt,temp_cnt); + setNextInScaf(bal_cnt,next_cnt); + setPrevInScaf(next_cnt,1); + } +} +//check if contigs next to each other have reasonable overlap +boolean checkOverlapInBetween_general(double tolerance) +{ + int i,gap; + unsigned int node; + int lenSum,lenOlp; + lenSum = lenOlp = 0; + for(i=1;i<=nodeCounter;i++){ + node = ctg4heapArray[i].ctgID; + lenSum += 
contig_array[node].length; + } + if(lenSum<1) + return 1; + for(i=1;i0) + lenOlp += -gap; + //if(-gap>ins_size_var) + + } + double olp_pect=(double)lenOlp/lenSum; + fprintf(stderr,"[%s]existing with olp_pect %.3f.\n",__FUNCTION__,olp_pect); + if(olp_pect>tolerance){ + return 0; + } + return 1; +} + +//check if there's any connect indicates the opposite order between nodes in sub-graph +static boolean checkConflictCnt_general(double tolerance) +{ + int i,j; + int supportCounter=0; + int objectCounter=0; + CONNECT *cnt; + for(i=1;iweight; + cnt = checkConnect(ctg4heapArray[j].ctgID,ctg4heapArray[i].ctgID); + if(cnt) + objectCounter += cnt->weight; + //return 1; + } + } + if(supportCounter<1) + return 1; + if((double)objectCounter/supportCounter0;i--){ + if(contig_array[i].mask) + continue; + out_num = validConnect(i,NULL); + + if(out_num<2) + continue; + + //flag = pickSubGraph(i,strict); + flag = pickUpGeneralSubgraph(i,MaxNodeInSub); + if(!flag) + continue; + subCounter++; + qsort(&ctg4heapArray[1],nodeCounter,sizeof(CTGinHEAP),cmp_ctg); + flag = checkEligible(); + if(!flag){ + eligibleCounter++; + setInGraph(0); + continue; + } + if(strict){ + overlapTolerance = OverlapPercent; + conflTolerance = ConflPercent; + }else{ + overlapTolerance = 2*OverlapPercent; + conflTolerance = 2*ConflPercent; + } + flag = checkOverlapInBetween_general(overlapTolerance); + if(!flag){ + overlapCounter++; + setInGraph(0); + continue; + } + flag = checkConflictCnt_general(conflTolerance); + if(flag){ + conflCounter++; + setInGraph(0); + continue; + } + arrangeNodes_general(); + setInGraph(0); + } + fprintf(stdout,"[%s]Picked %d subgraphs,%d have conflicting connections,%d have significant overlapping, %d eligible\n", + __FUNCTION__,subCounter,conflCounter,overlapCounter,eligibleCounter); + +} + +/**** the fowllowing codes for detecting and break down scaffold at weak point **********/ +// mark connections in scaffolds made by small pe +static void smallScaf() +{ + unsigned int 
i,ctg,bal_ctg,prevCtg; + int counter=0; + CONNECT *bindCnt,*cnt; + + for(i=1;i<=num_ctg;i++) + contig_array[i].flag = 0; + for(i=1;i<=num_ctg;i++){ + if(contig_array[i].flag||contig_array[i].mask||!contig_array[i].downwardConnect) + continue; + bindCnt = getBindCnt(i); + if(!bindCnt) + continue; + counter++; + + contig_array[i].flag = 1; + contig_array[getTwinCtg(i)].flag = 1; + prevCtg = getTwinCtg(i); + while(bindCnt){ + ctg = bindCnt->contigID; + bal_ctg = getTwinCtg(ctg); + bindCnt->bySmall = 1; + cnt = getCntBetween(bal_ctg,prevCtg); + if(cnt) + cnt->bySmall = 1; + + contig_array[ctg].flag = 1; + contig_array[bal_ctg].flag = 1; + prevCtg = bal_ctg; + bindCnt = bindCnt->nextInScaf; + } + + ctg = getTwinCtg(i); + bindCnt = getBindCnt(ctg); + prevCtg = i; + while(bindCnt){ + ctg = bindCnt->contigID; + bal_ctg = getTwinCtg(ctg); + bindCnt->bySmall = 1; + cnt = getCntBetween(bal_ctg,prevCtg); + if(cnt) + cnt->bySmall = 1; + + contig_array[ctg].flag = 1; + contig_array[bal_ctg].flag = 1; + prevCtg = bal_ctg; + bindCnt = bindCnt->nextInScaf; + } + } + //printf("Report from smallScaf: %d scaffolds by smallPE\n",counter); +} + +static boolean putItem2Sarray(unsigned int scaf,int wt,DARRAY *SCAF,DARRAY *WT,int counter) +{ + int i; + unsigned int *scafP,*wtP; + for(i=0;ideleted||ite_cnt->mask||ite_cnt->singleInScaf + ||ite_cnt->nextInScaf||ite_cnt->prevInScaf||ite_cnt->inherit){ + ite_cnt = ite_cnt->next; + continue; + } + targetCtg = ite_cnt->contigID; + if(contig_array[ctg].from_vt==contig_array[targetCtg].from_vt){ // on the same scaff + ite_cnt = ite_cnt->next; + continue; + } + inc = putItem2Sarray(contig_array[targetCtg].from_vt,ite_cnt->weight,SCAF,WT,counter); + if(inc) + counter++; + ite_cnt = ite_cnt->next; + } + } + return counter; + +} + +static int getScaffold(unsigned int start, STACK *scafStack) +{ + int len = contig_array[start].length; + unsigned int *pt,ctg; + + emptyStack(scafStack); + pt = (unsigned int*)stackPush(scafStack); + *pt = start; + CONNECT 
*bindCnt = getBindCnt(start); + while(bindCnt){ + ctg = bindCnt->contigID; + pt = (unsigned int*)stackPush(scafStack); + *pt = ctg; + len += contig_array[ctg].length; + bindCnt = bindCnt->nextInScaf; + } + stackBackup(scafStack); + return len; +} + +static boolean isLinkReliable(DARRAY *WT,int count) +{ + int i; + for(i=0;i=weakPE) + return 1; + + return 0; +} + +static int getWtFromSarray(DARRAY *SCAF,DARRAY *WT,int count,unsigned int scaf) +{ + int i; + for(i=0;i + scaf1 --- --- -- -- --- + scaf2 -- --- --- -- + ----> +*/ +static boolean checkScafConsist(STACK *scafStack1,STACK *scafStack2) +{ + DARRAY *downwardTo1 = (DARRAY *)createDarray(1000,sizeof(unsigned int));// scaf links to those scaffolds + DARRAY *downwardTo2 = (DARRAY *)createDarray(1000,sizeof(unsigned int)); + DARRAY *downwardWt1 = (DARRAY *)createDarray(1000,sizeof(unsigned int));// scaf links to scaffolds with those wt + DARRAY *downwardWt2 = (DARRAY *)createDarray(1000,sizeof(unsigned int)); + + int linkCount1 = getDSLink2Scaf(scafStack1,downwardTo1,downwardWt1); + int linkCount2 = getDSLink2Scaf(scafStack2,downwardTo2,downwardWt2); + if(!linkCount1||!linkCount2){ + freeDarray(downwardTo1); + freeDarray(downwardTo2); + freeDarray(downwardWt1); + freeDarray(downwardWt2); + return 1; + } + boolean flag1 = isLinkReliable(downwardWt1,linkCount1); + boolean flag2 = isLinkReliable(downwardWt2,linkCount2); + if(!flag1||!flag2){ + freeDarray(downwardTo1); + freeDarray(downwardTo2); + freeDarray(downwardWt1); + freeDarray(downwardWt2); + return 1; + } + + unsigned int scaf; + int i,wt1,wt2,ret=1; + + for(i=0;i=0){ + thisCtg = *(unsigned int *)darrayGet(ctgArray,index); + cnt = getCntBetween(thisCtg,nextCtg); + if(cnt->maxGap>2) + break; + else + *start = index; + nextCtg = thisCtg; + index--; + } + unsigned int prevCtg = *(unsigned int *)darrayGet(ctgArray,weakest+1); + *finish = weakest+1; + index = weakest+2; + while(indexmaxGap>2) + break; + else + *finish = index; + prevCtg = thisCtg; + index++; + } + 
+}
+
+/*
+ * Pop every contig ID off scafStack (after stackRecover) and set its to_vt
+ * to `end`; the twin of each contig gets from_vt = getTwinCtg(end).
+ * NOTE(review): stackRecover presumably rewinds the stack to the state saved
+ * by stackBackup -- confirm against the stack implementation.
+ */
+static void changeScafEnd(STACK *scafStack,unsigned int end)
+{
+	unsigned int *slot;
+	unsigned int start=getTwinCtg(end);
+
+	stackRecover(scafStack);
+	for(slot=(unsigned int*)stackPop(scafStack);slot!=NULL;slot=(unsigned int*)stackPop(scafStack)){
+		unsigned int member = *slot;
+		contig_array[member].to_vt = end;
+		contig_array[getTwinCtg(member)].from_vt = start;
+	}
+}
+
+/*
+ * Mirror of changeScafEnd: every contig popped off scafStack gets
+ * from_vt = start, and its twin gets to_vt = getTwinCtg(start).
+ */
+static void changeScafBegin(STACK *scafStack,unsigned int start)
+{
+	unsigned int *slot;
+	unsigned int end=getTwinCtg(start);
+
+	stackRecover(scafStack);
+	for(slot=(unsigned int*)stackPop(scafStack);slot!=NULL;slot=(unsigned int*)stackPop(scafStack)){
+		unsigned int member = *slot;
+		contig_array[member].from_vt = start;
+		contig_array[getTwinCtg(member)].to_vt = end;
+	}
+}
+// break down scaffolds poorly supported by longer PE
+static void detectBreakScaf()
+{
+	fprintf(stderr,"[%s]entering this function.\n",__FUNCTION__);
+	unsigned int i,avgPE,scafLen,len,ctg,bal_ctg,prevCtg,thisCtg;
+	long long peCounter,linkCounter;
+	int num3,num5,weakPoint,tempCounter,j,t,counter=0;
+	CONNECT *bindCnt,*cnt,*weakCnt;
+
+	STACK *scafStack1 = (STACK *)createStack(1000,sizeof(unsigned int));
+	STACK *scafStack2 = (STACK *)createStack(1000,sizeof(unsigned int));
+
+	for(i=1;i<=num_ctg;i++)
+		contig_array[i].flag = 0;
+	for(i=1;i<=num_ctg;i++){
+		if(contig_array[i].flag||contig_array[i].mask||!contig_array[i].downwardConnect)
+			continue;
+		bindCnt = getBindCnt(i);
+		if(!bindCnt)
+			continue;
+		//first scan get the average coverage by longer pe
+		num5 = num3 = peCounter = linkCounter = 0;
+		scafLen = contig_array[i].length;
+		ctg = i;
+		*(unsigned int *)darrayPut(scaf5,num5++) = i;
+		contig_array[i].flag = 1;
+		contig_array[getTwinCtg(i)].flag = 1;
+		while(bindCnt){
+			if(!bindCnt->bySmall)
+				break;
+			linkCounter++;
+			peCounter += bindCnt->maxGap;
+			ctg = bindCnt->contigID;
+			scafLen += contig_array[ctg].length;
+			*(unsigned int *)darrayPut(scaf5,num5++) = ctg;
+			bal_ctg = getTwinCtg(ctg);
+			contig_array[ctg].flag = 1;
+			contig_array[bal_ctg].flag = 1;
+			bindCnt = bindCnt->nextInScaf;
+		}
+
+		ctg = getTwinCtg(i);
+		bindCnt = 
getBindCnt(ctg); + while(bindCnt){ + if(!bindCnt->bySmall) + break; + linkCounter++; + peCounter += bindCnt->maxGap; + ctg = bindCnt->contigID; + scafLen += contig_array[ctg].length; + bal_ctg = getTwinCtg(ctg); + contig_array[ctg].flag = 1; + contig_array[bal_ctg].flag = 1; + *(unsigned int *)darrayPut(scaf3,num3++) = bal_ctg; + bindCnt = bindCnt->nextInScaf; + } + if(linkCounter<1||scafLen<5000) + continue; + + avgPE = peCounter/linkCounter; + + if(avgPE<10) + continue; + + tempCounter = 0; + for(j=num3-1;j>=0;j--) + *(unsigned int *)darrayPut(tempArray,tempCounter++) = + *(unsigned int *)darrayGet(scaf3,j); + + for(j=0;jscafLen-2000) + break; + len += contig_array[thisCtg].length; + if(contig_array[prevCtg].from_vt!=contig_array[thisCtg].from_vt|| + contig_array[prevCtg].indexInScaf>contig_array[thisCtg].indexInScaf){ + prevCtg = thisCtg; + continue; + } + cnt = getCntBetween(prevCtg,thisCtg); + if(!weakCnt||weakCnt->maxGap>cnt->maxGap){ + weakCnt = cnt; + weakPoint = t; + } + prevCtg = thisCtg; + } + if(!weakCnt||(weakCnt->maxGap>2&&weakCnt->maxGap>avgPE/5)) + continue; + prevCtg = *(unsigned int *)darrayGet(tempArray,weakPoint-1); + thisCtg = *(unsigned int *)darrayGet(tempArray,weakPoint); + if(contig_array[prevCtg].from_vt!=contig_array[thisCtg].from_vt|| + contig_array[prevCtg].indexInScaf>contig_array[thisCtg].indexInScaf){ + printf("contig %d and %d not on the same scaff\n",prevCtg,thisCtg); + continue; + } + setConnectWP(prevCtg,thisCtg,1); + /* + fprintf(stderr,"scaffold len %d, avg long pe cov %d (%ld/%ld)\n", + scafLen,avgPE,peCounter,linkCounter); + fprintf(stderr,"Weak connect (%d) between %d(%dth of %d) and %d\n" + ,weakCnt->maxGap,prevCtg,weakPoint-1,tempCounter,thisCtg); + */ + // set start and end to break down the scaffold + int index1,index2; + setBreakPoints(tempArray,tempCounter,weakPoint-1,&index1,&index2); + //fprintf(stderr,"break %d ->...-> %d\n",index1,index2); + unsigned int start = *(unsigned int*)darrayGet(tempArray,index1); + 
unsigned int finish = *(unsigned int*)darrayGet(tempArray,index2); + int len1 = getScaffold(getTwinCtg(start), scafStack1); + int len2 = getScaffold(finish, scafStack2); + if(len1<2000||len2<2000) + continue; + switch2twin(scafStack1); + int flag1 = checkScafConsist(scafStack1,scafStack2); + + switch2twin(scafStack1); + switch2twin(scafStack2); + int flag2 = checkScafConsist(scafStack2,scafStack1); + if(!flag1||!flag2){ + changeScafBegin(scafStack1,getTwinCtg(start)); + changeScafEnd(scafStack2,getTwinCtg(finish)); + //unbind links + unsigned int nextCtg = *(unsigned int *)darrayGet(tempArray,index1+1); + thisCtg = *(unsigned int *)darrayGet(tempArray,index1); + cnt=getCntBetween(getTwinCtg(nextCtg),getTwinCtg(thisCtg)); + if(cnt->nextInScaf){ + prevCtg = getTwinCtg(cnt->nextInScaf->contigID); + cnt->nextInScaf->prevInScaf = 0; + cnt = getCntBetween(prevCtg,thisCtg); + cnt->nextInScaf = NULL; + } + prevCtg = *(unsigned int *)darrayGet(tempArray,index2-1); + thisCtg = *(unsigned int *)darrayGet(tempArray,index2); + cnt = getCntBetween(prevCtg,thisCtg); + if(cnt->nextInScaf){ + nextCtg = cnt->nextInScaf->contigID; + cnt->nextInScaf->prevInScaf= 0; + cnt = getCntBetween(getTwinCtg(nextCtg),getTwinCtg(thisCtg)); + cnt->nextInScaf = NULL; + } + prevCtg = *(unsigned int *)darrayGet(tempArray,index1); + for(t=index1+1;t<=index2;t++){ + thisCtg = *(unsigned int *)darrayGet(tempArray,t); + cnt = getCntBetween(prevCtg,thisCtg); + cnt->mask = 1; + cnt->nextInScaf=NULL; + cnt->prevInScaf = 0; + cnt = getCntBetween(getTwinCtg(thisCtg),getTwinCtg(prevCtg)); + cnt->mask = 1; + cnt->nextInScaf=NULL; + cnt->prevInScaf = 0; + /* + fprintf(stderr,"(%d %d)/(%d %d) ", + prevCtg,thisCtg,getTwinCtg(thisCtg),getTwinCtg(prevCtg)); + */ + prevCtg = thisCtg; + } + //fprintf(stderr,": BREAKING\n"); + counter++; + } + } + + freeStack(scafStack1); + freeStack(scafStack2); + fprintf(stderr,"[%s]existing this function.\n",__FUNCTION__); + //printf("Report from checkScaf: %d scaffold segments 
broken\n",counter); +} + +static boolean checkSimple(DARRAY *ctgArray,int count) +{ + int i; + unsigned int ctg; + for(i=0;iweak||cn_temp1->deleted){ + cn_temp1 = cn_temp1->next; + continue; + } + ctg = cn_temp1->contigID; + if(checkConnect(ctg,i)){ + counter++; + maskContig(i,1); + maskContig(ctg,1); + } + cn_temp1 = cn_temp1->next; + } + + } + //printf("%d circles removed \n",counter); +} diff --git a/fusion/output_scaffold.c b/fusion/output_scaffold.c new file mode 100755 index 0000000..2076101 --- /dev/null +++ b/fusion/output_scaffold.c @@ -0,0 +1,65 @@ +#include "stdinc.h" +#include "newhash.h" +#include "extfunc.h" +#include "extvab.h" + +void output_contig_graph(char *outfile) +{ + char name[256]; + FILE *fp; + unsigned int i; + + sprintf(name,"%s.contig.gvz",outfile); + fp = ckopen(name,"w"); + fprintf(fp,"digraph G{\n"); + fprintf(fp,"\tsize=\"512,512\";\n"); + + for(i=num_ctg;i>0;i--){ + fprintf(fp,"\tV%d -> V%d[label =\"%d(%d)\"];\n",contig_array[i].from_vt,contig_array[i].to_vt,i,contig_array[i].length); + } + fprintf(fp,"}\n"); + fclose(fp); +} +void output_scaf(char *outfile) +{ + char name[256]; + FILE *fp; + unsigned int i; + CONNECT *connect; + boolean flag; + + sprintf(name,"%s.scaffold.gvz",outfile); + fp = ckopen(name,"w"); + fprintf(fp,"digraph G{\n"); + fprintf(fp,"\tsize=\"512,512\";\n"); + + for(i=num_ctg;i>0;i--){ + //if(contig_array[i].mask||!contig_array[i].downwardConnect) + if(!contig_array[i].downwardConnect) + continue; + connect = contig_array[i].downwardConnect; + while(connect){ + //if(connect->mask||connect->deleted){ + if(connect->deleted){ + connect = connect->next; + continue; + } + if(connect->prevInScaf||connect->nextInScaf) + flag = 1; + else + flag = 0; + if(!connect->mask) + fprintf(fp,"\tC%d_%d -> C%d_%d [label = \"%d(%d_%d)\"];\n" + ,i,contig_array[i].length,connect->contigID,contig_array[connect->contigID].length, + connect->gapLen,flag,connect->weight); + else + fprintf(fp,"\tC%d_%d -> C%d_%d [label = \"%d(%d_%d)\", 
color = red];\n" + ,i,contig_array[i].length,connect->contigID,contig_array[connect->contigID].length, + connect->gapLen,flag,connect->weight); + connect = connect->next; + } + } + fprintf(fp,"}\n"); + fclose(fp); +} + diff --git a/fusion/potential.c b/fusion/potential.c new file mode 100644 index 0000000..e6f48cd --- /dev/null +++ b/fusion/potential.c @@ -0,0 +1,232 @@ +#include "stdinc.h" +#include "newhash.h" +#include "extfunc.h" +#include "extvab.h" +#include "dfibHeap.h" +#include "fibHeap.h" +#include "darray.h" + + +//static CTGinHEAP *ctg4heapArray; +extern int inputLinks(FILE *fp, int insertS,char *line); +//unsigned int traverse(unsigned int node,int *far_count,unsigned int *farpath, + //int *curr_count,unsigned int *currpath,int *used_count,unsigned int *used,int *max_dist,int *node_dist); +//static int *sub_arr; +//static int sub_counter=0; +int rev_comp (const void * a, const void * b) +{ + return ( *(int*)b - *(int*)a ); +} +void potential() +{ + + char name[256],*line; + FILE *fp; + int i; + int flag2; + + loadUpdatedEdges(graphfile); + + if(!pes) + loadPEgrads(graphfile); + + sprintf(name,"%s.links",graphfile); + fp = ckopen(name,"r"); + + createCntMemManager(); + createCntLookupTable(); + + lineLen = 1024; + line = (char *)ckalloc(lineLen*sizeof(char)); + + fgets(line,lineLen,fp); + line[0] = '\0'; + fprintf(stderr,"[%s]before inputLinks loop.\n",__FUNCTION__); + for(i=0;i=0){ + int curr_bound=curr_boarder; + int curr_node=curr_path[curr_boarder--]; + int base_dist=dist[curr_bound]; + CONNECT *curr_cnt=contig_array[curr_node].downwardConnect; + while(curr_cnt){//push all adjacent connect + if(curr_cnt->weight<3||contig_array[curr_cnt->contigID].inSubGraph + ||contig_array[getTwinCtg(curr_cnt->contigID)].inSubGraph){ + curr_cnt=curr_cnt->next; + continue; + } + curr_path[++curr_boarder]=curr_cnt->contigID; + contig_array[curr_cnt->contigID].inSubGraph=1; + contig_array[getTwinCtg(curr_cnt->contigID)].inSubGraph=1; + fprintf(stderr,"[%d] traversed 
%d %d .\n",__LINE__,curr_cnt->contigID,getTwinCtg(curr_cnt->contigID)); + ++used; + dist[curr_boarder]=base_dist+ + curr_cnt->gapLen+contig_array[curr_cnt->contigID].length; + + if(dist[curr_boarder]>max_dist) + max_dist=dist[curr_boarder]; + //fprintf(stderr,"curr_boarder %d node_dist %d max_dist %d \n",curr_boarder, + // dist[curr_boarder],max_dist); + curr_cnt=curr_cnt->next; + } + + } + len+=max_dist; + + //} + if(contig_array[getTwinCtg(i)].downwardConnect){ + curr_boarder=0; + curr_path[curr_boarder]=i; + dist[curr_boarder]=0; + + while(curr_boarder>=0){ + int curr_bound=curr_boarder; + int curr_node=curr_path[curr_boarder--]; + int base_dist=dist[curr_bound]; + CONNECT *curr_cnt=contig_array[curr_node].downwardConnect; + while(curr_cnt){//push all adjacent connect + if(curr_cnt->weight<3||contig_array[curr_cnt->contigID].inSubGraph + ||contig_array[getTwinCtg(curr_cnt->contigID)].inSubGraph){ + curr_cnt=curr_cnt->next; + continue; + } + curr_path[++curr_boarder]=curr_cnt->contigID; + contig_array[curr_cnt->contigID].inSubGraph=1; + contig_array[getTwinCtg(curr_cnt->contigID)].inSubGraph=1; + fprintf(stderr,"[%d] traversed %d %d .\n",__LINE__,curr_cnt->contigID,getTwinCtg(curr_cnt->contigID)); + ++used; + dist[curr_boarder]=base_dist+ + curr_cnt->gapLen+contig_array[curr_cnt->contigID].length; + + if(dist[curr_boarder]>max_dist) + max_dist=dist[curr_boarder]; + //fprintf(stderr,"curr_boarder %d node_dist %d max_dist %d \n",curr_boarder, + // dist[curr_boarder],max_dist); + curr_cnt=curr_cnt->next; + } + + } + len+=max_dist; + } + /*int ii; + for(ii=0;ii=half) + break; + } + printf("N50 %d , half %lld.\n",predict[i],half); + printf("used contig %d",used); +} + +/* +unsigned int traverse(unsigned int node,int *far_count,unsigned int *farpath, + int *curr_count,unsigned int *currpath,int *used_count,unsigned int *used,int *max_dist,int *node_dist){ + unsigned int bal = getTwinCtg(node); + + currpath[(*curr_count)++]=node; + used[(*used_count)++]=node; + 
used[(*used_count)++]=bal; + contig_array[node].inSubGraph=1; + contig_array[bal].inSubGraph=1; + + fprintf(stderr,"farcount %d curr_count %d node_dist %d max_dist %d.\n",*far_count,*curr_count,*node_dist,*max_dist); + CONNECT *tmp_cnt=contig_array[node].downwardConnect; + while(tmp_cnt){ + unsigned int ctg,bal_ctg; + ctg=tmp_cnt->contigID; + bal_ctg=getTwinCtg(ctg); + if(contig_array[ctg].inSubGraph||contig_array[bal_ctg].inSubGraph + ||contig_array[ctg].flag||contig_array[bal_ctg].flag){ + tmp_cnt=tmp_cnt->next; + continue; + } + *node_dist+=(tmp_cnt->gapLen+contig_array[ctg].length); + if(*node_dist>*max_dist){ + int i; + for(i=0;i<*curr_count;++i){ + farpath[i]=currpath[i]; + } + *far_count=*curr_count; + *max_dist=*node_dist+tmp_cnt->gapLen; + } + traverse(tmp_cnt->contigID,far_count,farpath,curr_count,currpath,used_count,used,max_dist,node_dist); + *node_dist-=(tmp_cnt->gapLen+contig_array[ctg].length); + tmp_cnt=tmp_cnt->next; + } + --(*curr_count); + + return 0; +} +*/ diff --git a/fusion/prepare.c b/fusion/prepare.c new file mode 100644 index 0000000..6a04b3c --- /dev/null +++ b/fusion/prepare.c @@ -0,0 +1,216 @@ +#include "stdinc.h" +#include "newhash.h" +#include "extfunc.h" +#include "extvab.h" +#include "ctype.h" +boolean upper_rev(char *in,int in_len); +void print_seq(FILE *out_file,char *sequence , int sequence_len); +char rev[]={0,0,0,0,0,0,0,0,0,0,//0 + 0,0,0,0,0,0,0,0,0,0,//10 + 0,0,0,0,0,0,0,0,0,0,//20 + 0,0,0,0,0,0,0,0,0,0,//30 + 0,0,0,0,0,0,0,0,0,0,//40 + 0,0,0,0,0,0,0,0,0,0,//50 + 0,0,0,0,0,'T',0,'G',0,0,//60 + 0,'C',0,0,0,0,0,0,'N',0,//70 + 0,0,0,0,'A',0,0,0,0,0,//80 + 0,0,0,0,0,0,0,0,0,0,//90 + 0,0,0,0,0,0,0,0,0,0,};//100 +typedef struct io_ctg{ + char *seq; + int len; + int bal; + char *name; +}IO_CTG; + +static int cmp_ctg(const void *a,const void *b){ + IO_CTG *A=(IO_CTG *)a; + IO_CTG *B=(IO_CTG *)b; + return A->len-B->len; +} + +int data_prepare(){ + char file_name[256]; + + FILE *basic; + sprintf(file_name,"%s.preGraphBasic",graphfile); 
+ basic=ckopen(file_name,"w"); + fprintf(basic,"VERTEX 605681 K %d",overlaplen); + fprintf(basic,"\nEDGEs 1861091\n\nMaxReadLen 100 MinReadLen 0 MaxNameLen 256\n"); + fclose(basic); + + //char **ctg_seq=(char **)ckalloc(100000000*sizeof(char *)); + //int *ctg_bal=(int *)ckalloc(100000000*sizeof(int)); + //int *ctg_len=(int *)ckalloc(100000000*sizeof(int)); + + FILE *ctg_fp; + ctg_fp=ckopen(ctg_file,"r"); + FILE *update,*index,*new_ctg; + sprintf(file_name, "%s.contig", graphfile); + new_ctg=ckopen(file_name,"w"); + FILE *conver; + sprintf(file_name,"%s.conver", graphfile); + conver=ckopen(file_name,"w"); + + + char *line; + line= (char *)ckalloc(100000000*sizeof(char )); + char orig_name[1024]; + char *seq; + IO_CTG *pre_ctg=(IO_CTG *)ckalloc(1000000000*sizeof(IO_CTG)); + + seq=(char *)malloc(1000000000*sizeof(char)); + int cul_id=1; + int total=0; + fgets(line,100000000*sizeof(char ),ctg_fp); + sscanf(line,">%s",orig_name); + int len=0; + //fprintf(stderr,"reach here %d\n",__LINE__); + while(fgets(line,100000000*sizeof(char ),ctg_fp)!=NULL){ + if(line[0]=='>'){ + if(len%s",orig_name); + seq[0]='\0'; + len=0; + continue; + } + + boolean flag=upper_rev(seq,len); + //fprintf(new_ctg,">%d length %d\n",cul_id,len); + //fprintf(conver,"%s\t%d\t%d\n",orig_name,cul_id,len); + //print_seq(new_ctg,seq,len); + //fprintf(new_ctg,"%s\n",seq); + char *one_seq=(char *)ckalloc((len+100)*sizeof(char)); + strcpy(one_seq,seq); + + if(flag==0){ + pre_ctg[++total].seq=one_seq; + pre_ctg[total].bal=2; + pre_ctg[total].len=len; + pre_ctg[total].name=(char *)malloc((strlen(orig_name)+1)*sizeof(char)); + strcpy(pre_ctg[total].name,orig_name); + //pre_ctg[++cul_id].bal=0; + cul_id+=2; + }else{ + pre_ctg[++total].seq=one_seq; + pre_ctg[total].len=len; + pre_ctg[total].bal=1; + pre_ctg[total].name=(char *)malloc((strlen(orig_name)+1)*sizeof(char)); + strcpy(pre_ctg[total].name,orig_name); + ++cul_id; + } + + sscanf(line,">%s",orig_name); + seq[0]='\0'; + len=0; + }else{ + 
//strcat(seq,line);//effective?
+			// Append this sequence line to seq without its line terminator.
+			// fgets keeps '\n' only when one was actually read, so strip just the
+			// trailing '\n'/'\r' bytes: the last line of a file without a final
+			// newline keeps its last base, CRLF input leaves no '\r' in the
+			// sequence, and an empty string never indexes line[-1].
+			int single_len=strlen(line);
+			while(single_len>0&&(line[single_len-1]=='\n'||line[single_len-1]=='\r'))
+				line[--single_len]='\0';
+			strcpy(&seq[len],line);
+			len+=single_len;
+		}
+
+	}
+	// Store the contig still buffered in seq when fgets reached end of file;
+	// it is kept only when longer than overlaplen.
+	if(len>overlaplen){
+		boolean flag=upper_rev(seq,len);
+		//fprintf(new_ctg,">%d length %d\n",cul_id,len);
+		//fprintf(conver,"%s\t%d\t%d\n",orig_name,cul_id,len);
+		//print_seq(new_ctg,seq,len);
+		//fprintf(new_ctg,"%s\n",seq);
+		char *one_seq=(char *)ckalloc((len+100)*sizeof(char));
+		strcpy(one_seq,seq);
+		if(flag==0){
+			// flag==0: recorded with bal=2 and two ids are reserved (cul_id+=2)
+			pre_ctg[++total].seq=one_seq;
+			pre_ctg[total].bal=2;
+			pre_ctg[total].len=len;
+			// +1 for the terminating '\0' that strcpy writes
+			pre_ctg[total].name=(char *)malloc((strlen(orig_name)+1)*sizeof(char));
+			strcpy(pre_ctg[total].name,orig_name);
+			//pre_ctg[++total].bal=0;
+			cul_id+=2;
+		}else{
+			// flag!=0: recorded with bal=1 and a single id is reserved
+			pre_ctg[++total].seq=one_seq;
+			pre_ctg[total].len=len;
+			pre_ctg[total].bal=1;
+			// +1 for the terminating '\0' that strcpy writes
+			pre_ctg[total].name=(char *)malloc((strlen(orig_name)+1)*sizeof(char));
+			strcpy(pre_ctg[total].name,orig_name);
+			++cul_id;
+		}
+
+	}
+	fprintf(stderr,"All contigs loaded.\n");
+	sprintf(file_name, "%s.updated.edge", graphfile);
+	update=ckopen(file_name,"w");
+	sprintf(file_name, "%s.ContigIndex", graphfile);
+	index=ckopen(file_name,"w");
+	fprintf(update,"EDGEs %d\n",cul_id);
+	fprintf(index,"Edge_num %d %d\nindex\tlength\treverseComplement\n",cul_id,total);
+	// entries are 1-based; cmp_ctg orders them by ascending len
+	qsort(&pre_ctg[1],total,sizeof(IO_CTG),cmp_ctg);
+
+	int i=1;
+	cul_id=0;
+	for(;i<=total;++i){
+		if(pre_ctg[i].bal==2){
+			len=pre_ctg[i].len;
+			fprintf(new_ctg,">%d length %d\n",++cul_id,len);
+			print_seq(new_ctg,pre_ctg[i].seq,len);
+			fprintf(conver,"%s\t%d\t%d\n",pre_ctg[i].name,cul_id,len);
+//			if(overlaplen<=31){
+//				fprintf(update,">length %d,fffffffffff,fffffffffff,1,8\n",len);
+//				fprintf(update,">length %d,fffffffffff,fffffffffff,-1,8\n",len);
+//			}else{
+				fprintf(update,">length %d,1,8\n",len);
+				fprintf(update,">length %d,-1,8\n",len);
+//			}
+			// post-increment skips the second id taken by this contig
+			fprintf(index,"%d\t%d\t1\n",cul_id++,len);
+
+		}else{
+			// load len before printing the header; otherwise the header
+			// reports the length left over from the previous iteration
+			len=pre_ctg[i].len;
+			fprintf(new_ctg,">%d length %d\n",++cul_id,len);
+			print_seq(new_ctg,pre_ctg[i].seq,len);
+
fprintf(conver,"%s\t%d\t%d\n",pre_ctg[i].name,cul_id,len); + if(overlaplen<=31){ + fprintf(update,">length %d,fffffffffff,fffffffffff,0,8\n",len); + }else{ + fprintf(update,">length %d,0,8\n",len); + } + fprintf(index,"%d\t%d\t0\n",cul_id,len); + } + } + + sprintf(file_name,"touch %s.Arc",graphfile); + system(file_name); + return 0; +} + +//return value:0: in not equal its' rev_comp +//1: in equal its' rev_comp +boolean upper_rev(char *in,int in_len){ + int i,it_num; + + boolean ret_flag=1; + it_num=in_len/2; + + for(i=0;ithreadID; + //printf("%dth thread with threadID %d, hash_table %p\n",id,prm.threadID,prm.hash_table); + while(1){ + if(*(prm->selfSignal)==1){ + unsigned int seq_index=0; + unsigned int pos = 0; + for(i=0;iselfSignal) = 0; + }else if(*(prm->selfSignal)==2){ + for(i=0;iselfSignal) = 0; + } + else if(*(prm->selfSignal)==3){ + *(prm->selfSignal) = 0; + break; + } + usleep(1); + } +} + +static void singleKmer(int t,KmerSet *kset, + unsigned int seq_index,unsigned int pos) +{ + boolean flag; + kmer_t *node; + + flag = put_kmerset(kset, kmerBuffer[t], 4, 4,&node); + //printf("singleKmer: kmer %llx\n",kmerBuffer[t]); + if(!flag){ + if(smallerBuffer[t]) + node->twin = 0; + else + node->twin = 1;; + node->l_links = ctgIdArray[seq_index]; + node->r_links = pos; + }else + node->deleted = 1; +} + +static void creatThrds(pthread_t *threads,PARAMETER *paras) +{ + unsigned char i; + int temp; + + for(i=0;i='0'&&name[0]<='9') + return atoi(&(name[0])); + else + return 0; +} + +boolean prlContig2nodes(char *grapfile,int len_cut) +{ + long long i,num_seq; + char name[256],*next_name; + FILE *fp; + pthread_t threads[thrd_num]; + time_t start_t,stop_t; + unsigned char thrdSignal[thrd_num+1]; + PARAMETER paras[thrd_num]; + int maxCtgLen,minCtgLen,nameLen; + unsigned int lenSum,contigId; + + WORDFILTER = (((Kmer) 1) << (2*overlaplen)) - 1; + time(&start_t); + sprintf(name,"%s.contig",grapfile); + fp = ckopen(name, "r"); + maxCtgLen = nameLen = 10; + minCtgLen = 1000; + 
num_seq = readseqpar(&maxCtgLen,&minCtgLen,&nameLen,fp); + //printf("\nthere're %lld contigs in file: %s, max seq len %d, min seq len %d, max name len %d\n", + //num_seq,grapfile,maxCtgLen,minCtgLen,nameLen); + maxReadLen = maxCtgLen; + fclose(fp); + time(&stop_t); + //printf("time spent on parse contigs file %ds\n",(int)(stop_t-start_t)); + + next_name = (char *)ckalloc((maxNameLen+1)*sizeof(char)); + + // extract all the EDONs + seq_buffer_size=buffer_size*2; + max_read_c=seq_buffer_size/20; + + kmerBuffer = (Kmer *)ckalloc(buffer_size*sizeof(Kmer)); + hashBanBuffer = (Kmer *)ckalloc(buffer_size*sizeof(Kmer)); + smallerBuffer = (boolean *)ckalloc(buffer_size*sizeof(boolean)); + + seqBuffer = (char *)ckalloc(seq_buffer_size*sizeof(char)); + lenBuffer = (int *)ckalloc(max_read_c*sizeof(int)); + indexArray = (unsigned int *)ckalloc((max_read_c+1)*sizeof(unsigned int)); + seqBreakers = (unsigned int *)ckalloc((max_read_c+1)*sizeof(unsigned int)); + ctgIdArray = (int *)ckalloc(max_read_c*sizeof(int)); + + fp = ckopen(name, "r"); + //node_mem_manager = createMem_manager(EDONBLOCKSIZE,sizeof(EDON)); + rcSeq = (char **)ckalloc((thrd_num+1)*sizeof(char*)); + if(1){ + kmerCounter = (long long *)ckalloc((thrd_num+1)*sizeof(long long)); + KmerSets = (KmerSet **)ckalloc(thrd_num*sizeof(KmerSet *)); + for(i=0;i0 ? 
contigId:i; + lenSum += lenBuffer[read_c]; + kmer_c += lenBuffer[read_c] - overlaplen + 1; + read_c++; + seqBreakers[read_c] = lenSum; + indexArray[read_c] = kmer_c; + //printf("seq %d start at %d\n",read_c,seqBreakers[read_c]); + if(read_c==max_read_c||(lenSum+maxCtgLen)>seq_buffer_size||(kmer_c+maxCtgLen-overlaplen+1)>buffer_size){ + kmerCounter[0] += kmer_c; + sendWorkSignal(2,thrdSignal); + sendWorkSignal(1,thrdSignal); + + kmer_c = read_c = lenSum = 0; + } + + } + if(read_c){ + kmerCounter[0] += kmer_c; + sendWorkSignal(2,thrdSignal); + sendWorkSignal(1,thrdSignal); + } + + sendWorkSignal(3,thrdSignal); + + thread_wait(threads); + time(&stop_t); + //printf("time spent on hash reads: %ds\n",(int)(stop_t-start_t)); + if(1){ + unsigned long long alloCounter = 0; + unsigned long long allKmerCounter = 0; + for(i=0;iA G C T + {2, 7, 3, 1}, // C->C T A G + {1, 3, 7, 2}, // T->T C G A + {3, 1, 2, 7} // G->G A T C +}; + +static ubyte2 doubleBitMasker[7] = { + 0x3, //000000 00000011 + 0xC, //000000 00001100 + 0x30, //000000 00110000 + 0xC0, //000000 11000000 + 0x300, //000011 00000000 + 0xC00, //001100 00000000 + 0x3000 //110000 00000000 +}; + +static boolean staticFlag=1; + +static long long readsInGap=0; + +static int buffer_size=10000000; +static long long readCounter; +static long long mapCounter; +long long single_count; +long long single_map; +static int ALIGNLEN=0; +//buffer related varibles for chop kmer +static int read_c; +static char **rcSeq; +static char **seqBuffer; +static int *lenBuffer; +static unsigned int *ctgIdArray; +static int *posArray; +static char *orienArray; +static char *footprint; // flag indicates whether the read shoulld leave markers on contigs + +// kmer related variables +static int kmer_c; +static Kmer *kmerBuffer,*hashBanBuffer; +static kmer_t **nodeBuffer; +static boolean *smallerBuffer; +static unsigned int *indexArray; + +static int *deletion; + +static void parse1read(int t,int threadID); +static void threadRoutine(void *thrdID); 
+static void searchKmer(int t,KmerSet *kset); +static void chopKmer4read(int t,int threadID); +static void thread_wait(pthread_t *threads); + +static void creatThrds(pthread_t *threads,PARAMETER *paras) +{ + unsigned char i; + int temp; + + for(i=0;ithreadID; + //printf("%dth thread with task %d, hash_table %p\n",id,prm.task,prm.hash_table); + while(1){ + if(*(prm->selfSignal)==1){ + for(i=0;iselfSignal) = 0; + }else if(*(prm->selfSignal)==2){ + for(i=0;iselfSignal) = 0; + }else if(*(prm->selfSignal)==3){ + // parse reads + for(t=0;tselfSignal) = 0; + }else if(*(prm->selfSignal)==5){ + *(prm->selfSignal) = 0; + break; + } + + usleep(1); + } +} +/* +static void chopReads() +{ + int i; + for(i=0;isearchCnt; + + if(found) + { + ++kset->foundCnt; + if(!node->deleted) + nodeBuffer[t] = node; + else + { + ++kset->delCnt; + nodeBuffer[t] = NULL; + } + } + else + { + ++kset->searchSpcSeedCnt; + + boolean spcFlag; + Kmer buff_kmer, spc_kmer; + ubyte2 spc_bases; + spcKmer *rs; + spcBase *tmpBase; + + buff_kmer=kmerBuffer[t]; + spc_kmer = ((buff_kmer>>14)&0xFFFFFFF00) | ((buff_kmer>>12)&0xC0) | ((buff_kmer>>10)&0x3C) | ((buff_kmer>>6)&0x3); + spc_bases = ((buff_kmer>>8)&0x3000) | ((buff_kmer>>6)&0xC00) | ((buff_kmer>>2)&0x3C0) | (buff_kmer&0x3F); + + spcFlag = search_spckmerset(spcSet, spc_kmer, &rs); + + if(spcFlag) + { + ++kset->getSpcSeedCnt; + + int i=0,j=0,getFlag=-1; + int mismatch=0; + ubyte2 tmp,mostLastBase; //loci flags + ubyte2 bestSpcBases; //best spaced bases + int min_mis=31; + ubyte2 tmpSpcBase; + + tmpBase=rs->start; + + //fprintf(stderr,"search %llu\tspc_kmer %u\tspc_bases %u\n", kmerBuffer[t], spc_kmer, spc_bases); + + while(tmpBase != NULL) + { + tmpSpcBase = tmpBase->spaced_bases; + tmp = ((spc_bases ^ tmpSpcBase) & 0x5555) | (((spc_bases ^ tmpSpcBase) & 0xAAAA)>>1); + mismatch=binLight[tmp]; + + if(mismatch < min_mis) //get the minimal mismatch spaced_bases + { + min_mis = mismatch; + mostLastBase = tmp; + bestSpcBases = tmpSpcBase; + node = 
tmpBase->large_kmer; + getFlag=0; + } + else if(mismatch == min_mis) //if same amount of mismatch, choose the most right mismatch pos + { + if(tmplarge_kmer; + getFlag=1; + } + else if(tmp == mostLastBase) //if same mismatch pos, choose the most probable one[see probableMatrix] + { +/* +static ubyte probableMatrix[4][4] = { +//A C T G 7 3 2 1 +7, 2, 1, 3, // A->A G C T +2, 7, 3, 1, // C->C T A G +1, 3, 7, 2, // T->T C G A +3, 1, 2, 7 // G->G A T C +}; +*/ + getFlag=2; + ubyte2 readBases = spc_bases, loopBases = tmpSpcBase, bestBases = bestSpcBases, mismatchFlag = tmp; + for(j=0;j<7;j++) + { + if((mismatchFlag & 0x3) > 0) + { + if(probableMatrix[(readBases & 0x3)][(loopBases & 0x3)] > probableMatrix[(readBases & 0x3)][(bestBases & 0x3)]) + //check each 2 bits(1 base) if mismatch + { + mostLastBase = tmp; + bestSpcBases = tmpSpcBase; + node = tmpBase->large_kmer; + break; + } + else if((probableMatrix[(readBases & 0x3)][(loopBases & 0x3)] < probableMatrix[(readBases & 0x3)][(bestBases & 0x3)])) + break; + } + mismatchFlag>>=2; + readBases>>=2; + loopBases>>=2; + bestBases>>=2; + } + } + } + + tmpBase = tmpBase->next; + } + + if(getFlag<0) + { + fprintf(stderr,"getFlag error at %llu",kmerBuffer[t]); + exit(-1); + } + ++kset->levelGet[getFlag]; + nodeBuffer[t] = node; + } + else + nodeBuffer[t] = NULL; + } +} + +static void parse1read(int t,int threadID) +{ + unsigned int j,i,s; + unsigned int contigID; + int counter2=0,counter; + unsigned int ctgLen,pos; + kmer_t *node; + boolean isSmaller; + int flag,maxOcc=0; + kmer_t *maxNode=NULL; + int alldgnLen = lenBuffer[t] > ALIGNLEN ? ALIGNLEN:lenBuffer[t]; + int multi = alldgnLen-overlaplen+1 < 5 ? 
5:alldgnLen-overlaplen+1; + unsigned int start,finish; + + footprint[t] = 0; + + start = indexArray[t]; + finish = indexArray[t+1]; + if(finish==start){ //too short + ctgIdArray[t] = 0; + deletion[threadID]++; + return; + } + for(j=start;jl_links==node->l_links){ + flag++; + nodeBuffer[s] = NULL; + } + } + if(flag>=2) + counter2++; //a loose alignment + if(flag>=multi) + counter++; + else + continue; + if(flag>maxOcc){ + pos = j; + maxOcc = flag; + maxNode = node; + } + } + if(!counter){ //no match + ctgIdArray[t] = 0; + return; + } + if(counter2>1) + footprint[t] = 1; //aligned to multi contigs + + j = pos; + i = pos - start + 1; + node = nodeBuffer[j]; + isSmaller = smallerBuffer[j]; + contigID = node->l_links; + ctgLen = contig_array[contigID].length; + pos = node->r_links; + if(node->twin==isSmaller){ + orienArray[t] = '-'; + ctgIdArray[t] = getTwinCtg(contigID); + posArray[t] = ctgLen - pos -overlaplen -i + 1; + }else{ + orienArray[t] = '+'; + ctgIdArray[t] = contigID; + posArray[t] = pos - i + 1; + } + +} + +static void sendWorkSignal(unsigned char SIG,unsigned char *thrdSignals) +{ + int t; + + for(t=0;tl_links; + ctgLen = contig_array[contigID].length; + pos = node->r_links; + if(node->twin==isSmaller){ + ctgIdArray[t] = getTwinCtg(contigID); + posArray[t] = ctgLen - pos -overlaplen -i + 1; + }else{ + ctgIdArray[t] = contigID; + posArray[t] = pos - i + 1; + } + } + +} + +static void output1read(int t, FILE *outfp) +{ + int len = lenBuffer[t]; + int index; + readsInGap++; +/* + if(ctgIdArray[t]==735||ctgIdArray[t]==getTwinCtg(735)){ + printf("%d\t%d\t%d\t",t+1,ctgIdArray[t],posArray[t]); + int j; + for(j=0;j R2 <-- R1 + output1read(read1,outfp); + }else{ + read2 = t; + read1 = t - 1; + ctgIdArray[read2] = ctgIdArray[read1]; + posArray[read2] = posArray[read1] + insSize - lenBuffer[read2]; // --> R1 <-- R2 + output1read(read2,outfp); + } +} + +static void recordLongRead(FILE *outfp) +{ + int t; + + for(t=0;t0){ + getReadIngap(t,insSize,outfp2,0); //read 2 in 
gap + rd2gap = 1; + } + else if(ctgIdArray[t]>0&&ctgIdArray[t-1]<1){ + getReadIngap(t-1,insSize,outfp2,1); //read 1 in gap + rd1gap = 1; + } + } + if(ctgId<1) + continue; + mapCounter++; + single_map++; + fprintf(outfp,"%lld\t%u\t%d\t%c\n",readCounter, + ctgIdArray[t],posArray[t],orienArray[t]); + if(t%2==0) + continue; + if(outfp2&&footprint[t-1]&&!rd1gap) + output1read(t-1,outfp2); + if(outfp2&&footprint[t]&&!rd2gap) + output1read(t,outfp2); + + } +} + +//load contig index and length +void basicContigInfo(char *infile) +{ + char name[256],lldne[1024]; + FILE *fp; + int length,bal_ed,num_all,num_long,index; + + sprintf(name,"%s.ContigIndex",infile); + fp = ckopen(name,"r"); + + fgets(lldne,sizeof(lldne),fp); + sscanf(lldne+8,"%d %d",&num_all,&num_long); + //printf("%d edges in graph\n",num_all); + num_ctg = num_all; + contig_array = (CONTIG *)ckalloc((num_all+1)*sizeof(CONTIG)); + + fgets(lldne,sizeof(lldne),fp); + num_long = 0; + while(fgets(lldne,sizeof(lldne),fp)!=NULL){ + sscanf(lldne,"%d %d %d",&index,&length,&bal_ed); + + contig_array[++num_long].length = length; + contig_array[num_long].bal_edge = bal_ed+1; + if(index!=num_long) + printf("basicContigInfo: %d vs %d\n",index,num_long); + if(bal_ed==0) + continue; + contig_array[++num_long].length = length; + contig_array[num_long].bal_edge = -bal_ed+1; + + } + + fclose(fp); +} + +void prlRead2Ctg(char *libfile,char *outfile) +{ + long long i; + char *src_name,*next_name,name[256]; + FILE *fo,*outfp2=NULL; + int maxReadNum,libNo,prevLibNo,insSize; + boolean flag,pairs=1; + pthread_t threads[thrd_num]; + unsigned char thrdSignal[thrd_num+1]; + PARAMETER paras[thrd_num]; + + maxReadLen = 0; + maxNameLen = 256; + scan_libInfo(libfile); + alloc_pe_mem(num_libs); + if(!maxReadLen) + maxReadLen = 100; + //printf("In file: %s, max seq len %d, max name len %d\n\n", + //libfile,maxReadLen,maxNameLen); + if(maxReadLen>maxReadLen4all) + maxReadLen4all = maxReadLen; + +//////////////////////////////////////////// spcSet + 
fflush(stdout); + + ubyte2 spc_i,spc_j; + for(spc_i=0;spc_i<16384;spc_i++) + { + binLight[spc_i]=0; + for(spc_j=spc_i;spc_j;spc_j=spc_j&(spc_j-1)) + ++binLight[spc_i]; + } + + spcSet = init_spckmerset(KmerSets[thrd_num-1]->size*thrd_num, 0.77f); + + + for(i=0;i1000) + ALIGNLEN = ALIGNLEN < 35 ? 35: ALIGNLEN; + else + ALIGNLEN = ALIGNLEN < 32 ? 32: ALIGNLEN; + //printf("current insert size %d, map_len %d\n",insSize,ALIGNLEN); + + } + + if(insSize>1000) + ALIGNLEN = ALIGNLEN < (lenBuffer[read_c]/2+1) ? (lenBuffer[read_c]/2+1):ALIGNLEN; + +// if((++i)%100000000==0) +// printf("[%s]%lld reads processed.\n",__FUNCTION__,i); + indexArray[read_c] = kmer_c; + if(lenBuffer[read_c] >= overlaplen+1) + kmer_c += lenBuffer[read_c] - overlaplen + 1; + read_c++; + if(read_c==maxReadNum){ + //mvnv(0,"Start processing reads."); + + indexArray[read_c] = kmer_c; + + sendWorkSignal(2,thrdSignal); + //mvnv(0,"chop finished one buffer."); + sendWorkSignal(1,thrdSignal); + //mvnv(0,"search finished one buffer."); + sendWorkSignal(3,thrdSignal); + //mvnv(0,"parse finished one buffer."); + + recordAlldgn(fo,insSize,outfp2); + kmer_c = 0; + read_c = 0; + } + } + + if(read_c){ + indexArray[read_c] = kmer_c; + sendWorkSignal(2,thrdSignal); + sendWorkSignal(1,thrdSignal); + sendWorkSignal(3,thrdSignal); + recordAlldgn(fo,insSize,outfp2); + //printf("Output %lld out of %lld (%.1f)%% reads in gaps\n",readsInGap,readCounter, + // (float)readsInGap/readCounter*100); + } + if(readCounter) + printf("[%s]total %llu reads , map-rate (%.1f)%%\n",__FUNCTION__, + readCounter,(float)mapCounter/readCounter*100); + sendWorkSignal(5,thrdSignal); + + thread_wait(threads); + fclose(fo); + + sprintf(name,"%s.peGrads",outfile); + fo = ckopen(name,"w"); + fprintf(fo,"grads&num: %d\t%lld\t%d\n",gradsCounter,n_solexa,maxReadLen4all); + if(pairs){ + if(gradsCounter) + ; + //printf("%d pe insert size, the largest boundary is %lld\n\n", + //gradsCounter,pes[gradsCounter-1].PE_bound); + else + printf("no paired reads 
found\n"); + for(i=0;isearchCnt; + foundCntTot += KmerSets[i]->foundCnt; + delCntTot += KmerSets[i]->delCnt; + searchSpcSeedCntTot += KmerSets[i]->searchSpcSeedCnt; + getSpcSeedCntTot += KmerSets[i]->getSpcSeedCnt; + levelGet1 += KmerSets[i]->levelGet[0]; + levelGet2 += KmerSets[i]->levelGet[1]; + levelGet3 += KmerSets[i]->levelGet[2]; + } + fprintf(stderr,"SEARCH: Search %llu, get %llu, deleted %llu\n", + searchCntTot, foundCntTot, delCntTot); + fprintf(stderr,"SPACED SEED: Search %llu, get %llu, LVnum %llu, LVpos %llu, LVpro %llu\n", + searchSpcSeedCntTot, getSpcSeedCntTot, levelGet1, levelGet2, levelGet3); + + free((void *)rcSeq); + free((void *)deletion); + for(i=0;i= overlaplen+1) + kmer_c += lenBuffer[read_c] - overlaplen + 1; + read_c++; + if(read_c==maxReadNum){ + indexArray[read_c] = kmer_c; + + sendWorkSignal(2,thrdSignal); + sendWorkSignal(1,thrdSignal); + sendWorkSignal(3,thrdSignal); + + recordLongRead(outfp2); + kmer_c = 0; + read_c = 0; + } + } + + if(read_c){ + indexArray[read_c] = kmer_c; + sendWorkSignal(2,thrdSignal); + sendWorkSignal(1,thrdSignal); + sendWorkSignal(3,thrdSignal); + recordLongRead(outfp2); + //printf("Output %lld out of %lld (%.1f)%% reads in gaps\n",readsInGap,readCounter, + // (float)readsInGap/readCounter*100); + } + + sendWorkSignal(5,thrdSignal); + + thread_wait(threads); + + fclose(outfp2); + + free_libs(); + if(1){ // multi-threads + for(i=0;iread\n"); + for(j=0;jlen = len; + rd->dis = pos; + rd->seqStarter = starter; +} + +static void convertIndex() +{ + int *length_array = (int *)ckalloc((num_ctg+1)*sizeof(int)); + unsigned int i; + for(i=1;i<=num_ctg;i++) + length_array[i] = 0; + + for(i=1;i<=num_ctg;i++){ + if(index_array[i]>0) + length_array[index_array[i]] = i; + } + for(i=1;i<=num_ctg;i++) + index_array[i] = length_array[i]; //contig i with new index: index_array[i] + free((void *)length_array); + +} + +static long long getRead1by1(FILE *fp,DARRAY *readSeqInGap) +{ + long long readCounter=0; + if(!fp) + return 
readCounter; + int len,ctgID,pos; + long long starter; + char *pt; + char *freadBuf = (char *)ckalloc((maxReadLen/4+1)*sizeof(char)); + + while(fread(&len,sizeof(int),1,fp)==1){ + if(fread(&ctgID,sizeof(int),1,fp)!=1) + break; + if(fread(&pos,sizeof(int),1,fp)!=1) + break; + if(fread(freadBuf,sizeof(char),len/4+1,fp)!=(unsigned)(len/4+1)) + break; + //put seq to dynamic array + starter = readSeqInGap->item_c; + if(!darrayPut(readSeqInGap,starter+len/4)) // make sure there's room for this seq + break; + pt = (char *)darrayPut(readSeqInGap,starter); + bcopy(freadBuf,pt,len/4+1); + attach1read2contig(ctgID,len,pos,starter); + readCounter++; + } + + free((void *)freadBuf); + return readCounter; +} +// Darray *readSeqInGap +static boolean loadReads4gap(char *graphfile) +{ + FILE *fp,*fp2; + char name[1024]; + long long readCounter; + + sprintf(name,"%s.readInGap",graphfile); + fp = fopen(name,"rb"); + sprintf(name,"%s.longReadInGap",graphfile); + fp2 = fopen(name,"rb"); + if(!fp&&!fp2) + return 0; + + if(!orig2new){ + convertIndex(); + orig2new = 1; + } + + readSeqInGap = (DARRAY *)createDarray(1000000,sizeof(char)); + if(fp){ + readCounter = getRead1by1(fp,readSeqInGap); + //printf("Loaded %lld reads from %s.readInGap\n",readCounter,graphfile); + fclose(fp); + } + if(fp2){ + readCounter = getRead1by1(fp2,readSeqInGap); + //printf("Loaded %lld reads from %s.LongReadInGap\n",readCounter,graphfile); + fclose(fp2); + } + return 1; +} + +static void debugging1() +{ + unsigned int i; + if(orig2new){ + unsigned int *length_array = (unsigned int *)ckalloc((num_ctg+1)*sizeof(unsigned int)); + //use length_array to change info in index_array + for(i=1;i<=num_ctg;i++) + length_array[i] = 0; + + for(i=1;i<=num_ctg;i++){ + if(index_array[i]>0) + length_array[index_array[i]] = i; + } + for(i=1;i<=num_ctg;i++) + index_array[i] = length_array[i]; //contig i with original index: index_array[i] + orig2new = 0; + } + READNEARBY *rd; + int j; + char *pt; + for(i=1;i<=num_ctg;i++){ + 
if(!contig_array[i].closeReads) + continue; + if(index_array[i]!=735) + continue; + //printf("contig %d, len %d: \n",index_array[i],contig_array[i].length); + stackBackup(contig_array[i].closeReads); + while((rd=(READNEARBY *)stackPop(contig_array[i].closeReads))!=NULL){ + printf("%d\t%d\t%lld\t",rd->dis,rd->len,rd->seqStarter); + pt = (char *)darrayGet(readSeqInGap,rd->seqStarter); + for(j=0;jlen;j++) + printf("%c",int2base((int)getCharInTightString(pt,j))); + printf("\n"); + } + stackRecover(contig_array[i].closeReads); + } + +} + +static void initiateCtgInScaf(CTGinSCAF *actg) +{ + actg->cutTail = 0; + actg->cutHead = overlaplen; + actg->gapSeqLen = 0; +} + +static int procGap(char *line,STACK *ctgsStack) +{ + char *tp; + int length,i,seg; + unsigned int ctg; + CTGinSCAF *ctgPt; + + tp = strtok(line, " "); + tp = strtok(NULL," "); //length + length = atoi(tp); + tp = strtok(NULL," "); //seg + seg = atoi(tp); + if(!seg) + return length; + for(i=0;ictgID = ctg; + ctgPt->start = 0; + ctgPt->end = 0; + ctgPt->scaftig_start = 0; + ctgPt->mask = 1; + } + return length; +} + +static void debugging2(int index,STACK *ctgsStack) +{ + CTGinSCAF *actg; + + stackBackup(ctgsStack); + printf(">scaffold%d\t%d 0.0\n",index,ctgsStack->item_c); + while((actg=stackPop(ctgsStack))!=NULL){ + printf("%d\t%d\t%d\t%d\n", + actg->ctgID,actg->start,actg->end,actg->scaftig_start); + } + stackRecover(ctgsStack); +} + +static int cmp_reads(const void *a,const void *b) +{ + READNEARBY *A,*B; + A = (READNEARBY *)a; + B = (READNEARBY *)b; + + if(A->dis>B->dis) + return 1; + else if(A->dis==B->dis) + return 0; + else + return -1; +} + +static void cutRdArray(READNEARBY *rdArray,int gapStart,int gapEnd,int *count,int arrayLen,READNEARBY *cutArray) +{ + int i; + int num = 0; + + for(i=0;igapEnd) + break; + if((rdArray[i].dis+rdArray[i].len)>=gapStart){ + cutArray[num].dis = rdArray[i].dis; + cutArray[num].len = rdArray[i].len; + cutArray[num++].seqStarter = rdArray[i].seqStarter; + } + } + *count 
= num; +} + +static void outputTightStr(FILE *fp,char *tightStr,int start,int length, int outputlen,int revS,int *col) +{ + int i; + int end; + int column = *col; + + if(!revS){ + end = start+outputlen <= length ? start+outputlen:length; + for(i=start;i=0 ? length-start-outputlen:0; + for(i=length-1-start;i>=end;i--){ + fprintf(fp,"%c",int2compbase((int)getCharInTightString(tightStr,i))); + if((++column)%100==0){ + fprintf(fp,"\n"); + //column = 0; + } + } + } + *col = column; +} + +static void outputTightStrLowerCase(FILE *fp,char *tightStr,int start,int length, int outputlen,int revS,int *col) +{ + int i; + int end; + int column = *col; + + if(!revS){ + end = start+outputlen <= length ? start+outputlen:length; + for(i=start;i=0 ? length-start-outputlen:0; + for(i=length-1-start;i>=end;i--){ + fprintf(fp,"%c","tgac"[(int)getCharInTightString(tightStr,i)]); + if((++column)%100==0){ + fprintf(fp,"\n"); + //column = 0; + } + } + } + *col = column; +} + +static void outputNs(FILE *fp,int gapN,int *col) +{ + int i,column=*col; + for(i=0;ictgID; + bal_ctg1 = getTwinCtg(ctg1); + start1 = prevCtg->cutHead; + length1 = contig_array[ctg1].length + overlaplen; + if(length1-prevCtg->cutTail-start1>CTGappend){ + outputlen1 = CTGappend; + start1 = length1-prevCtg->cutTail-outputlen1; + }else + outputlen1 = length1-prevCtg->cutTail-start1; + + ctg2 = actg->ctgID; + bal_ctg2 = getTwinCtg(ctg2); + start2 = actg->cutHead; + length2 = contig_array[ctg2].length + overlaplen; + if(length2-actg->cutTail-start2>CTGappend){ + outputlen2 = CTGappend; + }else + outputlen2 = length2-actg->cutTail-start2; + if(isLargerThanTwin(ctg1)) + fprintf(fo,">S%d_C%d_L%d_G%d",scafIndex,index_array[bal_ctg1],outputlen1,prevCtg->gapSeqLen); + else + fprintf(fo,">S%d_C%d_L%d_G%d",scafIndex,index_array[ctg1],outputlen1,prevCtg->gapSeqLen); + + if(isLargerThanTwin(ctg2)) + fprintf(fo,"_C%d_L%d\n",index_array[bal_ctg2],outputlen2); + else + fprintf(fo,"_C%d_L%d\n",index_array[ctg2],outputlen2); + + 
if(contig_array[ctg1].seq) + outputTightStr(fo,contig_array[ctg1].seq,start1,length1, outputlen1,0,&column); + else if(contig_array[bal_ctg1].seq) + outputTightStr(fo,contig_array[bal_ctg1].seq,start1,length1, outputlen1,1,&column); + + pt = (char *)darrayPut(gapSeqArray,prevCtg->gapSeqOffset); + outputTightStrLowerCase(fo,pt,0,prevCtg->gapSeqLen, prevCtg->gapSeqLen,0,&column); + + if(contig_array[ctg2].seq) + outputTightStr(fo,contig_array[ctg2].seq,start2,length2, outputlen2,0,&column); + else if(contig_array[bal_ctg2].seq) + outputTightStr(fo,contig_array[bal_ctg2].seq,start2,length2, outputlen2,1,&column); + + fprintf(fo,"\n"); +} + +static void outputGapSeq(FILE *fo,int index,STACK *ctgsStack,DARRAY *gapSeqArray) +{ + CTGinSCAF *actg,*prevCtg=NULL; + stackRecover(ctgsStack); + + while((actg=stackPop(ctgsStack))!=NULL){ + if(prevCtg&&prevCtg->gapSeqLen>0) + output1gap(fo,index,prevCtg,actg,gapSeqArray); + prevCtg = actg; + + } + +} + +static void outputScafSeq(FILE *fo,int index,STACK *ctgsStack,DARRAY *gapSeqArray) +{ + CTGinSCAF *actg,*prevCtg=NULL; + unsigned int ctg,bal_ctg,length; + int start,outputlen,gapN; + char *pt; + int column = 0; + long long cvgSum=0; + int lenSum=0; + + stackRecover(ctgsStack); + while((actg=stackPop(ctgsStack))!=NULL){ + if(!(contig_array[actg->ctgID].cvg>0)) + continue; + lenSum += contig_array[actg->ctgID].length; + cvgSum += contig_array[actg->ctgID].length*contig_array[actg->ctgID].cvg; + } + if(lenSum>0) + fprintf(fo,">scaffold%d %4.1f\n",index,(double)cvgSum/lenSum); + else + fprintf(fo,">scaffold%d 0.0\n",index); + + stackRecover(ctgsStack); + while((actg=stackPop(ctgsStack))!=NULL){ + ctg = actg->ctgID; + bal_ctg = getTwinCtg(ctg); + length = contig_array[ctg].length + overlaplen; + if(prevCtg&&actg->scaftig_start){ + gapN = actg->start - prevCtg->start - contig_array[prevCtg->ctgID].length; + gapN = gapN > 0 ? 
gapN:1; + outputNs(fo,gapN,&column); + //outputGapInfo(prevCtg->ctgID,ctg); + Ncounter++; + } + if(!prevCtg) + start = 0; + else + start = actg->cutHead; + outputlen = length-start-actg->cutTail; + if(contig_array[ctg].seq) + outputTightStr(fo,contig_array[ctg].seq,start,length, outputlen,0,&column); + else if(contig_array[bal_ctg].seq) + outputTightStr(fo,contig_array[bal_ctg].seq,start,length, outputlen,1,&column); + if(actg->gapSeqLen<1){ + prevCtg = actg; + continue; + } + + pt = (char *)darrayPut(gapSeqArray,actg->gapSeqOffset); + outputTightStrLowerCase(fo,pt,0,actg->gapSeqLen, actg->gapSeqLen,0,&column); + + prevCtg = actg; + } + fprintf(fo,"\n"); + +} + +static void fill1scaf(int index,STACK *ctgsStack,int thrdID); +static void check1scaf(int t,int thrdID) +{ + if(flagBuf[t]) + return; + boolean late=0; + pthread_mutex_lock(&mutex); + if(!flagBuf[t]){ + flagBuf[t] = 1; + thrdNoBuf[t] = thrdID; + }else + late = 1; + pthread_mutex_unlock(&mutex); + if(late) + return; + counters[thrdID]++; + fill1scaf(scafCounter+t+1,ctgStackBuffer[t],thrdID); +} + +static void fill1scaf(int index,STACK *ctgsStack,int thrdID) +{ + + CTGinSCAF *actg,*prevCtg=NULL; + READNEARBY *rdArray,*rdArray4gap,*rd; + int numRd=0,count,maxGLen=0; + unsigned int ctg,bal_ctg; + STACK *rdStack; + + while((actg=stackPop(ctgsStack))!=NULL){ + if(prevCtg) + maxGLen = maxGLen<(actg->start-prevCtg->end) ? 
(actg->start-prevCtg->end):maxGLen; + ctg = actg->ctgID; + bal_ctg = getTwinCtg(ctg); + if(actg->mask){ + prevCtg = actg; + continue; + } + if(contig_array[ctg].closeReads) + numRd += contig_array[ctg].closeReads->item_c; + else if(contig_array[bal_ctg].closeReads) + numRd += contig_array[bal_ctg].closeReads->item_c; + prevCtg = actg; + } + if(numRd<1) + return; + rdArray = (READNEARBY *)ckalloc(numRd*sizeof(READNEARBY)); + rdArray4gap = (READNEARBY *)ckalloc(numRd*sizeof(READNEARBY)); + //fprintf(stderr,"scaffold%d reads4gap %d\n",index,numRd); + + // collect reads appended to contigs in this scaffold + int numRd2 = 0; + stackRecover(ctgsStack); + while((actg=stackPop(ctgsStack))!=NULL){ + ctg = actg->ctgID; + bal_ctg = getTwinCtg(ctg); + if(actg->mask) + continue; + if(contig_array[ctg].closeReads) + rdStack = contig_array[ctg].closeReads; + else if(contig_array[bal_ctg].closeReads) + rdStack = contig_array[bal_ctg].closeReads; + else + continue; + + stackBackup(rdStack); + while((rd=(READNEARBY *)stackPop(rdStack))!=NULL){ + rdArray[numRd2].len = rd->len; + rdArray[numRd2].seqStarter = rd->seqStarter; + if(isSmallerThanTwin(ctg)) + rdArray[numRd2++].dis = actg->start - overlaplen + rd->dis; + else + rdArray[numRd2++].dis = actg->start -overlaplen + + contig_array[ctg].length - rd->len - rd->dis; + } + stackRecover(rdStack); + } + if(numRd2!=numRd) + printf("##reads numbers doesn't match, %d vs %d when scaffold %d\n",numRd,numRd2,index); + qsort(rdArray,numRd,sizeof(READNEARBY),cmp_reads); + //fill gap one by one + int gapStart,gapEnd; + int numIn=0; + boolean flag; + int buffer_size=maxReadLen > 100 ? maxReadLen:100; + int maxGSLen = maxGLen+GLDiff < 10 ? 
10:maxGLen+GLDiff; + //fprintf(stderr,"maxGlen %d, maxGSlen %d\n",maxGLen,maxGSLen); + + char *seqGap = (char *)ckalloc(maxGSLen*sizeof(char)); // temp array for gap sequence + Kmer *kmerCtg1 = (Kmer *)ckalloc(buffer_size*sizeof(Kmer)); + Kmer *kmerCtg2 = (Kmer *)ckalloc(buffer_size*sizeof(Kmer)); + char *seqCtg1 = (char *)ckalloc(buffer_size*sizeof(char)); + char *seqCtg2 = (char *)ckalloc(buffer_size*sizeof(char)); + prevCtg = NULL; + stackRecover(ctgsStack); + while((actg=stackPop(ctgsStack))!=NULL){ + if(!prevCtg||!actg->scaftig_start){ + prevCtg = actg; + continue; + } + gapStart = prevCtg->end - 100; + gapEnd = actg->start - overlaplen + 100; + + cutRdArray(rdArray,gapStart,gapEnd,&count,numRd,rdArray4gap); + + numIn += count; + /* + if(!count){ + prevCtg = actg; + continue; + } + */ + int overlap; + for(overlap=overlaplen;overlap>14;overlap-=2){ + + flag = localGraph(rdArray4gap,count,prevCtg,actg, + overlaplen,kmerCtg1,kmerCtg2,overlap,darrayBuf[thrdID], + seqCtg1,seqCtg2,seqGap); + + //free_kmerset(kmerSet); + + if(flag==1){ + /* + fprintf(stderr,"Between ctg %d and %d, Found with %d\n",prevCtg->ctgID + ,actg->ctgID,overlap); + */ + break; + } + } + /* + if(count==0) + printf("Gap closed without reads\n"); + if(!flag) + fprintf(stderr,"Between ctg %d and %d, NO routes found\n",prevCtg->ctgID,actg->ctgID); + */ + + prevCtg = actg; + } + + //fprintf(stderr,"____scaffold%d reads in gap %d\n",index,numIn); + free((void *)seqGap); + free((void *)kmerCtg1); + free((void *)kmerCtg2); + free((void *)seqCtg1); + free((void *)seqCtg2); + free((void *)rdArray); + free((void *)rdArray4gap); +} + +static void reverseStack(STACK *dStack,STACK *sStack) +{ + CTGinSCAF *actg,*ctgPt; + emptyStack(dStack); + + while((actg=(CTGinSCAF *)stackPop(sStack))!=NULL){ + ctgPt = (CTGinSCAF *)stackPush(dStack); + ctgPt->ctgID = actg->ctgID; + ctgPt->start = actg->start; + ctgPt->end = actg->end; + ctgPt->scaftig_start = actg->scaftig_start; + ctgPt->mask = actg->mask; + ctgPt->cutHead 
= actg->cutHead; + ctgPt->cutTail = actg->cutTail; + ctgPt->gapSeqLen = actg->gapSeqLen; + ctgPt->gapSeqOffset = actg->gapSeqOffset; + } + stackBackup(dStack); +} + +static Kmer tightStr2Kmer(char *tightStr,int start,int length,int revS) +{ + int i; + Kmer word=0; + + if(!revS){ + if(start+overlaplen>length){ + printf("tightStr2Kmer A: no enough bases for kmer\n"); + return word; + } + for(i=start;ilength-1-start-overlaplen;i--){ + word <<= 2; + word += int_comp(getCharInTightString(tightStr,i)); + } + } + return word; +} + +static Kmer maxKmer() +{ + Kmer word = 0; + int i; + for(i=0;i>= 2; + kmerAtEnd &= MaxKmer; + kmerAtStart >>= 2; + } + if(i<10){ + return overlaplen - i; + } + else + return 0; +} + + +static void initStackBuf(STACK **ctgStackBuffer,int scafBufSize) +{ + int i; + for(i=0;iselfSignal)==1){ + emptyDarray(darrayBuf[prm->threadID]); + for(i=0;ithreadID); + + *(prm->selfSignal) = 0; + }else if(*(prm->selfSignal)==2){ + *(prm->selfSignal) = 0; + break; + } + usleep(1); + } +} + +static void creatThrds(pthread_t *threads,PARAMETER *paras) +{ + unsigned char i; + int temp; + + for(i=0;iC%d %4.1f\n",ctg,(double)contig_array[ctg].cvg); + outputTightStr(fo,contig_array[ctg].seq,0,len,len,0,&col); + } + else if(contig_array[bal_ctg].seq){ + fprintf(fo,">C%d %4.1f\n",bal_ctg,(double)contig_array[ctg].cvg); + outputTightStr(fo,contig_array[bal_ctg].seq,0,len,len,0,&col); + } + contig_array[ctg].flag = 1; + contig_array[bal_ctg].flag = 1; + fprintf(fo,"\n"); +} + +void prlReadsCloseGap(char *graphfile) +{ + //thrd_num=1; + /*if(fillGap){ + boolean flag; + //printf("\nStart to load reads for gap filling. 
%d length discrepancy is allowed\n",GLDiff); + //printf("...\n"); + flag = loadReads4gap(graphfile); + if(!flag) + return; + }*/ + + if(orig2new){ + convertIndex(); + orig2new = 0; + } + FILE *fp,*fo,*fo2; + char line[1024]; + CTGinSCAF *actg; + STACK *ctgStack,*aStack; + int index=0,offset=0,counter,overallLen; + int i,starter,prev_start,gapLen,catchable; + unsigned int ctg,prev_ctg=0; + boolean IsPrevGap; + pthread_t threads[thrd_num]; + unsigned char thrdSignal[thrd_num+1]; + PARAMETER paras[thrd_num]; + + for(ctg=1;ctg<=num_ctg;ctg++) + contig_array[ctg].flag = 0; + + MAXKMER = maxKmer(); + + ctgStack = (STACK *)createStack(1000,sizeof(CTGinSCAF)); + + sprintf(line, "%s.scaf_gap", graphfile); + fp = ckopen(line, "r"); + sprintf(line, "%s.scafSeq", graphfile); + fo = ckopen(line, "w"); + + sprintf(line, "%s.gapSeq", graphfile); + fo2 = ckopen(line, "w"); + + pthread_mutex_init(&mutex,NULL); + + flagBuf = (boolean *)ckalloc(scafBufSize*sizeof(boolean));; + thrdNoBuf = (unsigned char *)ckalloc(scafBufSize*sizeof(unsigned char));; + memset(thrdNoBuf,0,scafBufSize*sizeof(char)); + + ctgStackBuffer = (STACK **)ckalloc(scafBufSize*sizeof(STACK *)); + initStackBuf(ctgStackBuffer,scafBufSize); + + darrayBuf = (DARRAY **)ckalloc(thrd_num*sizeof(DARRAY *)); + counters = (int *)ckalloc(thrd_num*sizeof(int)); + + /*for(i=0;i'){ + if(index){ + aStack = ctgStackBuffer[scafInBuf]; + flagBuf[scafInBuf++] = 0; + reverseStack(aStack,ctgStack); + if(scafInBuf==scafBufSize){ + /*if(fillGap) + sendWorkSignal(1,thrdSignal);*/ + + outputSeqs(fo,fo2,scafInBuf); + scafCounter += scafInBuf; + scafInBuf = 0; + } + //if(index%1000==0) + //printf("Processed %d scaffolds\n",index); + + } + //read next scaff + emptyStack(ctgStack); + IsPrevGap = offset = prev_ctg = 0; + sscanf(line+9,"%d %d %d",&index,&counter,&overallLen); + continue; + } + if(line[0]=='G'){ // gap appears + /*if(fillGap){ + gapLen = procGap(line,ctgStack); + IsPrevGap = 1; + }*/ + continue; + } + 
if(line[0]>='0'&&line[0]<='9'){ // a contig line + sscanf(line,"%d %d",&ctg,&starter); + actg = (CTGinSCAF *)stackPush(ctgStack); + actg->ctgID = ctg; + if(contig_array[ctg].flag) + MaskContig(ctg); + else + MarkCtgOccu(ctg); + initiateCtgInScaf(actg); + if(!prev_ctg) + actg->cutHead = 0; + else if(!IsPrevGap) + allGaps++; + if(!IsPrevGap){ + if(prev_ctg&&(starter-prev_start-(int)contig_array[prev_ctg].length) + <((int)overlaplen*4)){ + /* + if(fillGap) + catchable = contigCatch(prev_ctg,ctg); + else + */ + catchable = 0; + if(catchable){ // prev_ctg and ctg overlap **bp + allGaps--; + /* + if(isLargerThanTwin(prev_ctg)) + fprintf(stderr,"%d ####### by_overlap\n",getTwinCtg(prev_ctg)); + else + fprintf(stderr,"%d ####### by_overlap\n",prev_ctg); + */ + actg->scaftig_start = 0; + actg->cutHead = catchable; + offset += - (starter-prev_start-contig_array[prev_ctg].length) + + (overlaplen - catchable); + }else + actg->scaftig_start = 1; + + }else + actg->scaftig_start = 1; + }else{ + offset += - (starter-prev_start-contig_array[prev_ctg].length) + gapLen; + actg->scaftig_start = 0; + } + actg->start = starter + offset; + actg->end = actg->start + contig_array[ctg].length - 1; + actg->mask = contig_array[ctg].mask; + IsPrevGap = 0; + prev_ctg = ctg; + prev_start = starter; + } + } + if(index){ + aStack = ctgStackBuffer[scafInBuf]; + flagBuf[scafInBuf++] = 0; + reverseStack(aStack,ctgStack); + if(fillGap) + sendWorkSignal(1,thrdSignal); + outputSeqs(fo,fo2,scafInBuf); + } + + /*if(fillGap){ + sendWorkSignal(2,thrdSignal); + thread_wait(threads); + }*/ + for(ctg=1;ctg<=num_ctg;ctg++){ + if((contig_array[ctg].length+overlaplen)<100|| + contig_array[ctg].flag) + continue; + output_ctg(ctg,fo); + + } + //printf("Done with %d scaffolds, %d gaps finished, %d gaps overall\n",index,allGaps-Ncounter,allGaps); + //printf("scaffolds outputted : %d.\n",index); + index = 0; + for(i=0;i0) + length_array[index_array[i]] = i; + } + for(i=1;i<=num_ctg;i++) + index_array[i] = 
length_array[i]; //contig i with new index: index_array[i] + free((void *)length_array); + +} + +static void reverseStack(STACK *dStack,STACK *sStack) +{ + CTGinSCAF *actg,*ctgPt; + emptyStack(dStack); + + while((actg=(CTGinSCAF *)stackPop(sStack))!=NULL){ + ctgPt = (CTGinSCAF *)stackPush(dStack); + ctgPt->ctgID = actg->ctgID; + ctgPt->start = actg->start; + ctgPt->end = actg->end; + } + stackBackup(dStack); +} + +static void initStackBuf(STACK **ctgStackBuffer,int scafBufSize) +{ + int i; + for(i=0;ictgID; + bal_ctg = getTwinCtg(ctg); + + if(contig_array[ctg].from_vt!=0){ + contig_array[ctg].multi = 1; + contig_array[bal_ctg].multi = 1; + continue; + } + + contig_array[ctg].from_vt = scafID; + contig_array[ctg].to_vt = actg->start; + contig_array[ctg].flag = 0; //ctg and scaf on the same strand + contig_array[bal_ctg].from_vt = scafID; + contig_array[bal_ctg].to_vt = actg->start; + contig_array[bal_ctg].flag = 1; + } + } + +} + +static void locateContigOnscaff(char *graphfile) +{ + + FILE *fp; + char line[1024]; + CTGinSCAF *actg; + STACK *ctgStack,*aStack; + int index=0,counter,overallLen; + int starter,prev_start,gapN,scafLen; + unsigned int ctg,prev_ctg=0; + + for(ctg=1;ctg<=num_ctg;ctg++){ + contig_array[ctg].from_vt = 0; + contig_array[ctg].multi = 0; + } + + ctgStack = (STACK *)createStack(1000,sizeof(CTGinSCAF)); + + sprintf(line, "%s.scaf_gap", graphfile); + fp = ckopen(line, "r"); + + ctgStackBuffer = (STACK **)ckalloc(scafBufSize*sizeof(STACK *)); + initStackBuf(ctgStackBuffer,scafBufSize); + + + Ncounter = scafCounter = scafInBuf = allGaps = 0; + while(fgets(line,sizeof(line),fp)!=NULL){ + if(line[0]=='>'){ + if(index){ + aStack = ctgStackBuffer[scafInBuf++]; + reverseStack(aStack,ctgStack); + if(scafInBuf==scafBufSize){ + mapCtg2Scaf(scafInBuf); + scafCounter += scafInBuf; + scafInBuf = 0; + } + //if(index%1000==0) + //printf("Processed %d scaffolds\n",index); + } + //read next scaff + scafLen = prev_ctg = 0; + emptyStack(ctgStack); + sscanf(line+9,"%d 
%d %d",&index,&counter,&overallLen); + fprintf(stderr,">%d\n",index); + continue; + } + if(line[0]=='G'){ // gap appears + continue; + } + if(line[0]>='0'&&line[0]<='9'){ // a contig line + sscanf(line,"%d %d",&ctg,&starter); + actg = (CTGinSCAF *)stackPush(ctgStack); + actg->ctgID = ctg; + if(!prev_ctg){ + actg->start = scafLen; + actg->end = actg->start + overlaplen + contig_array[ctg].length - 1; + }else{ + gapN = starter - prev_start-(int)contig_array[prev_ctg].length; + gapN = gapN < 1 ? 1:gapN; + actg->start = scafLen + gapN; + actg->end = actg->start + contig_array[ctg].length - 1; + } + fprintf(stderr,"%d\t%d\n",actg->start,actg->end); + scafLen = actg->end+1; + prev_ctg = ctg; + prev_start = starter; + } + } + if(index){ + aStack = ctgStackBuffer[scafInBuf++]; + reverseStack(aStack,ctgStack); + mapCtg2Scaf(scafInBuf); + } + gapN = 0; + for(ctg=1;ctg<=num_ctg;ctg++){ + if(contig_array[ctg].from_vt==0||contig_array[ctg].multi==1) + continue; + gapN++; + } + //printf("\nDone with %d scaffolds, %d contigs in Scaffolld\n",index,gapN); + fclose(fp); + freeStack(ctgStack); + freeStackBuf(ctgStackBuffer,scafBufSize); + free((void*)ctgStackBuffer); +} + +static boolean contigElligible(unsigned int contigno) +{ + unsigned int ctg = index_array[contigno]; + if(contig_array[ctg].from_vt==0||contig_array[ctg].multi==1) + return 0; + else + return 1; + +} +static void output1read(FILE *fo,long long readno,unsigned int contigno,int pos) +{ + + unsigned int ctg = index_array[contigno]; + int posOnScaf; + char orien; + pos = pos < 0 ? 
0:pos; + if(contig_array[ctg].flag==0){ + posOnScaf = contig_array[ctg].to_vt + pos - overlaplen; + orien = '+'; + }else{ + posOnScaf = contig_array[ctg].to_vt + contig_array[ctg].length - pos; + orien = '-'; + } + /* + if(readno==676) + printf("Read %lld in region from %d, extend %d, pos %d, orien %c\n", + readno,contig_array[ctg].to_vt,contig_array[ctg].length,posOnScaf,orien); + */ + fprintf(fo,"%lld\t%d\t%d\t%c\n",readno,contig_array[ctg].from_vt,posOnScaf,orien); +} + +void locateReadOnScaf(char *graphfile) +{ + char name[1024],line[1024]; + FILE *fp,*fo; + long long readno,counter=0,pre_readno=0; + unsigned int contigno,pre_contigno; + int pre_pos,pos; + + locateContigOnscaff(graphfile); + + sprintf(name,"%s.readOnContig",graphfile); + fp = ckopen(name,"r"); + sprintf(name,"%s.readOnScaf",graphfile); + fo = ckopen(name,"w"); + + if(!orig2new){ + convertIndex(); + orig2new = 1; + } + fgets(line,1024,fp); + while(fgets(line,1024,fp)!=NULL){ + sscanf(line,"%lld %d %d",&readno,&contigno,&pos); + if((readno%2==0)&&(pre_readno==readno-1) // they are a pair of reads + &&contigElligible(pre_contigno)&&contigElligible(contigno)){ + output1read(fo,pre_readno,pre_contigno,pre_pos); + output1read(fo,readno,contigno,pos); + counter++; + } + pre_readno = readno; + pre_contigno = contigno; + pre_pos = pos; + } + printf("%lld pairs on contig\n",counter); + fclose(fp); + fclose(fo); +} diff --git a/fusion/readseq1by1.c b/fusion/readseq1by1.c new file mode 100755 index 0000000..ee3f35b --- /dev/null +++ b/fusion/readseq1by1.c @@ -0,0 +1,465 @@ +#include "stdinc.h" +#include "newhash.h" +#include "extfunc.h" +#include "extvab.h" + +static char src_rc_seq[1024]; +extern long long single_count; +extern long long single_map; +void readseq1by1(char *src_seq, char *src_name, int *len_seq, FILE *fp,long long num_seq) +{ + int i,k, n,strL; + char c; + char str[5000]; + + n = 0; + k = num_seq; + while(fgets(str, 4950, fp)) { + if(str[0] == '#') continue; + if(str[0] == '>') { + /* + 
if(k >= 0) { // if this isn't the first '>' in the file + *len_seq = n; + } + */ + *len_seq = n; + n = 0; + sscanf(&str[1],"%s",src_name); + return; + } else { + strL = strlen(str); + if(strL+n>maxReadLen) + strL = maxReadLen - n; + for(i = 0; i < strL; i ++) { + if(str[i] >= 'a' && str[i] <= 'z') { + c = base2int(str[i]-'a'+'A'); + src_seq[n ++] = c; + } else if(str[i] >= 'A' && str[i] <= 'Z') { + c = base2int(str[i]); + src_seq[n ++] = c; + // after pre-process all the symbles would be a,g,c,t,n in lower or upper case. + } else if(str[i]=='.') { + c = base2int('A'); + src_seq[n ++] = c; + } // after pre-process all the symbles would be a,g,c,t,n in lower or upper case. + } + //printf("%d: %d\n",k,n); + } + } + + if(k >= 0){ + *len_seq = n; + return; + } + *len_seq = 0; +} + + +void read_one_sequence(FILE *fp, long long *T, char **X) + +{ + + char *fasta,*src_name; //point to fasta array + int num_seq,len,name_len,min_len; + + num_seq = readseqpar(&len,&min_len,&name_len,fp); + if(num_seq<1){ + printf("no fasta sequence in file\n"); + *T = 0; + return; + } + fasta = (char *)ckalloc(len*sizeof(char)); + src_name = (char *)ckalloc((name_len+1)*sizeof(char)); + rewind(fp); + + readseq1by1(fasta,src_name,&len,fp,-1); + readseq1by1(fasta,src_name,&len,fp,0); + + *X = fasta; + *T = len; + free((void *)src_name); +} + +long long multiFileParse(int *max_leg, int *min_leg,int *max_name_leg, FILE *fp) +{ + + char str[5000]; + FILE *freads; + int slen; + long long counter = 0; + *max_name_leg = *max_leg = 1; + *min_leg = 1000; + while(fgets(str,4950,fp)){ + slen = strlen(str); + str[slen-1] = str[slen]; + freads = ckopen(str,"r"); + counter += readseqpar(max_leg,min_leg,max_name_leg,freads); + fclose(freads); + } + return counter; +} + +long long readseqpar(int *max_leg, int *min_leg,int *max_name_leg, FILE *fp) +{ + int l, n; + long long k; + char str[5000], src_name[5000]; + + + n = 0; + k = -1; + while(fgets(str, 4950, fp)) { + if(str[0] == '>') { + if(k >= 0) { + if(n > 
*max_leg) + *max_leg = n; + if(n < *min_leg) + *min_leg = n; + + } + n = 0; + k ++; + sscanf(&str[1], "%s", src_name); + if((l = strlen(src_name)) > *max_name_leg) + *max_name_leg = l; + } else { + n += strlen(str)-1; + } + } + if(n > *max_leg) + *max_leg = n; + + if(n < *min_leg) + *min_leg = n; + + k ++; + return(k); +} + +void read1seqfq(char *src_seq, char *src_name, int *len_seq, FILE *fp) +{ + int i,n,strL; + char c; + char str[5000]; + boolean flag=0; + + while(fgets(str, 4950, fp)) { + if(str[0]=='@'){ + flag = 1; + sscanf(&str[1],"%s",src_name); + break; + } + } + + if(!flag){ //last time reading fq file get this + *len_seq = 0; + return; + } + + n = 0; + while(fgets(str, 4950, fp)){ + if(str[0] == '+') { + fgets(str,4950,fp); // pass quality value line + *len_seq = n; + return; + } else { + strL = strlen(str); + if(strL+n>maxReadLen) + strL = maxReadLen - n; + for(i = 0; i < strL; i ++) { + if(str[i] >= 'a' && str[i] <= 'z') { + c = base2int(str[i]-'a'+'A'); + src_seq[n ++] = c; + } else if(str[i] >= 'A' && str[i] <= 'Z') { + c = base2int(str[i]); + src_seq[n ++] = c; + // after pre-process all the symbles would be a,g,c,t,n in lower or upper case. + } else if(str[i]=='.') { + c = base2int('A'); + src_seq[n ++] = c; + } // after pre-process all the symbles would be a,g,c,t,n in lower or upper case. 
+ } + //printf("%d: %d\n",k,n); + } + } + + *len_seq = n; + return; +} + +// find the next file to open in libs +static int nextValidIndex(int libNo,boolean pair,unsigned char asm_ctg) +{ + int i=libNo; + + while(i1&&lib_array[i].asm_flag!=asm_ctg){ // reads for other purpose + i++; + continue; + } + if(lib_array[i].curr_type==1&& + lib_array[i].curr_index3&&strcmp(fname+strlen(fname)-3,".gz")==0){ + char *cmd = (char *)ckalloc((strlen(fname)+20)*sizeof(char)); + sprintf(cmd,"gzip -dc %s",fname); + fp = popen(cmd,"r"); + free(cmd); + return fp; + }else{ + return ckopen(fname,"r"); + } + +} + +void openFileInLib(int libNo) +{ + int i = libNo; + if(lib_array[i].curr_type==1){ + printf("[%s]opened file:\n %s\n", + __FUNCTION__,lib_array[i].a1_fname[lib_array[i].curr_index]); + printf("[%s]opened file:\n %s\n", + __FUNCTION__,lib_array[i].a2_fname[lib_array[i].curr_index]); + lib_array[i].fp1 = openFile4read(lib_array[i].a1_fname[lib_array[i].curr_index]); + lib_array[i].fp2 = openFile4read(lib_array[i].a2_fname[lib_array[i].curr_index]); + lib_array[i].curr_index++; + lib_array[i].paired = 1; + }else if(lib_array[i].curr_type==2){ + printf("[%s]opened file:\n %s\n", + __FUNCTION__,lib_array[i].q1_fname[lib_array[i].curr_index]); + printf("[%s]opened file:\n %s\n", + __FUNCTION__,lib_array[i].q2_fname[lib_array[i].curr_index]); + lib_array[i].fp1 = openFile4read(lib_array[i].q1_fname[lib_array[i].curr_index]); + lib_array[i].fp2 = openFile4read(lib_array[i].q2_fname[lib_array[i].curr_index]); + lib_array[i].curr_index++; + lib_array[i].paired = 1; + }else if(lib_array[i].curr_type==3){ + printf("[%s]opened file:\n %s\n", + lib_array[i].p_fname[lib_array[i].curr_index]); + lib_array[i].fp1 = openFile4read(lib_array[i].p_fname[lib_array[i].curr_index]); + lib_array[i].curr_index++; + lib_array[i].paired = 0; + }else if(lib_array[i].curr_type==4){ + printf("[%s]opened file:\n %s\n", + __FUNCTION__,lib_array[i].s_a_fname[lib_array[i].curr_index]); + lib_array[i].fp1 = 
openFile4read(lib_array[i].s_a_fname[lib_array[i].curr_index]); + lib_array[i].curr_index++; + lib_array[i].paired = 0; + }else if(lib_array[i].curr_type==5){ + printf("[%s]opened file:\n %s\n", + __FUNCTION__,lib_array[i].s_q_fname[lib_array[i].curr_index]); + lib_array[i].fp1 = openFile4read(lib_array[i].s_q_fname[lib_array[i].curr_index]); + lib_array[i].curr_index++; + lib_array[i].paired = 0; + } + +} + +static void reverse2k(char *src_seq,int len_seq) +{ + if(!len_seq) + return; + + int i; + reverseComplementSeq(src_seq,len_seq,src_rc_seq); + + for(i=0;i3&&strcmp(fname+strlen(fname)-3,".gz")==0) + pclose(lib_array[libNo].fp1); + else + fclose(lib_array[libNo].fp1); +} + +static void closeFp2InLab(int libNo) +{ + int ftype = lib_array[libNo].curr_type; + int index = lib_array[libNo].curr_index-1; + char *fname; + if(ftype==1) + fname = lib_array[libNo].a2_fname[index]; + else if(ftype==2) + fname = lib_array[libNo].q2_fname[index]; + else + return; + if(strlen(fname)>3&&strcmp(fname+strlen(fname)-3,".gz")==0) + pclose(lib_array[libNo].fp2); + else + fclose(lib_array[libNo].fp2); +} + +boolean read1seqInLib(char *src_seq, char *src_name, int *len_seq, int *libNo,boolean pair,unsigned char asm_ctg) +{ + int i = *libNo; + int prevLib = i; + + if(!lib_array[i].fp1 // file1 does not exist + ||(lib_array[i].curr_type!=1&&feof(lib_array[i].fp1)) // file1 reaches end and not type1 + ||(lib_array[i].curr_type==1&&feof(lib_array[i].fp1)&&feof(lib_array[i].fp2))){//f1&f2 reaches end + if(lib_array[i].fp1&&feof(lib_array[i].fp1)){ + closeFp1InLab(i); + //printf("[%s]%d reads in current file , (%.1f) map-rate .\n",__FUNCTION__,single_count,single_map/single_count); + single_count=single_map=0; + } + if(lib_array[i].fp2&&feof(lib_array[i].fp2)){ + closeFp2InLab(i); + //printf("[%s]%d reads in current file , (%.1f) map-rate .\n",__FUNCTION__,single_count,single_map/single_count); + single_count=single_map=0; + } + + *libNo = nextValidIndex(i,pair,asm_ctg); + i = *libNo; + 
if(lib_array[i].rd_len_cutoff>0) + maxReadLen = lib_array[i].rd_len_cutoff=num_libs) + return 0; + openFileInLib(i); + + if(lib_array[i].curr_type==1){ + readseq1by1(src_seq, src_name,len_seq, lib_array[i].fp1,-1); + readseq1by1(src_seq, src_name,len_seq, lib_array[i].fp2,-1); + }else if(lib_array[i].curr_type==3||lib_array[i].curr_type==4) + readseq1by1(src_seq, src_name,len_seq, lib_array[i].fp1,-1); + + } + if(lib_array[i].curr_type==1){ + if(lib_array[i].paired==1){ + readseq1by1(src_seq, src_name,len_seq, lib_array[i].fp1,1); + if(lib_array[i].reverse) + reverse2k(src_seq,*len_seq); + lib_array[i].paired = 2; + if(*len_seq>0||!feof(lib_array[i].fp1)){ + n_solexa++; + return 1; + } + else + return read1seqInLib(src_seq,src_name,len_seq,libNo,pair,asm_ctg); + }else{ + readseq1by1(src_seq, src_name,len_seq, lib_array[i].fp2,1); + + if(lib_array[i].reverse) + reverse2k(src_seq,*len_seq); + lib_array[i].paired = 1; + n_solexa++; + return 1; //can't fail to read a read2 + } + } + if(lib_array[i].curr_type==2){ + if(lib_array[i].paired==1){ + read1seqfq(src_seq, src_name,len_seq, lib_array[i].fp1); + /* + if(*len_seq>0){ + for(j=0;j<*len_seq;j++) + printf("%c",int2base(src_seq[j])); + printf("\n"); + } + */ + if(lib_array[i].reverse) + reverse2k(src_seq,*len_seq); + lib_array[i].paired = 2; + if(*len_seq>0||!feof(lib_array[i].fp1)){ + n_solexa++; + return 1; + }else + return read1seqInLib(src_seq,src_name,len_seq,libNo,pair,asm_ctg); + }else{ + read1seqfq(src_seq, src_name,len_seq, lib_array[i].fp2); + if(lib_array[i].reverse) + reverse2k(src_seq,*len_seq); + lib_array[i].paired = 1; + n_solexa++; + return 1; //can't fail to read a read2 + } + } + if(lib_array[i].curr_type==5) + read1seqfq(src_seq, src_name,len_seq, lib_array[i].fp1); + else{ + readseq1by1(src_seq, src_name,len_seq, lib_array[i].fp1,1); + } + /* + int t; + for(t=0;t<*len_seq;t++) + printf("%d",src_seq[t]); + printf("\n"); + */ + if(lib_array[i].reverse) + reverse2k(src_seq,*len_seq); + 
if(*len_seq>0||!feof(lib_array[i].fp1)){ + n_solexa++; + return 1; + }else + return read1seqInLib(src_seq,src_name,len_seq,libNo,pair,asm_ctg); +} diff --git a/fusion/scaffold.c b/fusion/scaffold.c new file mode 100755 index 0000000..001c505 --- /dev/null +++ b/fusion/scaffold.c @@ -0,0 +1,60 @@ +#include "stdinc.h" +#include "newhash.h" +#include "extfunc.h" +#include "extvab.h" + +static void initenv(int argc, char **argv); +static void display_scaff_usage(); + +static boolean LINK,SCAFF; + + +int call_scaffold() +{ + time_t start_t,stop_t,time_bef,time_aft; + time(&start_t); + + //initenv(argc, argv); + + loadPEgrads(graphfile); + + time(&time_bef); + loadUpdatedEdges(graphfile); + time(&time_aft); + //printf("time spent on loading edges %ds\n",(int)(time_aft-time_bef)); + + if(!SCAFF){ + time(&time_bef); + PE2Links(graphfile); + time(&time_aft); + //printf("time spent on loading pair end info %ds\n",(int)(time_aft-time_bef)); + + time(&time_bef); + Links2Scaf(graphfile); + time(&time_aft); + //printf("time spent on creating scaffolds %ds\n",(int)(time_aft-time_bef)); + + scaffolding(100,graphfile); + } + + prlReadsCloseGap(graphfile); + + +// locateReadOnScaf(graphfile); + + free_pe_mem(); + if(index_array) + free((void *)index_array); + + freeContig_array(); + + //destroyPreArcMem(); + destroyConnectMem(); + deleteCntLookupTable(); + + time(&stop_t); + //printf("time elapsed: %dm\n",(int)(stop_t-start_t)/60); + printf("[%s]total time on scaffolding : %d minute(s).\n",__FUNCTION__,(int)(stop_t-start_t)/60); + + return 0; +} diff --git a/fusion/searchPath.c b/fusion/searchPath.c new file mode 100755 index 0000000..1f0015c --- /dev/null +++ b/fusion/searchPath.c @@ -0,0 +1,169 @@ +#include "stdinc.h" +#include "newhash.h" +#include "extfunc.h" +#include "extvab.h" + +static int trace_limit = 5000; //the times function is called in a search +/* + search connection paths which were masked along related contigs + start from one contig, end with another + path length 
includes the length of the last contig +*/ +void traceAlongMaskedCnt(unsigned int destE,unsigned int currE,int max_steps,int min,int max, + int index,int len,int *num_route) +{ + num_trace++; + if(num_trace>trace_limit||*num_route>=max_n_routes){ + return; + } + + unsigned int *array; + int num,i,length; + CONNECT *ite_cnt; + + if(index>0)// there're at most max_steps edges stored in this array including the destination edge + length = len + contig_array[currE].length; + else + length = 0; + if(index>max_steps||length>max) + return; // this is the only situation we stop + if(index>0)// there're at most max_steps edges stored in this array including the destination edge + so_far[index-1] = currE; + + if(currE==destE&&index==0){ + printf("traceAlongMaskedCnt: start and destination are the same\n"); + return; + } + + if(currE==destE && length>=min &&length<=max){ + num = *num_route; + array = found_routes[num]; + for(i=0;imask||ite_cnt->deleted){ + ite_cnt = ite_cnt->next; + continue; + } + traceAlongMaskedCnt(destE,ite_cnt->contigID,max_steps,min,max, + index+1,length + ite_cnt->gapLen,num_route); + ite_cnt = ite_cnt->next; + } + +} +// search connection paths from one connect to a contig +// path length includes the length of the last contig +void traceAlongConnect(unsigned int destE,CONNECT *currCNT,int max_steps,int min,int max,int index,int len,int *num_route) +{ + num_trace++; + if(num_trace>trace_limit||*num_route>=max_n_routes){ + return; + } + + unsigned int *array,currE; + int num,i,length; + CONNECT *ite_cnt; + + currE = currCNT->contigID; + length = len + currCNT->gapLen; + length += contig_array[currE].length; + + if(index>max_steps||length>max) + return; // this is the only situation we stop + /* + if(globalFlag) + printf("B: step %d, ctg %d, length %d\n",index,currCNT->contigID,length); + */ + if(currE==destE&&index==1){ + printf("traceAlongConnect: start and destination are the same\n"); + return; + } + + so_far[index-1] = currE; // there're at most 
max_steps edges stored in this array including the destination edge + + if(currE==destE && length>=min &&length<=max){ + num = *num_route; + array = found_routes[num]; + for(i=0;inextInScaf){ + traceAlongConnect(destE,currCNT->nextInScaf,max_steps,min,max,index+1,length,num_route); + return; + } + + ite_cnt = contig_array[currE].downwardConnect; + while(ite_cnt){ + if(ite_cnt->mask||ite_cnt->deleted){ + ite_cnt = ite_cnt->next; + continue; + } + traceAlongConnect(destE,ite_cnt,max_steps,min,max,index+1,length,num_route); + ite_cnt = ite_cnt->next; + } + +} + +//find paths in the graph from currE to destE, its length does not include length of both end contigs +void traceAlongArc(unsigned int destE,unsigned int currE,int max_steps,int min,int max,int index,int len,int *num_route) +{ + num_trace++; + if(num_trace>trace_limit||*num_route>=max_n_routes){ + return; + } + + unsigned int *array,out_ed,vt; + int num,i,pos,length; + preARC *parc; + + pos = index; + if(pos>max_steps||len>max) + return; // this is the only situation we stop + if(currE==destE&&pos==0){ + printf("traceAlongArc: start and destination are the same\n"); + return; + } + + if(pos>0) // pos starts with 0 for the starting edge + so_far[pos-1] = currE; // there're at most max_steps edges stored in this array including the destination edge + + if(currE==destE && len>=min){ + num = *num_route; + array = found_routes[num]; + for(i=0;i0) //not the starting edge + length = len + contig_array[currE].length; + else + length = len; + + + vt = contig_array[currE].to_vt; + + parc = contig_array[currE].arcs; + while(parc){ + out_ed = parc->to_ed; + traceAlongArc(destE,out_ed,max_steps,min,max,pos,length,num_route); + parc = parc->next; + } + +} diff --git a/fusion/seq.c b/fusion/seq.c new file mode 100755 index 0000000..664d4a7 --- /dev/null +++ b/fusion/seq.c @@ -0,0 +1,169 @@ +#include "stdinc.h" +#include "newhash.h" +#include "extfunc.h" +#include "extvab.h" + +/* +put a insertSize in the grads array, +if all 
grads have been entered and all the boundaris have been set, return 0 +*/ + +void print_kmer(FILE *fp,Kmer kmer,char c) +{ + if(kmer) + fprintf(fp,"%llx",kmer); + else + fprintf(fp,"0x0"); + fprintf(fp,"%c",c); + +} + +void printTightString(char *tightSeq,int len) +{ + int i; + + for(i=0;i> 2); + seq = ((seq & 0x0F0F0F0F0F0F0F0FLLU)<< 4) | ((seq & 0xF0F0F0F0F0F0F0F0LLU)>> 4); + seq = ((seq & 0x00FF00FF00FF00FFLLU)<< 8) | ((seq & 0xFF00FF00FF00FF00LLU)>> 8); + seq = ((seq & 0x0000FFFF0000FFFFLLU)<<16) | ((seq & 0xFFFF0000FFFF0000LLU)>>16); + seq = ((seq & 0x00000000FFFFFFFFLLU)<<32) | ((seq & 0xFFFFFFFF00000000LLU)>>32); + return seq >> (64 - (seq_size<<1)); +} + +Kmer reverseComplementVerbose(Kmer word,int overlap) +{ + return fastReverseComp(word,overlap); + /* + int index; + Kmer revComp = 0; + Kmer copy = word; + unsigned char nucleotide; + + for (index = 0; index < overlap; index++) { + nucleotide = copy & 3; + revComp <<= 2; + revComp += int_comp(nucleotide);//3 - nucleotide; + copy >>= 2; + } + return revComp; + */ +} + +Kmer reverseComplement(Kmer word,int overlap) +{ + return fastReverseComp(word,overlap); +} + +void writeChar2tightString(char nt,char *tightSeq,int pos) +{ + char *byte = tightSeq + pos/4; + switch(pos%4){ + case 0: + *byte &=63; + *byte += nt << 6; + return; + case 1: + *byte &=207; + *byte += nt << 4; + return; + case 2: + *byte &=243; + *byte += nt << 2; + return; + case 3: + *byte &=252; + *byte += nt; + return; + + } +} + +char getCharInTightString(char *tightSeq,int pos) +{ + char *byte = tightSeq+pos/4; + switch(pos%4){ + case 3: + return (*byte & 3); + case 2: + return (*byte & 12) >> 2; + case 1: + return (*byte & 48) >> 4; + case 0: + return (*byte & 192) >> 6; + } + return 0; +} + +// complement of sequence denoted 0, 1, 2, 3 +void reverseComplementSeq(char *seq, int len,char *bal_seq) +{ + int i,index=0; + + if(len<1) + return; + + for(i=len-1;i>=0;i--) + bal_seq[index++] = int_comp(seq[i]); + + return; +} + +// complement of 
sequence denoted 0, 1, 2, 3 +char *compl_int_seq(char *seq, int len) +{ + char *bal_seq=NULL,c,bal_c; + int i,index; + + if(len<1) + return bal_seq; + + bal_seq = (char *)ckalloc(len*sizeof(char)); + index = 0; + for(i=len-1;i>=0;i--){ + c = seq[i]; + if(c<4) + bal_c = int_comp(c);//3-c; + else + bal_c = c; + bal_seq[index++] = bal_c; + + } + return bal_seq; +} + +long long trans_seq(char *seq, int len) +{ + int i; + long long res; + + res = 0; + for(i = 0; i < len; i ++) { + res = res * 4 + seq[i]; + } + + return(res); +} + +char *kmer2seq(Kmer word) +{ + int i; + char *seq; + Kmer charMask = 3; + + seq = (char *)ckalloc(overlaplen*sizeof(char)); + for(i=overlaplen-1;i>=0;i--){ + seq[i] = charMask&word; + word >>= 2; + } + return seq; +} diff --git a/fusion/stack.c b/fusion/stack.c new file mode 100755 index 0000000..707dd9e --- /dev/null +++ b/fusion/stack.c @@ -0,0 +1,113 @@ +#include "stack.h" + +STACK *createStack(int num_items,size_t unit_size) +{ + STACK *newStack = (STACK *)malloc(1*sizeof(STACK)); + + newStack->block_list = NULL; + newStack->items_per_block = num_items; + newStack->item_size = unit_size; + newStack->item_c = 0; + return newStack; +} + +void emptyStack(STACK *astack) +{ + BLOCK_STARTER *block; + if(!astack||!astack->block_list) + return; + + block = astack->block_list; + if(block->next) + block = block->next; + + astack->block_list = block; + astack->item_c = 0; + astack->index_in_block = 0; +} + +void freeStack(STACK *astack) +{ + BLOCK_STARTER *ite_block,*temp_block; + + if(!astack) + return; + + ite_block = astack->block_list; + if(ite_block){ + while(ite_block->next) + ite_block = ite_block->next; + } + while(ite_block){ + temp_block = ite_block; + ite_block = ite_block->prev; + free((void *)temp_block); + } + + free((void *)astack); +} + +void stackBackup(STACK *astack) +{ + astack->block_backup = astack->block_list; + astack->index_backup = astack->index_in_block; + astack->item_c_backup = astack->item_c; +} + +void stackRecover(STACK 
*astack) +{ + astack->block_list = astack->block_backup; + astack->index_in_block = astack->index_backup; + astack->item_c = astack->item_c_backup; +} + +void *stackPop(STACK *astack) +{ + BLOCK_STARTER *block; + + if(!astack||!astack->block_list||!astack->item_c) + return NULL; + + astack->item_c--; + block = astack->block_list; + if(astack->index_in_block==1){ + if(block->next){ + astack->block_list = block->next; + astack->index_in_block = astack->items_per_block; + }else{ + astack->index_in_block = 0; + astack->item_c = 0; + } + return (void *)((void *)block+sizeof(BLOCK_STARTER)); + + } + return (void *)((void *)block+sizeof(BLOCK_STARTER)+astack->item_size*(--astack->index_in_block)); +} + +void *stackPush(STACK *astack) +{ + BLOCK_STARTER *block; + + if(!astack) + return NULL; + + astack->item_c++; + if(!astack->block_list||(astack->index_in_block==astack->items_per_block&&!astack->block_list->prev)){ + block = malloc(sizeof(BLOCK_STARTER)+astack->items_per_block*astack->item_size); + block->prev = NULL; + if(astack->block_list) + astack->block_list->prev = block; + block->next = astack->block_list; + astack->block_list = block; + astack->index_in_block = 1; + return (void *)((void *)block+sizeof(BLOCK_STARTER)); + }else if(astack->index_in_block==astack->items_per_block&&astack->block_list->prev){ + astack->block_list = astack->block_list->prev; + astack->index_in_block = 1; + return (void *)((void *)astack->block_list+sizeof(BLOCK_STARTER)); + } + + block = astack->block_list; + return (void *)((void *)block+sizeof(BLOCK_STARTER)+astack->item_size*astack->index_in_block++); + +} diff --git a/sparsePregraph/Makefile b/sparsePregraph/Makefile index 9c88a75..980a616 100644 --- a/sparsePregraph/Makefile +++ b/sparsePregraph/Makefile @@ -1,8 +1,8 @@ -CC= g++ # /opt/blc/gcc-4.5.0/bin/gcc #gcc +CC= g++ ifdef debug -CFLAGS= -O0 -g -fomit-frame-pointer #-static #-mcrc32 -march=core2 -msse4.1 -msse4.2 +CFLAGS= -O0 -g -fomit-frame-pointer #-mcrc32 else -CFLAGS= -O4 
-fomit-frame-pointer #-static #-mcrc32 -march=core2 -msse4.1 -msse4.2 +CFLAGS= -O3 -fomit-frame-pointer -w #-mcrc32 endif DFLAGS= @@ -37,15 +37,6 @@ EXTRA_FLAGS += -Wl,--hash-style=both LIBS += -lbam endif -ifneq (,$(findstring Unix,$(shell uname))) -EXTRA_FLAGS += -Wl,--hash-style=both -LIBS += -lbam -lrt -endif - -ifneq (,$(findstring Darwin,$(shell uname))) -LIBS += -lbammac -endif - ifneq (,$(findstring $(shell uname -m), x86_64)) CFLAGS += -m64 endif @@ -61,26 +52,23 @@ endif .SUFFIXES:.cpp .o .cpp.o: - @printf "Compiling $<... \r"; \ - $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< || echo "Error in command: $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $<" + @printf "Compiling $<... \r" + @$(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< || echo "Error in command: $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $<" all: clean $(OBJS) + @printf "$(PROG) objects generated. \n" #pregraph_sparse -.PHONY:all clean install +.PHONY:all clean envTest: @test $(BIT_ERR) != 1 || sh -c 'echo "Fatal: 64bit CPU and Operating System required!";false;' -pregraph_sparse: clean envTest $(OBJS) - @printf "Linking... \r" - #@$(CC) $(CFLAGS)$(INCLUDES) -o $(PROG) $(OBJS) $(LIBPATH) $(LIBS) $(ENTRAFLAGS) - @printf "$(PROG) compilation done.\n"; +pregraph_sparse: clean envTest $(OBJS) + @printf "Linking... \r" + @$(CC) $(CFLAGS)$(INCLUDES) -o $(PROG) $(OBJS) $(LIBPATH) $(LIBS) $(ENTRAFLAGS) + @printf "$(PROG) compilation done. \n" clean: - @rm -fr gmon.out *.o a.out *.exe *.dSYM $(PROG) *~ *.a *.so.* *.so *.dylib - @printf "$(PROG) cleaning done.\n"; - -install: - @cp $(PROG) ../bin/ - @printf "$(PROG) installed at ../bin/$(PROG)\n" + @rm -fr gmon.out *.o a.out $(PROG) + @printf "$(PROG) cleaning done. \n" diff --git a/sparsePregraph/build_edge.cpp b/sparsePregraph/build_edge.cpp index 2b9ceab..f6639bb 100644 --- a/sparsePregraph/build_edge.cpp +++ b/sparsePregraph/build_edge.cpp @@ -1,7 +1,7 @@ /* * build_edge.cpp * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . 
* * This file is part of SOAPdenovo. * diff --git a/sparsePregraph/build_preArc.cpp b/sparsePregraph/build_preArc.cpp index ad4f2ca..c271e26 100644 --- a/sparsePregraph/build_preArc.cpp +++ b/sparsePregraph/build_preArc.cpp @@ -1,7 +1,7 @@ /* * build_preArc.cpp * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/sparsePregraph/change.log b/sparsePregraph/change.log deleted file mode 100644 index 99a26ff..0000000 --- a/sparsePregraph/change.log +++ /dev/null @@ -1,24 +0,0 @@ -1.change the edge node - -old: -struct edge_node -{ - uint64_t edge£º50£¬edge_cov:7,len:6,used:1£» - struct edge_node *nxt_edge; -}; - -now: -struct edge_node -{ - uint64_t edge; - uint64_t edge_cov:7,len:6,used:1,deleted:1; - struct edge_node *nxt_edge; -}; - -so, the LoadGraph... function can't work when performed on an old hash data set. - - -2. support bam format -3. support -R -4. support 127mer -5. build vertex K_size -> gap . \ No newline at end of file diff --git a/sparsePregraph/convert_soapdenovo.cpp b/sparsePregraph/convert_soapdenovo.cpp index 89852c8..f9ec777 100644 --- a/sparsePregraph/convert_soapdenovo.cpp +++ b/sparsePregraph/convert_soapdenovo.cpp @@ -1,7 +1,7 @@ /* * convert_soapdenovo.cpp * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/sparsePregraph/global.cpp b/sparsePregraph/global.cpp index 2a49afd..18c9a23 100644 --- a/sparsePregraph/global.cpp +++ b/sparsePregraph/global.cpp @@ -1,7 +1,7 @@ /* * global.cpp * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/sparsePregraph/inc/build_edge.h b/sparsePregraph/inc/build_edge.h index 2b6fa03..adb27c6 100644 --- a/sparsePregraph/inc/build_edge.h +++ b/sparsePregraph/inc/build_edge.h @@ -1,7 +1,7 @@ /* * inc/sparse_kmer.h * - * Copyright (c) 2008-2012 BGI-Shenzhen . 
+ * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/sparsePregraph/inc/build_graph.h b/sparsePregraph/inc/build_graph.h index 6ff5818..b603331 100644 --- a/sparsePregraph/inc/build_graph.h +++ b/sparsePregraph/inc/build_graph.h @@ -1,7 +1,7 @@ /* * inc/build_graph.h * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/sparsePregraph/inc/build_preArc.h b/sparsePregraph/inc/build_preArc.h index df683ca..7513739 100644 --- a/sparsePregraph/inc/build_preArc.h +++ b/sparsePregraph/inc/build_preArc.h @@ -1,7 +1,7 @@ /* * inc/build_preArc.h * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/sparsePregraph/inc/convert_soapdenovo.h b/sparsePregraph/inc/convert_soapdenovo.h index 05fe016..fb7768c 100644 --- a/sparsePregraph/inc/convert_soapdenovo.h +++ b/sparsePregraph/inc/convert_soapdenovo.h @@ -1,7 +1,7 @@ /* * inc/convert_soapdenovo.h * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/sparsePregraph/inc/global.h b/sparsePregraph/inc/global.h index eebaeca..94ee249 100644 --- a/sparsePregraph/inc/global.h +++ b/sparsePregraph/inc/global.h @@ -1,7 +1,7 @@ /* * inc/global.h * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/sparsePregraph/inc/io_func.h b/sparsePregraph/inc/io_func.h index 24066fd..105eb3f 100644 --- a/sparsePregraph/inc/io_func.h +++ b/sparsePregraph/inc/io_func.h @@ -1,7 +1,7 @@ /* * inc/io_func.h * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. 
* diff --git a/sparsePregraph/inc/libcurses.a b/sparsePregraph/inc/libcurses.a deleted file mode 100644 index a3863b8..0000000 Binary files a/sparsePregraph/inc/libcurses.a and /dev/null differ diff --git a/sparsePregraph/inc/multi_threads.h b/sparsePregraph/inc/multi_threads.h index 2155d91..68e8f8e 100644 --- a/sparsePregraph/inc/multi_threads.h +++ b/sparsePregraph/inc/multi_threads.h @@ -1,7 +1,7 @@ /* * inc/multi_threads.h * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/sparsePregraph/inc/sparse_kmer.h b/sparsePregraph/inc/sparse_kmer.h index d013ded..650d376 100644 --- a/sparsePregraph/inc/sparse_kmer.h +++ b/sparsePregraph/inc/sparse_kmer.h @@ -1,7 +1,7 @@ /* * inc/sparse_kmer.h * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/sparsePregraph/inc/stdinc.h b/sparsePregraph/inc/stdinc.h index 5cd64c0..3a02528 100644 --- a/sparsePregraph/inc/stdinc.h +++ b/sparsePregraph/inc/stdinc.h @@ -1,7 +1,7 @@ /* * inc/stdinc.h * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/sparsePregraph/inc/xcurses.h.gch b/sparsePregraph/inc/xcurses.h.gch deleted file mode 100644 index e01c34a..0000000 Binary files a/sparsePregraph/inc/xcurses.h.gch and /dev/null differ diff --git a/sparsePregraph/io_func.cpp b/sparsePregraph/io_func.cpp index 6832bce..349ff87 100644 --- a/sparsePregraph/io_func.cpp +++ b/sparsePregraph/io_func.cpp @@ -1,7 +1,7 @@ /* * io_func.cpp * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. 
* diff --git a/sparsePregraph/multi_threads.cpp b/sparsePregraph/multi_threads.cpp index 5aa755d..0847f51 100644 --- a/sparsePregraph/multi_threads.cpp +++ b/sparsePregraph/multi_threads.cpp @@ -1,7 +1,7 @@ /* * multi_threads.cpp * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/sparsePregraph/pregraph_sparse.cpp b/sparsePregraph/pregraph_sparse.cpp index cd422f3..d425c97 100644 --- a/sparsePregraph/pregraph_sparse.cpp +++ b/sparsePregraph/pregraph_sparse.cpp @@ -1,7 +1,7 @@ /* * pregraph_sparse.cpp * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/Makefile b/standardPregraph/Makefile index 9716391..421f400 100644 --- a/standardPregraph/Makefile +++ b/standardPregraph/Makefile @@ -9,9 +9,9 @@ CC= gcc GCCVERSIONMAJOR := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCVERSIONMINOR := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 4) ifdef debug -CFLAGS= -O0 -g -fomit-frame-pointer #-static #-mcrc32 -march=core2 -msse4.1 -msse4.2 +CFLAGS= -O0 -g -fomit-frame-pointer #-mcrc32 else -CFLAGS= -O4 -fomit-frame-pointer #-static #-mcrc32 -march=core2 -msse4.1 -msse4.2 +CFLAGS= -O3 -fomit-frame-pointer -w #-mcrc32 endif DFLAGS= OBJS= arc.o attachPEinfo.o bubble.o check.o compactEdge.o \ @@ -49,15 +49,6 @@ EXTRA_FLAGS += -Wl,--hash-style=both LIBS += -lbam -lrt endif -ifneq (,$(findstring Unix,$(shell uname))) -EXTRA_FLAGS += -Wl,--hash-style=both -LIBS += -lbam -lrt -endif - -ifneq (,$(findstring Darwin,$(shell uname))) -LIBS += -lbammac -endif - ifneq (,$(findstring $(shell uname -m), x86_64)) CFLAGS += -m64 endif @@ -73,12 +64,11 @@ endif .SUFFIXES:.c .o .c.o: - @printf "Compiling $<... \r"; \ - $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< || echo "Error in command: $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $<" + @printf "Compiling $<... 
\r" + @$(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< || echo "Error in command: $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $<" all: clean $(OBJS) - -#SOAPdenovo + @printf "$(PROG) objects generated. \n" .PHONY:all clean install @@ -88,14 +78,10 @@ envTest: @test $(GCCVERSIONMINOR) == 1 || sh -c 'echo "GCC version lower than 4.4.0";false;' SOAPdenovo: envTest $(OBJS) - @printf "Linking... \r" + @printf "Linking... \r" @$(CC) $(CFLAGS) -o $(PROG) $(OBJS) $(LIBPATH) $(LIBS) $(ENTRAFLAGS) - @printf "$(PROG) compilation done.\n"; + @printf "$(PROG) compilation done. \n" clean: @rm -fr gmon.out *.o a.out *.exe *.dSYM $(PROG) *~ *.a *.so.* *.so *.dylib - @printf "$(PROG) cleaning done.\n"; - -install: - @cp $(PROG) ../bin/ - @printf "$(PROG) installed at ../bin/$(PROG)\n" + @printf "$(PROG) cleaning done. \n" diff --git a/standardPregraph/arc.c b/standardPregraph/arc.c index 32cf554..08649a6 100644 --- a/standardPregraph/arc.c +++ b/standardPregraph/arc.c @@ -1,7 +1,7 @@ /* * arc.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/attachPEinfo.c b/standardPregraph/attachPEinfo.c index 77105d5..f1ac7cf 100644 --- a/standardPregraph/attachPEinfo.c +++ b/standardPregraph/attachPEinfo.c @@ -1,7 +1,7 @@ /* * attachPEinfo.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/bubble.c b/standardPregraph/bubble.c index dd83a4c..c7abf7f 100644 --- a/standardPregraph/bubble.c +++ b/standardPregraph/bubble.c @@ -1,7 +1,7 @@ /* * bubble.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. 
* diff --git a/standardPregraph/check.c b/standardPregraph/check.c index be06fba..cf39402 100644 --- a/standardPregraph/check.c +++ b/standardPregraph/check.c @@ -1,7 +1,7 @@ /* * check.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/compactEdge.c b/standardPregraph/compactEdge.c index edf9824..6979760 100644 --- a/standardPregraph/compactEdge.c +++ b/standardPregraph/compactEdge.c @@ -1,7 +1,7 @@ /* * compactEdge.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/concatenateEdge.c b/standardPregraph/concatenateEdge.c index c795e46..18fa761 100644 --- a/standardPregraph/concatenateEdge.c +++ b/standardPregraph/concatenateEdge.c @@ -1,7 +1,7 @@ /* * concatenateEdge.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/connect.c b/standardPregraph/connect.c index 1f10a8c..60ac7bb 100644 --- a/standardPregraph/connect.c +++ b/standardPregraph/connect.c @@ -1,7 +1,7 @@ /* * connect.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/contig.c b/standardPregraph/contig.c index f1ce229..6d8c0c1 100644 --- a/standardPregraph/contig.c +++ b/standardPregraph/contig.c @@ -1,7 +1,7 @@ /* * contig.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/cutTipPreGraph.c b/standardPregraph/cutTipPreGraph.c index b1fb2c8..1594be9 100644 --- a/standardPregraph/cutTipPreGraph.c +++ b/standardPregraph/cutTipPreGraph.c @@ -1,7 +1,7 @@ /* * cutTipPreGraph.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . 
* * This file is part of SOAPdenovo. * diff --git a/standardPregraph/cutTip_graph.c b/standardPregraph/cutTip_graph.c index 651c1f8..4cfc0be 100644 --- a/standardPregraph/cutTip_graph.c +++ b/standardPregraph/cutTip_graph.c @@ -1,7 +1,7 @@ /* * cutTip_graph.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/cutTip_graph2.c b/standardPregraph/cutTip_graph2.c index 9ab776c..12da91b 100644 --- a/standardPregraph/cutTip_graph2.c +++ b/standardPregraph/cutTip_graph2.c @@ -1,7 +1,7 @@ /* * cutTip_graph2.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/darray.c b/standardPregraph/darray.c index 66ca668..46fdad0 100644 --- a/standardPregraph/darray.c +++ b/standardPregraph/darray.c @@ -1,7 +1,7 @@ /* * darray.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/dfibHeap.c b/standardPregraph/dfibHeap.c index 900171d..5399b70 100644 --- a/standardPregraph/dfibHeap.c +++ b/standardPregraph/dfibHeap.c @@ -1,7 +1,7 @@ /* * dfibHeap.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/fibHeap.c b/standardPregraph/fibHeap.c index f690fb8..51f4e21 100644 --- a/standardPregraph/fibHeap.c +++ b/standardPregraph/fibHeap.c @@ -1,7 +1,7 @@ /* * fibHeap.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/hashFunction.c b/standardPregraph/hashFunction.c index bc7065d..6de56cc 100644 --- a/standardPregraph/hashFunction.c +++ b/standardPregraph/hashFunction.c @@ -1,7 +1,7 @@ /* * hashFunction.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . 
+ * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/inc/check.h b/standardPregraph/inc/check.h index 278620a..322d809 100644 --- a/standardPregraph/inc/check.h +++ b/standardPregraph/inc/check.h @@ -1,7 +1,7 @@ /* * inc/check.h * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/inc/darray.h b/standardPregraph/inc/darray.h index a3a4a35..eee1f0d 100644 --- a/standardPregraph/inc/darray.h +++ b/standardPregraph/inc/darray.h @@ -1,7 +1,7 @@ /* * inc/darray.h * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/inc/def.h b/standardPregraph/inc/def.h index c4ba677..4881b8f 100644 --- a/standardPregraph/inc/def.h +++ b/standardPregraph/inc/def.h @@ -1,7 +1,7 @@ /* * inc/def.h * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/inc/def2.h b/standardPregraph/inc/def2.h index ead7cfd..bca6390 100644 --- a/standardPregraph/inc/def2.h +++ b/standardPregraph/inc/def2.h @@ -1,7 +1,7 @@ /* * inc/def2.h * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/inc/dfibHeap.h b/standardPregraph/inc/dfibHeap.h index 53b6ff5..399702d 100644 --- a/standardPregraph/inc/dfibHeap.h +++ b/standardPregraph/inc/dfibHeap.h @@ -1,7 +1,7 @@ /* * inc/dfibHeap.h * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. 
* diff --git a/standardPregraph/inc/extfunc.h b/standardPregraph/inc/extfunc.h index b687787..71a1819 100644 --- a/standardPregraph/inc/extfunc.h +++ b/standardPregraph/inc/extfunc.h @@ -1,7 +1,7 @@ /* * inc/extfunc.h * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/inc/extfunc2.h b/standardPregraph/inc/extfunc2.h index 1d839a0..5fa5c56 100644 --- a/standardPregraph/inc/extfunc2.h +++ b/standardPregraph/inc/extfunc2.h @@ -1,7 +1,7 @@ /* * inc/extfunc2.h * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/inc/extvab.h b/standardPregraph/inc/extvab.h index f7a5ce0..9c9b7f3 100644 --- a/standardPregraph/inc/extvab.h +++ b/standardPregraph/inc/extvab.h @@ -1,7 +1,7 @@ /* * inc/extvab.h * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/inc/fibHeap.h b/standardPregraph/inc/fibHeap.h index 9a940b0..9869cec 100644 --- a/standardPregraph/inc/fibHeap.h +++ b/standardPregraph/inc/fibHeap.h @@ -1,7 +1,7 @@ /* * inc/fibHeap.h * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/inc/global.h b/standardPregraph/inc/global.h index 7e0b7f2..f550770 100644 --- a/standardPregraph/inc/global.h +++ b/standardPregraph/inc/global.h @@ -1,7 +1,7 @@ /* * inc/global.h * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/inc/kmerhash.h b/standardPregraph/inc/kmerhash.h index 7421119..27f8e36 100644 --- a/standardPregraph/inc/kmerhash.h +++ b/standardPregraph/inc/kmerhash.h @@ -1,7 +1,7 @@ /* * inc/kmerhash.h * - * Copyright (c) 2008-2012 BGI-Shenzhen . 
+ * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/inc/libbammac.a b/standardPregraph/inc/libbammac.a new file mode 100644 index 0000000..9952f9d Binary files /dev/null and b/standardPregraph/inc/libbammac.a differ diff --git a/standardPregraph/inc/newhash.h b/standardPregraph/inc/newhash.h index 8b5d17b..2fb0a5e 100644 --- a/standardPregraph/inc/newhash.h +++ b/standardPregraph/inc/newhash.h @@ -1,7 +1,7 @@ /* * inc/newhash.h * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/inc/nuc.h b/standardPregraph/inc/nuc.h index 7762cf8..d0044b4 100644 --- a/standardPregraph/inc/nuc.h +++ b/standardPregraph/inc/nuc.h @@ -1,7 +1,7 @@ /* * inc/nuc.h * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/inc/stack.h b/standardPregraph/inc/stack.h index 4973260..f3a501a 100644 --- a/standardPregraph/inc/stack.h +++ b/standardPregraph/inc/stack.h @@ -1,7 +1,7 @@ /* * inc/stack.h * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/inc/stdinc.h b/standardPregraph/inc/stdinc.h index dbef7c3..67ca028 100644 --- a/standardPregraph/inc/stdinc.h +++ b/standardPregraph/inc/stdinc.h @@ -1,7 +1,7 @@ /* * inc/stdinc.h * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/inc/types.h b/standardPregraph/inc/types.h index f3f821d..dc4dc59 100644 --- a/standardPregraph/inc/types.h +++ b/standardPregraph/inc/types.h @@ -1,7 +1,7 @@ /* * inc/types.h * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. 
* diff --git a/standardPregraph/iterate.c b/standardPregraph/iterate.c index 610b253..4a7e80b 100644 --- a/standardPregraph/iterate.c +++ b/standardPregraph/iterate.c @@ -1,7 +1,7 @@ /* * iterate.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/kmer.c b/standardPregraph/kmer.c index 8a80ca4..3588325 100644 --- a/standardPregraph/kmer.c +++ b/standardPregraph/kmer.c @@ -1,7 +1,7 @@ /* * kmer.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/kmerhash.c b/standardPregraph/kmerhash.c index 8266dd0..8a49282 100644 --- a/standardPregraph/kmerhash.c +++ b/standardPregraph/kmerhash.c @@ -1,7 +1,7 @@ /* * kmerhash.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/lib.c b/standardPregraph/lib.c index 95e4eb7..76c361b 100644 --- a/standardPregraph/lib.c +++ b/standardPregraph/lib.c @@ -1,7 +1,7 @@ /* * lib.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/linearEdge.c b/standardPregraph/linearEdge.c index 95772e9..0602d37 100644 --- a/standardPregraph/linearEdge.c +++ b/standardPregraph/linearEdge.c @@ -1,7 +1,7 @@ /* * linearEdge.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/loadGraph.c b/standardPregraph/loadGraph.c index f9dafdd..0229752 100644 --- a/standardPregraph/loadGraph.c +++ b/standardPregraph/loadGraph.c @@ -1,7 +1,7 @@ /* * loadGraph.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. 
* diff --git a/standardPregraph/loadPath.c b/standardPregraph/loadPath.c index 38d280a..506884f 100644 --- a/standardPregraph/loadPath.c +++ b/standardPregraph/loadPath.c @@ -1,7 +1,7 @@ /* * loadPath.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/loadPreGraph.c b/standardPregraph/loadPreGraph.c index 1422eab..90f9ab4 100644 --- a/standardPregraph/loadPreGraph.c +++ b/standardPregraph/loadPreGraph.c @@ -1,7 +1,7 @@ /* * loadPreGraph.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/localAsm.c b/standardPregraph/localAsm.c index e61d9e4..78f688e 100644 --- a/standardPregraph/localAsm.c +++ b/standardPregraph/localAsm.c @@ -1,7 +1,7 @@ /* * localAsm.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/main.c b/standardPregraph/main.c index 98170e4..b93e8c5 100644 --- a/standardPregraph/main.c +++ b/standardPregraph/main.c @@ -1,7 +1,7 @@ /* * main.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/map.c b/standardPregraph/map.c index c762ec3..4e0ebe3 100644 --- a/standardPregraph/map.c +++ b/standardPregraph/map.c @@ -1,7 +1,7 @@ /* * map.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/mem_manager.c b/standardPregraph/mem_manager.c index 231dadb..ad86662 100644 --- a/standardPregraph/mem_manager.c +++ b/standardPregraph/mem_manager.c @@ -1,7 +1,7 @@ /* * mem_manager.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. 
* diff --git a/standardPregraph/newhash.c b/standardPregraph/newhash.c index 48cbdb8..b0700d2 100644 --- a/standardPregraph/newhash.c +++ b/standardPregraph/newhash.c @@ -1,7 +1,7 @@ /* * newhash.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/node2edge.c b/standardPregraph/node2edge.c index df5ae73..c992383 100644 --- a/standardPregraph/node2edge.c +++ b/standardPregraph/node2edge.c @@ -1,7 +1,7 @@ /* * node2edge.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/output_contig.c b/standardPregraph/output_contig.c index de5d0c5..257e3db 100644 --- a/standardPregraph/output_contig.c +++ b/standardPregraph/output_contig.c @@ -1,7 +1,7 @@ /* * output_contig.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/output_pregraph.c b/standardPregraph/output_pregraph.c index f553885..d9a1a86 100644 --- a/standardPregraph/output_pregraph.c +++ b/standardPregraph/output_pregraph.c @@ -1,7 +1,7 @@ /* * 31mer/output_pregraph.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/output_scaffold.c b/standardPregraph/output_scaffold.c index ada9486..ef3dd74 100644 --- a/standardPregraph/output_scaffold.c +++ b/standardPregraph/output_scaffold.c @@ -1,7 +1,7 @@ /* * output_scaffold.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/pregraph.c b/standardPregraph/pregraph.c index c4151b3..f23a78d 100644 --- a/standardPregraph/pregraph.c +++ b/standardPregraph/pregraph.c @@ -1,7 +1,7 @@ /* * pregraph.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . 
+ * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/prlHashCtg.c b/standardPregraph/prlHashCtg.c index f14c7c7..e956df1 100644 --- a/standardPregraph/prlHashCtg.c +++ b/standardPregraph/prlHashCtg.c @@ -1,7 +1,7 @@ /* * prlHashCtg.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/prlHashReads.c b/standardPregraph/prlHashReads.c index 6a9158e..1e1393d 100644 --- a/standardPregraph/prlHashReads.c +++ b/standardPregraph/prlHashReads.c @@ -1,7 +1,7 @@ /* * prlHashReads.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/prlRead2Ctg.c b/standardPregraph/prlRead2Ctg.c index b0c70bc..50a29c3 100644 --- a/standardPregraph/prlRead2Ctg.c +++ b/standardPregraph/prlRead2Ctg.c @@ -1,7 +1,7 @@ /* * prlRead2Ctg.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/prlRead2path.c b/standardPregraph/prlRead2path.c index f584e2a..ef4e9fe 100644 --- a/standardPregraph/prlRead2path.c +++ b/standardPregraph/prlRead2path.c @@ -1,7 +1,7 @@ /* * prlRead2path.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/prlReadFillGap.c b/standardPregraph/prlReadFillGap.c index b1de77a..7b2afca 100644 --- a/standardPregraph/prlReadFillGap.c +++ b/standardPregraph/prlReadFillGap.c @@ -1,7 +1,7 @@ /* * prlReadFillGap.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. 
* diff --git a/standardPregraph/read2edge.c b/standardPregraph/read2edge.c index dcea79b..dffb76d 100644 --- a/standardPregraph/read2edge.c +++ b/standardPregraph/read2edge.c @@ -1,7 +1,7 @@ /* * read2edge.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/read2scaf.c b/standardPregraph/read2scaf.c index 00e9827..0abca1e 100644 --- a/standardPregraph/read2scaf.c +++ b/standardPregraph/read2scaf.c @@ -1,7 +1,7 @@ /* * read2scaf.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/readInterval.c b/standardPregraph/readInterval.c index 362c54f..76d9030 100644 --- a/standardPregraph/readInterval.c +++ b/standardPregraph/readInterval.c @@ -1,7 +1,7 @@ /* * readInterval.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/readseq1by1.c b/standardPregraph/readseq1by1.c index 9d0e655..6fbac39 100644 --- a/standardPregraph/readseq1by1.c +++ b/standardPregraph/readseq1by1.c @@ -1,7 +1,7 @@ /* * readseq1by1.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/scaffold.c b/standardPregraph/scaffold.c index c712bae..7054dc4 100644 --- a/standardPregraph/scaffold.c +++ b/standardPregraph/scaffold.c @@ -1,7 +1,7 @@ /* * scaffold.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/searchPath.c b/standardPregraph/searchPath.c index 72981d2..087ac8c 100644 --- a/standardPregraph/searchPath.c +++ b/standardPregraph/searchPath.c @@ -1,7 +1,7 @@ /* * searchPath.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . 
* * This file is part of SOAPdenovo. * diff --git a/standardPregraph/seq.c b/standardPregraph/seq.c index a24ce6e..f3ae144 100644 --- a/standardPregraph/seq.c +++ b/standardPregraph/seq.c @@ -1,7 +1,7 @@ /* * seq.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/splitReps.c b/standardPregraph/splitReps.c index 46ebf6d..1c8628b 100644 --- a/standardPregraph/splitReps.c +++ b/standardPregraph/splitReps.c @@ -1,7 +1,7 @@ /* * splitReps.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/standardPregraph/stack.c b/standardPregraph/stack.c index 04f2b6e..455cde8 100644 --- a/standardPregraph/stack.c +++ b/standardPregraph/stack.c @@ -1,7 +1,7 @@ /* * stack.c * - * Copyright (c) 2008-2012 BGI-Shenzhen . + * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * diff --git a/update.log b/update.log deleted file mode 100644 index 5f6e8b5..0000000 --- a/update.log +++ /dev/null @@ -1,91 +0,0 @@ -r240 | 2013-07-09 11:30:03 +0800 (Tue, 09 Jul 2013) - -Fix a bug in reading files in 'map' step. This bug might lead to seg fault. - ------------------------------------------------------------------------- -r239 | 2013-06-26 09:41:39 +0800 (Wed, 26 Jun 2013) - -1) Fix the bug of reading fasta file in map step. This bug was introduced when - fixing a bug of reading fastq file in r238. - ------------------------------------------------------------------------- -r224 - r238 | 2013-06-13 - -1) Fix a serious bug in 'map' step of version r223. This bug can lead - to incorrect pairing of PE reads in LIB of even order, e.g., the - 2nd LIB, the 4th LIB and so on...And these affected LIBs may not - contribute to the construction of scaffold. -2) Merge 'standPregraph' and 'sparsePregraph'. Now, there are only two - executable programs: SOAPdenovo-63mer and SOAPdenovo-127mer. 
User - can choose to use 'pregraph' for standard Kmer graph or - 'sparse_pregraph' for sparse Kmer graph. -3) Add an option for debug version compilation. User can use - 'make debug=1' to obtain programs for debug. -4) Fix a bug in sorting edges in 'contig' step. -5) Fix a bug in reading files when using multi-kmer. Now the - 'max_read_length' will change according to the LIB being red. - ------------------------------------------------------------------------- -r223 | 2012-12-28 10:11:43 +0800 (Fri, 28 Dec 2012) - -Fix the problem that parameter k doesn't work when k is larger than 63 -for 127mer version. - ------------------------------------------------------------------------- -r222 | 2012-12-21 14:45:49 +0800 (Fri, 21 Dec 2012) - -1) Change some codes so that program can handle reads longer than 5000. -2) Add a new perl script which can seperate singletons from scaffolds in - *.scafSeq file. - ------------------------------------------------------------------------- -r221 | 2012-12-07 14:27:02 +0800 (Fri, 07 Dec 2012) - -Fix a bug in reading files which might cause zombie process. - ------------------------------------------------------------------------- -r220 | 2012-11-26 10:09:45 +0800 (Mon, 26 Nov 2012) - -Fix bug in aio that the buffer was not enough for fq for long reads. - ------------------------------------------------------------------------- -r219 | 2012-11-08 12:58:45 +0800 (Thu, 08 Nov 2012) - -Fix a bug that using -r 1 will casuse the infomation loss of MaxReadLen -and MinReadLen in *.preGraphBasic file in pregraph_sparse module. - ------------------------------------------------------------------------- -r218 | 2012-11-08 11:04:54 +0800 (Thu, 08 Nov 2012) - -Output palindrome sequence only once now instead of twice before in -pregraph_sparse module. 
- ------------------------------------------------------------------------- -r217 | 2012-11-01 13:09:50 +0800 (Thu, 01 Nov 2012) - -Fix bug in scaffolding which may lead to scaffold consisting of none -or only one contig. - ------------------------------------------------------------------------- -r216 | 2012-10-31 14:53:29 +0800 (Wed, 31 Oct 2012) - -Fix a bug of 'pregraph-sparse' which may lead to segmentation fault in -'contig' step if option -R is set and there are reads longer than 100bp. - ------------------------------------------------------------------------- -r215 | 2012-10-16 18:53:28 +0800 (Tue, 16 Oct 2012) - -Fix a bug of aio which happens rarely in 'pregraph' step when there are -reads shorter than Kmer. - ------------------------------------------------------------------------- -r214 | 2012-10-08 15:58:09 +0800 (Mon, 08 Oct 2012) - -Modify usage description of '-V'. - ------------------------------------------------------------------------- -r213 | 2012-09-29 09:24:32 +0800 (Sat, 29 Sep 2012) - -Fix a bug which might happen in 'contig' step if the 'pregraph-sparse' is -used to replace the regular 'pregraph'. -