diff --git a/Makefile b/Makefile
index 8874681..8e8f80e 100644
--- a/Makefile
+++ b/Makefile
@@ -1,12 +1,13 @@
+MAKEFLAGS += --no-print-directory
CC = g++
ifdef debug
CFLAGS= -O0 -g -fomit-frame-pointer
else
-CFLAGS= -O4 -fomit-frame-pointer
+CFLAGS= -O3 -fomit-frame-pointer
endif
-SUBDIRS = sparsePregraph standardPregraph
-PROG= SOAPdenovo-63mer SOAPdenovo-127mer
+SUBDIRS = sparsePregraph standardPregraph fusion
+PROG= SOAPdenovo-63mer SOAPdenovo-127mer SOAPdenovo-fusion
INCLUDES= -I./sparsePregraph/inc -I./standardPregraph/inc
LIBPATH= -L/lib64 -L/usr/lib64 -L./sparsePregraph/inc -L./standardPregraph/inc
@@ -23,15 +24,6 @@ EXTRA_FLAGS += -Wl,--hash-style=both
LIBS += -lbam -lrt
endif
-ifneq (,$(findstring Unix,$(shell uname)))
-EXTRA_FLAGS += -Wl,--hash-style=both
-LIBS += -lbam -lrt
-endif
-
-ifneq (,$(findstring Darwin,$(shell uname)))
-LIBS += -lbammac
-endif
-
ifneq (,$(findstring $(shell uname -m), x86_64))
CFLAGS += -m64
endif
@@ -45,7 +37,10 @@ CFLAGS += -mpowerpc64
endif
-all: SOAPdenovo-63mer SOAPdenovo-127mer
+all: SOAPdenovo-63mer SOAPdenovo-127mer SOAPdenovo-fusion
+
+# Build the fusion module in its own directory and copy the binary up here.
+# $(MAKE) (rather than a literal `make`) forwards -n/-j to the sub-make, and
+# `&&` aborts the recipe at the first failing step; with `;` a failed build or
+# copy was masked by the trailing `cd ..` and reported as success.
+# clean and build are separate invocations so they can never run concurrently.
+SOAPdenovo-fusion:
+	@$(MAKE) -C fusion clean && $(MAKE) -C fusion SOAPdenovo-fusion && cp fusion/SOAPdenovo-fusion .
ifdef debug
SOAPdenovo-63mer:
@@ -56,10 +51,6 @@ SOAPdenovo-127mer:
@cd sparsePregraph;make 127mer=1 debug=1;cd ..;
@cd standardPregraph;make 127mer=1 debug=1;cd ..;
@$(CC) sparsePregraph/*.o standardPregraph/*.o $(LIBPATH) $(LIBS) $(EXTRA_FLAGS) -o SOAPdenovo-127mer
-clean:
- @cd sparsePregraph;make clean;cd ..;
- @cd standardPregraph;make clean;cd ..;
- @rm SOAPdenovo-63mer SOAPdenovo-127mer -f
else
SOAPdenovo-63mer:
@cd sparsePregraph;make 63mer=1;cd ..;
@@ -69,8 +60,10 @@ SOAPdenovo-127mer:
@cd sparsePregraph;make 127mer=1;cd ..;
@cd standardPregraph;make 127mer=1;cd ..;
@$(CC) sparsePregraph/*.o standardPregraph/*.o $(LIBPATH) $(LIBS) $(EXTRA_FLAGS) -o SOAPdenovo-127mer
+endif
+
clean:
@cd sparsePregraph;make clean;cd ..;
@cd standardPregraph;make clean;cd ..;
- @rm SOAPdenovo-63mer SOAPdenovo-127mer -f
-endif
+	@$(MAKE) -C fusion clean
+	@rm -f SOAPdenovo-63mer SOAPdenovo-127mer SOAPdenovo-fusion
+
+# all/clean name commands, not files; without this declaration a stray file
+# called "clean" would turn `make clean` into a silent no-op.
+.PHONY: all clean
diff --git a/README.md b/README.md
index e3801a6..e2ecc4d 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,19 @@
# Manual of SOAPdenovo2
-## What's next of SOAPdenovo2
+## About MEGAHIT
-MEGAHIT is the formal successor of SOAPdenovo2
+MEGAHIT works with single-cell sequencing data and metagenomics data. Compared to SOAPdenovo, it generates longer contigs and consumes less memory.
+To scaffold the contigs generated by MEGAHIT, please use SOAPdenovo-fusion. It is a preparation module that takes contigs as input and generates files that can then be used by SOAPdenovo's map and scaff modules.
+
+Reference:
MEGAHIT: An ultra-fast single-node solution for large and complex metagenomics assembly via succinct de Bruijn graph
-http://www.ncbi.nlm.nih.gov/pubmed/25609793
-https://github.com/voutcn/megahit
+[Manuscript](http://www.ncbi.nlm.nih.gov/pubmed/25609793)
+[GitHub](https://github.com/voutcn/megahit)
+
+## For Mac users
+
+Please use brew to install SOAPdenovo. SOAPdenovo's package in Homebrew-science is managed by Shaun Jackman.
## Introduction
diff --git a/fusion/Makefile b/fusion/Makefile
new file mode 100755
index 0000000..b33e154
--- /dev/null
+++ b/fusion/Makefile
@@ -0,0 +1,46 @@
+# Generated automatically from Makefile.in by configure.
+# Builds the SOAPdenovo-fusion executable from the C sources in this directory.
+SHELL = /bin/sh
+
+exec_prefix = .
+bindir = $(exec_prefix)/bin
+libdir =
+mandir =
+
+CC = gcc
+CCOPT = -O3 -fprefetch-loop-arrays -funroll-loops -fomit-frame-pointer -w
+LIBS = -lm -lpthread
+INCDIRS = -Iinc/
+CFLAGS = ${CCOPT} ${INCDIRS}
+
+# all/clean are commands, not files.
+.PHONY: all clean
+
+# `all` used to depend on `clean`, which forced a full rebuild on every run and
+# let `rm -f *.o` race with the compiles under `make -j`. Staleness is handled
+# by the header dependency further down instead.
+all: SOAPdenovo-fusion
+
+SRCS1 = searchPath.c scaffold.c check.c seq.c bundle.c potential.c\
+	loadGraph.c mem_manager.c attachPEinfo.c newhash.c\
+	output_scaffold.c orderContig.c connect.c hashFunction.c\
+	readseq1by1.c fib.c fibHeap.c stack.c kmer.c prepare.c
+# Object lists are derived from the source lists so the two cannot drift apart.
+OBJS1 = $(SRCS1:.c=.o)
+
+SRCS2 = prlHashCtg.c prlRead2Ctg.c map.c localAsm.c\
+	lib.c darray.c prlReadFillGap.c read2scaf.c
+OBJS2 = $(SRCS2:.c=.o)
+
+SRCS3 = main.c
+OBJS3 = $(SRCS3:.c=.o)
+
+# No per-file dependency information is generated, so conservatively rebuild
+# every object when any header under inc/ (the -I directory) changes.
+$(OBJS1) $(OBJS2) $(OBJS3): $(wildcard inc/*.h)
+
+%.o: %.c
+	@printf "Compiling $<... \r"
+	@$(CC) $(CFLAGS) -c $< -o $@
+
+SOAPdenovo-fusion: $(OBJS1) $(OBJS2) $(OBJS3)
+	@printf "Making $@... \r"
+	@$(CC) $(CCOPT) -o $@ $^ $(LIBS)
+	@printf "$@ compilation done.\n";
+
+clean:
+	@/bin/rm -f *.o SOAPdenovo-fusion
+	@printf "SOAPdenovo-fusion cleaning done. \n"
+
diff --git a/fusion/attachPEinfo.c b/fusion/attachPEinfo.c
new file mode 100755
index 0000000..b733e80
--- /dev/null
+++ b/fusion/attachPEinfo.c
@@ -0,0 +1,488 @@
+#include "stdinc.h"
+#include "newhash.h"
+#include "extfunc.h"
+#include "extvab.h"
+#include "stack.h"
+
+#define CNBLOCKSIZE 10000
+#define GAPARRSIZE 256
+#define BIG_NEG -10000000
+#define BIG_POS 10000000
+static STACK * isStack;
+static int ignorePE1,ignorePE2,ignorePE3,ignorePE4,ignorePE5,static_flag;
+static int onsameCtgPE;
+static unsigned long long peSUM;
+
+//static boolean staticF;
+
+static int existCounter;
+
+int calcuIS(STACK *intStack,int *SD);
+
+
+/*
+ * Three-way comparison of two PE_INFO records by their `rank` field.
+ * Returns 1, 0 or -1 when a's rank is greater than, equal to or otherwise
+ * ordered relative to b's rank; the signature is the one qsort() expects.
+ */
+static int cmp_pe(const void *a,const void *b)
+{
+	const PE_INFO *lhs = (const PE_INFO *)a;
+	const PE_INFO *rhs = (const PE_INFO *)b;
+
+	if(lhs->rank>rhs->rank)
+		return 1;
+	return (lhs->rank==rhs->rank) ? 0 : -1;
+}
+
+/*
+ * loadPEgrads - read the header of "<infile>.peGrads".
+ * If the file cannot be opened it prints a message, sets gradsCounter to 0 and
+ * returns. Otherwise it scans for the first line starting with 'g', parses three
+ * numbers from offset 10 of that line into gradsCounter, n_solexa and maxReadLen,
+ * and calls alloc_pe_mem(gradsCounter).
+ *
+ * NOTE(review): this listing is damaged. On the line starting "for(i=0;i" below,
+ * the text between that point and "255)" is missing: the rest of loadPEgrads and
+ * the header of the following function are lost. Everything after that line
+ * belongs to a different function (it uses e1, e2, gap, weight, inherit and sum,
+ * none of which are declared here, and returns a CONNECT *). Restore both
+ * functions from the original file before building.
+ */
+void loadPEgrads(char *infile)
+{
+ FILE *fp;
+ char name[256],line[1024];
+ int i;
+ boolean rankSet=1;
+
+ // NOTE(review): unbounded sprintf into name[256]; a long infile overflows it
+ sprintf(name,"%s.peGrads",infile);
+ fp = fopen(name,"r");
+ if(!fp){
+ printf("can not open file %s .\n",name);
+ gradsCounter = 0;
+ return;
+ }
+
+ // look for the summary line (first character 'g'); the numbers start at offset 10
+ while(fgets(line,sizeof(line),fp)!=NULL){
+ if(line[0] == 'g'){
+ sscanf(line+10, "%d %lld %d",&gradsCounter,&n_solexa,&maxReadLen);
+ //printf("there're %d grads, %lld reads, max read len %d\n",gradsCounter,n_solexa,maxReadLen);
+ printf("[%s]reads statistic : %lld reads with max len %d in %d grads .\n",__FUNCTION__,n_solexa,maxReadLen,gradsCounter);
+ break;
+ }
+ }
+
+ alloc_pe_mem(gradsCounter);
+
+ // NOTE(review): damaged line - source text is missing between "i" and "255)"
+ for(i=0;i255)
+ weight = 255;
+
+ // --- from here on: body of the following (connection-updating) function ---
+ connect = getCntBetween(e1, e2);
+ if(connect){
+ // connection already exists: a zero weight changes nothing
+ if(!weight)
+ return connect;
+ existCounter++;
+ if(!inherit){
+ // gapLen becomes the weight-averaged gap over non-inherited evidence
+ sum = connect->weightNotInherit*connect->gapLen + gap*weight;
+ connect->gapLen = sum/(connect->weightNotInherit+weight);
+ // weightNotInherit saturates at 255
+ if(connect->weightNotInherit+weight <=255)
+ connect->weightNotInherit += weight;
+ else if(connect->weightNotInherit<255)
+ connect->weightNotInherit = 255;
+ }else{
+ // inherited evidence: average against the total weight instead
+ sum = connect->weight*connect->gapLen + gap*weight;
+ connect->gapLen = sum/(connect->weight+weight);
+ if(!connect->inherit){
+ connect->maxSingleWeight = connect->weightNotInherit;
+ }
+ connect->inherit = 1;
+ connect->maxSingleWeight = connect->maxSingleWeight>weight ?
+ connect->maxSingleWeight:weight;
+ }
+ // total weight saturates at 255 as well
+ if(connect->weight+weight <=255){
+ connect->weight += weight;
+ }else if(connect->weight<255){
+ connect->weight = 255;
+ }
+
+ }else{
+ // no connection yet: allocate one, register it in the lookup table (if any)
+ // and push it onto the head of e1's downwardConnect list
+ newCntCounter++;
+ connect = allocateCN(e2,gap);
+ if(cntLookupTable)
+ putCnt2LookupTable(e1,connect);
+ connect->weight = weight;
+ if(contig_array[e1].mask||contig_array[e2].mask){
+ connect->mask = 1;
+ }
+ connect->next = contig_array[e1].downwardConnect;
+ contig_array[e1].downwardConnect = connect;
+ if(!inherit){
+ connect->weightNotInherit = weight;
+ }else{
+ connect->weightNotInherit = 0;
+ connect->inherit = 1;
+ connect->maxSingleWeight = weight;
+ }
+ }
+
+ return connect;
+}
+/*
+ * add1AccuConnect - record `weight` paired-end observations of size `gap` on the
+ * connection e1 -> e2, keeping every individual gap value in connect->PE so that
+ * the gap length can be estimated later.
+ *
+ * Returns NULL when e2 is e1 itself or e1's twin; otherwise the (new or existing)
+ * connection. weight is clamped to 255; weight and weightNotInherit saturate at 255.
+ *
+ * The two loop headers were damaged in the listing ("for(;iweightNotInherit" and
+ * "for(i=0;iPE[i]=gap;"); they are restored here as loops that store `gap` into
+ * the newly added PE slots.
+ *
+ * NOTE(review): the "existing connection" branch assumes connect->PE is a live
+ * GAPARRSIZE array. allocateCN() leaves PE NULL and connectByPE_grad() frees PE
+ * without clearing it - confirm connections are rebuilt before this is called again.
+ */
+CONNECT *add1AccuConnect(unsigned int e1, unsigned int e2, int gap, int weight)
+{
+	CONNECT *connect;
+	int i;
+
+	if(e1==e2||e1==getTwinCtg(e2))
+		return NULL;
+	if(weight>255)
+		weight = 255;
+
+	connect = getCntBetween(e1, e2);
+	if(connect){
+		if(!weight)
+			return connect;
+		existCounter++;
+
+		// remember the old count, bump it (saturating), then fill the new slots
+		i = connect->weightNotInherit;
+		if(connect->weightNotInherit+weight <=255)
+			connect->weightNotInherit += weight;
+		else if(connect->weightNotInherit<255)
+			connect->weightNotInherit = 255;
+		for(;i<connect->weightNotInherit;i++){
+			connect->PE[i]=gap;
+			fprintf(stderr,"inputting a PE with estimated gap size %d\n",gap);
+		}
+
+		if(connect->weight+weight <=255){
+			connect->weight += weight;
+		}else if(connect->weight<255){
+			connect->weight = 255;
+		}
+	}else{
+		newCntCounter++;
+		connect = allocateCN(e2,gap);
+		if(cntLookupTable)
+			putCnt2LookupTable(e1,connect);
+		connect->weight = weight;
+		// weight <= 255 < GAPARRSIZE, so the array can hold every observation
+		connect->PE=(int *)ckalloc(GAPARRSIZE*sizeof(int));//newly added
+		fprintf(stderr,"creating array for PEs in a connection.\n");
+		for(i=0;i<weight;i++){
+			connect->PE[i]=gap;
+			fprintf(stderr,"inputting a PE with estimated gap size %d\n",gap);
+		}
+		if(contig_array[e1].mask||contig_array[e2].mask){
+			connect->mask = 1;
+		}
+		// push onto the head of e1's outgoing connection list
+		connect->next = contig_array[e1].downwardConnect;
+		contig_array[e1].downwardConnect = connect;
+		connect->weightNotInherit = weight;
+	}
+
+	return connect;
+}
+/*
+ * attach1PE - process one read pair whose first read lies on contig e1 at
+ * pre_pos and whose second read lies on contig bal_e2 at pos.
+ *
+ * Returns -1 if both reads are on the same contig id (counted in ignorePE1),
+ *          2 if the mate's twin is e1 itself (pair used for insert-size statistics),
+ *          0 if the estimated gap is rejected (ignorePE2: below
+ *            -(insert_size*close_threshold), ignorePE3: above insert_size),
+ *          1 if the pair was added to both e1->e2 and twin(e2)->twin(e1).
+ */
+int attach1PE(unsigned int e1,int pre_pos,unsigned int bal_e2,int pos,int insert_size)
+{
+	unsigned int twinOfE1,e2;
+	int gapEstimate,observedSpan;
+
+	if(e1==bal_e2){
+		ignorePE1++;
+		return -1; //orientation wrong
+	}
+
+	twinOfE1 = getTwinCtg(e1);
+	e2 = getTwinCtg(bal_e2);
+
+	if(e1!=e2){
+		gapEstimate = insert_size - overlaplen + pre_pos + pos - contig_array[e1].length - contig_array[e2].length;
+		if(gapEstimate<-(insert_size*close_threshold)){
+			ignorePE2++;
+			return 0;
+		}
+		if(gapEstimate>insert_size){
+			ignorePE3++;
+			return 0;
+		}
+		// the link is recorded in both orientations
+		add1AccuConnect(e1,e2,gapEstimate,1);
+		add1AccuConnect(bal_e2,twinOfE1,gapEstimate,1);
+		return 1;
+	}
+
+	// both reads map to the same contig: use the pair as an insert-size sample
+	observedSpan = contig_array[e1].length + overlaplen - pre_pos - pos;
+	if(observedSpan>0){
+		peSUM += observedSpan;
+		onsameCtgPE++;
+		if((int)contig_array[e1].length>insert_size){
+			int *slot = (int *)stackPush(isStack);
+			(*slot) = observedSpan;
+		}
+	}
+	return 2;
+}
+
+/*
+ * connectByPE_grad - attach the read pairs of library ("grad") peGrad to the
+ * contig graph, then derive a gap length for every connection from the PE gap
+ * values collected on it.
+ *
+ * fp is read line by line with fgets(line,lineLen,fp); each line is parsed as
+ * "readno contigno pos". On entry `line` may already hold one such record, or be
+ * empty. Returns the number of pairs for which attach1PE() returned 1, or 0 if
+ * peGrad is rejected by the bounds check.
+ *
+ * NOTE(review): this listing is damaged - every loop header beginning
+ * "for(ii=0;ii" below has lost the text between "ii" and the following "PE[ii]",
+ * so the loop bounds and part of each loop body are missing. Restore them from
+ * the original file before building.
+ */
+int connectByPE_grad(FILE *fp,int peGrad,char *line)
+{
+ fprintf(stderr,"[%s]entering this function.\n",__FUNCTION__);
+ long long pre_readno,readno,minno,maxno;
+ int pre_pos,pos,flag,PE,count=0;
+ unsigned int pre_contigno,contigno,newIndex;
+
+ // NOTE(review): accepts peGrad==gradsCounter; if pes holds gradsCounter entries this reads one past the end - confirm
+ if(peGrad<0||peGrad>gradsCounter){
+ printf("[%s]specified pe grad is out of bound .\n",__FUNCTION__);
+ return 0;
+ }
+ // read numbers in (minno, maxno] belong to this grad
+ maxno = pes[peGrad].PE_bound;
+ if(peGrad==0)
+ minno = 0;
+ else
+ minno = pes[peGrad-1].PE_bound;
+
+ onsameCtgPE = peSUM = 0;
+ PE = pes[peGrad].insertS;
+ // a non-empty `line` carries a record left over by the caller
+ if(strlen(line)){
+ sscanf(line,"%lld %d %d",&pre_readno,&pre_contigno,&pre_pos);
+ //printf("first record %d %d %d\n",pre_readno,pre_contigno,pre_pos);
+ if(pre_readno<=minno)
+ pre_readno = -1;
+ }
+ else
+ pre_readno = -1;
+ ignorePE1 = ignorePE2 = ignorePE3 = ignorePE4 = ignorePE5 = 0;
+ static_flag = 1;
+ isStack = (STACK *)createStack(CNBLOCKSIZE,sizeof(int));
+ while(fgets(line,lineLen,fp)!=NULL){
+ sscanf(line,"%lld %d %d",&readno,&contigno,&pos);
+ if(readno>maxno)
+ break;
+ if(readno<=minno)
+ continue;
+
+ newIndex = index_array[contigno];
+ //if(contig_array[newIndex].bal_edge==0)
+ if(isSameAsTwin(newIndex))
+ continue;
+ if(PE&&(readno%2==0)&&(pre_readno==readno-1)){ // they are a pair of reads
+ flag = attach1PE(pre_contigno,pre_pos,newIndex,pos,PE);
+ if(flag==1)
+ count++;
+ }
+ pre_readno = readno;
+ pre_contigno = newIndex;
+ pre_pos = pos;
+ }
+ printf("[%s]Finish loading all PEs in grad %d .\n",__FUNCTION__,peGrad);
+ printf("[%s]Calculating estimated gap size for all connections .\n",__FUNCTION__);
+ // second pass: turn the PE gap values stored on each connection into gapLen
+ unsigned int i;
+ for(i=1;i<=num_ctg;i++){
+ CONNECT *tmp=contig_array[i].downwardConnect;
+ while(tmp){
+ if(tmp->weightNotInherit<=8&&tmp->weightNotInherit>2){//delete max and min value
+ int max=BIG_NEG,maxid=-1,min=BIG_POS,minid=-1;
+ int weight=tmp->weightNotInherit;
+ int ii;
+ // NOTE(review): damaged loop header
+ for(ii=0;iiPE[ii]>max){
+ max=tmp->PE[ii];
+ maxid=ii;
+ }
+ if(tmp->PE[ii]<=min){
+ min=tmp->PE[ii];
+ minid=ii;
+ }
+ }
+ int sum=0;
+ // NOTE(review): damaged loop header
+ for(ii=0;iiPE[ii];
+ }
+ }
+ ignorePE4+=2;
+ tmp->gapLen=sum/(weight-2);
+ fprintf(stderr,"estimating contigs' gap by removing max&min PE ,with max&min %d %d\n",
+ tmp->PE[maxid],tmp->PE[minid]);
+ }else if(tmp->weightNotInherit>8){//delete values exceed 3*SD
+ long long int sum=0;
+ int weight=tmp->weightNotInherit;
+ int ii;
+ int counter=0;
+ // NOTE(review): damaged loop header
+ for(ii=0;iiPE[ii];
+ }
+
+ long long int avg=sum/weight;
+ sum = 0;
+ // NOTE(review): damaged loop header
+ for(ii=0;iiPE[ii]-avg)*(tmp->PE[ii]-avg));
+ }
+
+ // SD here already holds three times the sample standard deviation
+ double SD=(sqrt((double)sum/(weight-1)))*3;//just for fast
+ sum=0;
+ int num=0;
+ // NOTE(review): damaged loop header
+ for(ii=0;iiPE[ii]-avg)<=SD){
+ sum+=tmp->PE[ii];
+ num++;
+ }else{
+ ignorePE5++;
+ counter++;
+ }
+ }
+ if(num==0){
+ // NOTE(review): avg is long long but is printed with %d
+ fprintf(stderr,"[%s]num=0 in removing exceed 3*SD(%.1f) avg(%d)step",__FUNCTION__,SD,avg);
+ // NOTE(review): damaged loop header
+ for(ii=0;iiPE[ii]);
+ }
+ }
+ // NOTE(review): still executed when num==0 (the branch above only logs), i.e. divides by zero
+ tmp->gapLen=sum/num;
+ fprintf(stderr,"estimating contigs' gap by removing PE exceeding 3*SD ,removing %d PEs\n",counter);
+ }else if(tmp->weightNotInherit<=2){
+ int weight=tmp->weightNotInherit;
+ int sum=0;
+ int ii;
+ // NOTE(review): damaged loop header
+ for(ii=0;iiPE[ii];
+ }
+ // NOTE(review): divides by zero if weightNotInherit is 0
+ tmp->gapLen=sum/weight;
+ fprintf(stderr,"weight too small , directly estimate gap size.\n");
+ }
+ //fprintf(stderr,"finish %d connection.\n",i);
+ // NOTE(review): PE is freed but the pointer is not cleared
+ free((void *)tmp->PE);
+ tmp=tmp->next;
+ }
+ }
+ //printf("%d PEs with insert size %d attached, %d + %d + %d ignored\n",count,PE,ignorePE1,ignorePE2,ignorePE3);
+ fprintf(stderr,"[%s]%d PEs of insert size %d loaded .\n",__FUNCTION__,count,PE);
+ fprintf(stderr,"[%s]PEs discarded:%d because of wrong orientation,%d too close,%d too far,\n",__FUNCTION__,ignorePE1,ignorePE2,ignorePE3);
+ fprintf(stderr,"[%s]%d deleted by removing max&min , %d not fall in 3*SD.\n",__FUNCTION__,ignorePE4,ignorePE5);
+ printf("[%s]%d PEs of insert size %d loaded .\n",__FUNCTION__,count,PE);
+ printf("[%s]PEs discarded :%d because of wrong orientation,%d too close,%d too far ,\n",__FUNCTION__,ignorePE1,ignorePE2,ignorePE3);
+ printf("[%s]%d deleted by removing max&min , %d not fall in 3*SD .\n",__FUNCTION__,ignorePE4,ignorePE5);
+
+ // report the insert size estimated from pairs that landed on a single contig
+ if(onsameCtgPE>0){
+ //printf("estimated PE size %lli, by %d pairs\n",peSUM/onsameCtgPE,onsameCtgPE);
+ int SD=0;
+ int avg=calcuIS(isStack,&SD);
+ printf("[%s]%d PE attached on same contig with estimated insert size %d SD %d .\n",__FUNCTION__,onsameCtgPE,avg,SD);
+ }
+ //printf("on contigs longer than %d, %d pairs found,",PE,isStack->item_c);
+ //printf("insert_size estimated: %d\n",calcuIS(isStack));
+ freeStack(isStack);
+ return count;
+}
+
+
+/*
+ * calcuIS - estimate the insert size from the int samples held in intStack.
+ *
+ * With fewer than 100 samples it returns 0 and leaves *SD untouched. Otherwise
+ * it computes the mean, stores the sample standard deviation in *SD, and returns
+ * the mean of the samples lying strictly within 3*SD of the first mean.
+ *
+ * Fixes over the previous version:
+ *  - the zero-deviation test compared the pointer SD (never NULL) instead of
+ *    *SD, so with identical samples no sample passed the "< 3*SD" filter and
+ *    the final division was by zero;
+ *  - the final division is also guarded for the case that nothing passes;
+ *  - squared deviations are accumulated in 64 bits instead of int.
+ * The stack is consumed by the last pass, as before.
+ */
+int calcuIS(STACK *intStack,int *SD)
+{
+	long long sum=0;
+	long long dev;
+	int avg=0;
+	int *item;
+	int num = intStack->item_c;
+
+	if(num<100)
+		return avg;
+
+	// pass 1: mean of all samples
+	stackBackup(intStack);
+	while((item=(int *)stackPop(intStack))!=NULL)
+		sum += *item;
+
+	stackRecover(intStack);
+	num = intStack->item_c;
+	avg = sum/num;
+
+	// pass 2: sample standard deviation
+	sum = 0;
+	stackBackup(intStack);
+	while((item=(int *)stackPop(intStack))!=NULL){
+		dev = (long long)*item - avg;
+		sum += dev*dev;
+	}
+
+	*SD = sqrt(sum/(num-1));
+	stackRecover(intStack);
+	if(*SD==0){
+		//printf("SD=%d, ",SD);
+		return avg;
+	}
+
+	// pass 3: mean of the samples within 3*SD of the first mean
+	sum = num = 0;
+	while((item=(int *)stackPop(intStack))!=NULL)
+		if(abs(*item-avg)<3**SD){
+			sum += *item;
+			num++;
+		}
+
+	if(num>0)
+		avg = sum/num;
+	//printf("SD=%d, ",SD);
+	return avg;
+
+}
+
+/*
+ * Twin-contig helpers. All four read contig_array[ctg].bal_edge, which is used
+ * as an offset (plus one) from a contig to its twin:
+ * bal_edge == 1 means the twin is the contig itself, > 1 means the twin has a
+ * larger index, < 1 a smaller one.
+ * NOTE(review): presumably the twin is the reverse-complement contig - confirm.
+ */
+
+/* Index of ctg's twin: ctg + bal_edge - 1. */
+unsigned int getTwinCtg(unsigned int ctg)
+{
+ return ctg + contig_array[ctg].bal_edge - 1;
+}
+
+/* True when ctg's index is smaller than its twin's (bal_edge > 1). */
+boolean isSmallerThanTwin(unsigned int ctg)
+{
+ return contig_array[ctg].bal_edge > 1;
+}
+
+/* True when ctg's index is larger than its twin's (bal_edge < 1). */
+boolean isLargerThanTwin(unsigned int ctg)
+{
+ return contig_array[ctg].bal_edge < 1;
+}
+
+/* True when ctg is its own twin (bal_edge == 1). */
+boolean isSameAsTwin(unsigned int ctg)
+{
+ return contig_array[ctg].bal_edge == 1;
+}
diff --git a/fusion/bundle.c b/fusion/bundle.c
new file mode 100755
index 0000000..4bc1efa
--- /dev/null
+++ b/fusion/bundle.c
@@ -0,0 +1,455 @@
+#include "stdinc.h"
+#include "newhash.h"
+#include "extfunc.h"
+#include "extvab.h"
+#include "dfibHeap.h"
+#include "fibHeap.h"
+#include "darray.h"
+
+
+#define CNBLOCKSIZE 10000
+#define GAPARRSIZE 256
+#define BIG_NEG -10000000
+#define BIG_POS 10000000
+static STACK * isStack;
+static int onsameCtgPE;
+extern int calcuIS(STACK *intStack,int *SD);
+void outputBundle(FILE *fp, int insertS);
+
+/*
+ * bun1AccuConnect - count `weight` paired-end observations on the connection
+ * e1 -> e2, creating the connection (with gap length `gap`) if it does not exist.
+ *
+ * Returns NULL when e2 is e1 itself or e1's twin; otherwise the connection.
+ * weight is clamped to 255; weight and weightNotInherit saturate at 255.
+ *
+ * Unlike add1AccuConnect() in attachPEinfo.c this variant does not keep the
+ * individual gap values: the PE array allocation and stores were commented out,
+ * leaving two loops with empty bodies (their headers were also damaged in the
+ * listing). Those no-op loops and the dead commented-out code are removed.
+ */
+static CONNECT *bun1AccuConnect(unsigned int e1, unsigned int e2, int gap, int weight)
+{
+	CONNECT *connect;
+
+	if(e1==e2||e1==getTwinCtg(e2))
+		return NULL;
+	if(weight>255)
+		weight = 255;
+
+	connect = getCntBetween(e1, e2);
+	if(connect){
+		if(!weight)
+			return connect;
+
+		if(connect->weightNotInherit+weight <=255)
+			connect->weightNotInherit += weight;
+		else if(connect->weightNotInherit<255)
+			connect->weightNotInherit = 255;
+
+		if(connect->weight+weight <=255){
+			connect->weight += weight;
+		}else if(connect->weight<255){
+			connect->weight = 255;
+		}
+	}else{
+		newCntCounter++;
+		connect = allocateCN(e2,gap);
+		if(cntLookupTable)
+			putCnt2LookupTable(e1,connect);
+		connect->weight = weight;
+		if(contig_array[e1].mask||contig_array[e2].mask){
+			connect->mask = 1;
+		}
+		// push onto the head of e1's outgoing connection list
+		connect->next = contig_array[e1].downwardConnect;
+		contig_array[e1].downwardConnect = connect;
+		connect->weightNotInherit = weight;
+	}
+
+	return connect;
+}
+
+/*
+ * in1PE - process one read pair whose first read lies on contig e1 at pre_pos
+ * and whose second read lies on contig bal_e2 at pos.
+ *
+ * Returns -1 if both reads are on the same contig id,
+ *          2 if the mate's twin is e1 itself (pair kept as an insert-size sample),
+ *          1 after counting the pair on e1->e2 and on twin(e2)->twin(e1).
+ * No gap-range filtering is applied here.
+ */
+static int in1PE(unsigned int e1,int pre_pos,unsigned int bal_e2,int pos,int insert_size)
+{
+	unsigned int twinOfE1,e2;
+	int gapEstimate,observedSpan;
+
+	if(e1==bal_e2)
+		return -1; //orientation wrong
+
+	twinOfE1 = getTwinCtg(e1);
+	e2 = getTwinCtg(bal_e2);
+
+	if(e1!=e2){
+		gapEstimate = insert_size - overlaplen + pre_pos + pos - contig_array[e1].length - contig_array[e2].length;
+		// the link is counted in both orientations
+		bun1AccuConnect(e1,e2,gapEstimate,1);
+		bun1AccuConnect(bal_e2,twinOfE1,gapEstimate,1);
+		return 1;
+	}
+
+	// both reads map to the same contig
+	observedSpan = contig_array[e1].length + overlaplen - pre_pos - pos;
+	if(observedSpan>0){
+		onsameCtgPE++;
+		if((int)contig_array[e1].length>insert_size){
+			int *slot = (int *)stackPush(isStack);
+			(*slot) = observedSpan;
+		}
+	}
+	return 2;
+}
+
+/*
+ * inputPE - feed the read pairs of library ("grad") peGrad to in1PE().
+ *
+ * fp is read line by line with fgets(line,lineLen,fp); each line is parsed as
+ * "readno contigno pos". On entry `line` may already hold one such record, or be
+ * empty. Reads numbered in (minno, maxno] belong to this grad; an even readno
+ * whose predecessor was readno-1 forms a pair. Returns the number of pairs for
+ * which in1PE() returned 1, or 0 if peGrad is rejected.
+ *
+ * Changes over the previous version:
+ *  - sscanf results are checked, so a malformed line is skipped instead of
+ *    silently reusing the previous record's values;
+ *  - unsigned fields are scanned with %u instead of %d;
+ *  - the large commented-out gap-estimation block was removed (dead code).
+ *
+ * NOTE(review): the bounds check accepts peGrad==gradsCounter; if pes holds
+ * gradsCounter entries that reads one past the end - confirm.
+ * NOTE(review): isStack is created on every call and never freed here
+ * (freeStack was commented out) - confirm whether that leak is intended.
+ */
+static int inputPE(FILE *fp,int peGrad,char *line)
+{
+	long long pre_readno=-1,readno,minno,maxno;
+	int pre_pos=0,pos,PE,count=0;
+	unsigned int pre_contigno=0,contigno,newIndex;
+
+	if(peGrad<0||peGrad>gradsCounter){
+		printf("[%s]specified pe grad is out of bound .\n",__FUNCTION__);
+		return 0;
+	}
+	maxno = pes[peGrad].PE_bound;
+	minno = (peGrad==0) ? 0 : pes[peGrad-1].PE_bound;
+	PE = pes[peGrad].insertS;
+
+	// a non-empty `line` carries a record left over by the caller
+	if(strlen(line)){
+		if(sscanf(line,"%lld %u %d",&pre_readno,&pre_contigno,&pre_pos)!=3||pre_readno<=minno)
+			pre_readno = -1;
+	}
+
+	isStack = (STACK *)createStack(CNBLOCKSIZE,sizeof(int));
+	while(fgets(line,lineLen,fp)!=NULL){
+		if(sscanf(line,"%lld %u %d",&readno,&contigno,&pos)!=3)
+			continue;
+		if(readno>maxno)
+			break;
+		if(readno<=minno)
+			continue;
+
+		newIndex = index_array[contigno];
+		if(isSameAsTwin(newIndex))
+			continue;
+		if(PE&&(readno%2==0)&&(pre_readno==readno-1)){ // they are a pair of reads
+			if(in1PE(pre_contigno,pre_pos,newIndex,pos,PE)==1)
+				count++;
+		}
+		pre_readno = readno;
+		pre_contigno = newIndex;
+		pre_pos = pos;
+	}
+	printf("[%s]Finish loading all PEs in grad %d .\n",__FUNCTION__,peGrad);
+	printf("[%s]Calculating estimated gap size for all connections .\n",__FUNCTION__);
+	fprintf(stderr,"[%s]%d PEs of insert size %d loaded .\n",__FUNCTION__,count,PE);
+
+	return count;
+}
+
+/*
+ * call_bundle - driver for the bundle step: loads the graph with
+ * loadUpdatedEdges(graphfile), loads the PE grads if pes is not set, opens
+ * "<graphfile>.readOnContig", and finally calls outputBundle() and releases the
+ * connection memory and lookup table. Always returns 0.
+ *
+ * NOTE(review): this listing is damaged - on the line starting "for(i=0;i" below
+ * the text up to "PE);" is missing, so the per-grad loop header, the inputPE()
+ * call and the start of the clean-up loop cannot be seen. Restore them from the
+ * original file before building.
+ */
+int call_bundle(){
+ char name[256],*line;
+ FILE *fp,*linkF;
+ int i;
+ int flag=0;
+ unsigned int j;
+
+ loadUpdatedEdges(graphfile);
+
+ //sprintf(name,"%s.bundle",graphfile);
+
+ // NOTE(review): name is still uninitialised here because the sprintf above is commented out
+ linkF = ckopen(name,"w");
+
+ if(!pes)
+ loadPEgrads(graphfile);
+
+ sprintf(name,"%s.readOnContig",graphfile);
+ fp = ckopen(name,"r");
+
+ lineLen = 1024;
+ line = (char *)ckalloc(lineLen*sizeof(char));
+
+ // read and discard the first line of the file, then hand an empty record on
+ fgets(line,lineLen,fp);
+ line[0] = '\0';
+
+ //printf("\n");
+ newCntCounter = 0;
+ //createCntMemManager();
+ //createCntLookupTable();
+ /*int *length_array = (unsigned int *)ckalloc((num_ctg+1)*sizeof(unsigned int));
+ //use length_array to change info in index_array
+ for(i=1;i<=num_ctg;i++)
+ length_array[i] = 0;
+
+ for(i=1;i<=num_ctg;i++){
+ if(index_array[i]>0)
+ length_array[index_array[i]] = i;
+ }
+ for(i=1;i<=num_ctg;i++)
+ index_array[i] = length_array[i];
+ */
+ // NOTE(review): damaged line - source text is missing between "i" and "PE);"
+ for(i=0;iPE);
+ tmp=tmp->next;
+ }
+ contig_array[j].downwardConnect = NULL;
+ }
+ //destroyConnectMem();
+ //deleteCntLookupTable();
+
+ // NOTE(review): linkF appears to be closed inside the loop and then passed to outputBundle() below - confirm
+ fclose(linkF);
+ }
+
+ outputBundle(linkF,1);
+ destroyConnectMem();
+ deleteCntLookupTable();
+
+ free((void *)line);
+ fclose(fp);
+ //fclose(linkF);
+ printf("[%s]all PEs attached\n",__FUNCTION__);
+
+ return 0;
+}
+
+/*
+ * outputBundle - write one line "from<TAB>to<TAB>gapLen" to fp for every
+ * connection whose weightNotInherit exceeds bund_threshold and whose gapLen is
+ * negative.
+ *
+ * Every connection above the threshold (written or not) has its
+ * weightNotInherit reset to 0, together with the mirrored connection
+ * twin(to) -> twin(from) if one exists, so the pair is handled only once.
+ * insertS is accepted for interface compatibility but not used.
+ *
+ * Changes over the previous version: `i` is unsigned and is now printed with
+ * %u instead of %d; the commented-out output variants were removed.
+ */
+void outputBundle(FILE *fp, int insertS)
+{
+	unsigned int i,bal_ctg,bal_toCtg;
+	CONNECT *cnts,*temp_cnt;
+
+	(void)insertS;
+	for(i=1;i<=num_ctg;i++){
+		bal_ctg = getTwinCtg(i);
+		for(cnts=contig_array[i].downwardConnect;cnts;cnts=cnts->next){
+			if(cnts->weightNotInherit<=bund_threshold)
+				continue;
+
+			if(cnts->gapLen<0){
+				fprintf(fp,"%u\t%d\t%d\n",i,cnts->contigID,cnts->gapLen);
+			}
+
+			// mark this link and its mirror image as processed
+			cnts->weightNotInherit = 0;
+
+			bal_toCtg = getTwinCtg(cnts->contigID);
+			temp_cnt = getCntBetween(bal_toCtg,bal_ctg);
+			if(temp_cnt)
+				temp_cnt->weightNotInherit = 0;
+		}
+	}
+}
+
diff --git a/fusion/check.c b/fusion/check.c
new file mode 100755
index 0000000..2af20d6
--- /dev/null
+++ b/fusion/check.c
@@ -0,0 +1,64 @@
+/***************************************************************************
+ * Title: check.c
+ * Author: Haixu Tang
+ * Created: Jun. 2002
+ * Last modified: May. 2004
+ *
+ * Copyright (c) 2001-2004 The Regents of the University of California
+ * All Rights Reserved
+ * See file LICENSE for details.
+ ***************************************************************************/
+
+/* ckopen - open file; check for success */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+//#include
+
+void *ckalloc(unsigned long long amount);
+FILE *ckopen(char *name, char *mode);
+
+/*
+ * ckopen - fopen() wrapper that never returns NULL: on failure it prints
+ * "Cannot open file <name>." and terminates the program with exit(-1).
+ */
+FILE *ckopen(char *name, char *mode)
+{
+	FILE *stream = fopen(name, mode);
+
+	if (!stream) {
+		printf("Cannot open file %s.\n", name);
+		exit(-1);
+	}
+	return stream;
+}
+
+
+/* ckalloc - allocate space; check for success */
+
+/*
+ * ckalloc - allocate `amount` zero-filled bytes (via calloc).
+ * If the allocation fails for a non-zero amount it prints a message, flushes
+ * stdout and terminates the program with exit(-1). For amount == 0 whatever
+ * calloc returned (possibly NULL) is passed through.
+ */
+void *ckalloc(unsigned long long amount)
+{
+	void *block = (void *) calloc( 1, (unsigned long long) amount);
+
+	if (block == NULL && amount != 0) {
+		printf("not enought memory");
+		fflush(stdout);
+		exit(-1);
+	}
+	return block;
+}
+
+
+/* reallocate memory */
+/*
+ * ckrealloc - resize the block p (currently old_size bytes) to new_size bytes.
+ *
+ * Returns realloc()'s result when it succeeds or when new_size is 0. If
+ * realloc() fails, a fresh block is obtained from ckalloc() (which exits the
+ * program on failure), the old contents are copied over and p is freed.
+ *
+ * Changes over the previous version: the fallback copy uses memcpy() instead
+ * of the legacy bcopy(), copies at most new_size bytes (it used to copy
+ * old_size bytes even into a smaller block), and is skipped when p is NULL.
+ */
+void *ckrealloc(void *p, size_t new_size, size_t old_size)
+{
+	void *q;
+
+	q = realloc((void *) p, new_size);
+	if (new_size == 0 || q != (void *) 0)
+		return q;
+
+	/* realloc failed and left p untouched: reallocate manually */
+	q = ckalloc(new_size);
+
+	if (p != NULL) {
+		/* move old memory to new space, never past the end of either block */
+		memcpy(q, p, old_size < new_size ? old_size : new_size);
+		free(p);
+	}
+
+	return q;
+}
diff --git a/fusion/connect.c b/fusion/connect.c
new file mode 100755
index 0000000..4a63a20
--- /dev/null
+++ b/fusion/connect.c
@@ -0,0 +1,173 @@
+#include "stdinc.h"
+#include "newhash.h"
+#include "extfunc.h"
+#include "extvab.h"
+
+#define CNBLOCKSIZE 100000
+
+/* Create the memory manager for CONNECT items unless one already exists. */
+void createCntMemManager()
+{
+	if(cn_mem_manager)
+		return;
+	cn_mem_manager = createMem_manager(CNBLOCKSIZE,sizeof(CONNECT));
+}
+
+/*
+ * Release the CONNECT memory manager and clear the global pointer.
+ * Does nothing when no manager exists, mirroring deleteCntLookupTable(), so
+ * that calling it twice (or before createCntMemManager()) is harmless.
+ */
+void destroyConnectMem()
+{
+	if(!cn_mem_manager)
+		return;
+	freeMem_manager(cn_mem_manager);
+	cn_mem_manager = NULL;
+}
+
+/*
+ * allocateCN - take a CONNECT item from cn_mem_manager and initialise it as a
+ * connection to contigId with gap length `gap`: weight 1, no PE array, and all
+ * other counters, flags and the nextInScaf link cleared.
+ * The `next` and `nextInLookupTable` links are not set here.
+ */
+CONNECT *allocateCN(unsigned int contigId, int gap)
+{
+	CONNECT *cn = (CONNECT *)getItem(cn_mem_manager);
+
+	/* identity and gap estimate */
+	cn->contigID = contigId;
+	cn->gapLen = gap;
+	cn->minGap = 0;
+	cn->maxGap = 0;
+
+	/* evidence counters */
+	cn->weight = 1;
+	cn->weightNotInherit = 0;
+	cn->PE=NULL;//(int *)ckalloc(CNBLOCKSIZE*sizeof(int));
+
+	/* state flags */
+	cn->bySmall = 0;
+	cn->weakPoint = 0;
+	cn->mask = 0;
+	cn->used = 0;
+	cn->checking = 0;
+	cn->deleted = 0;
+	cn->inherit = 0;
+
+	/* scaffold links */
+	cn->prevInScaf = 0;
+	cn->singleInScaf = 0;
+	cn->nextInScaf = NULL;
+
+	return cn;
+}
+
+/*
+ * output_cntGVZ - dump the contig connection graph to "<outfile>.scaffold.gvz"
+ * in Graphviz "digraph" syntax.
+ *
+ * One edge is written per non-deleted connection, labelled
+ * "gapLen(inScaffold_weight)"; masked connections are drawn in red. Contigs are
+ * visited from num_ctg down to 1.
+ *
+ * Change over the previous version: the file name is built with snprintf() and
+ * an over-long name is reported instead of overflowing name[256].
+ */
+void output_cntGVZ(char *outfile)
+{
+	char name[256];
+	FILE *fp;
+	unsigned int i;
+	CONNECT *connect;
+	boolean flag;
+
+	if(snprintf(name,sizeof(name),"%s.scaffold.gvz",outfile)>=(int)sizeof(name)){
+		printf("output file name %s.scaffold.gvz is too long.\n",outfile);
+		return;
+	}
+	fp = ckopen(name,"w");
+	fprintf(fp,"digraph G{\n");
+	fprintf(fp,"\tsize=\"512,512\";\n");
+
+	for(i=num_ctg;i>0;i--){
+		for(connect=contig_array[i].downwardConnect;connect;connect=connect->next){
+			if(connect->deleted)
+				continue;
+			// flag: the connection is part of a scaffold path
+			flag = (connect->prevInScaf||connect->nextInScaf) ? 1 : 0;
+			if(!connect->mask)
+				fprintf(fp,"\tC%d_%d -> C%d_%d [label = \"%d(%d_%d)\"];\n"
+					,i,contig_array[i].length,connect->contigID,contig_array[connect->contigID].length,
+					connect->gapLen,flag,connect->weight);
+			else
+				fprintf(fp,"\tC%d_%d -> C%d_%d [label = \"%d(%d_%d)\", color = red];\n"
+					,i,contig_array[i].length,connect->contigID,contig_array[connect->contigID].length,
+					connect->gapLen,flag,connect->weight);
+		}
+	}
+	fprintf(fp,"}\n");
+	fclose(fp);
+}
+
+/***************** below this line all codes are about lookup table *****************/
+
+/*
+ * Allocate the connection lookup table (3*num_ctg+1 zeroed slots) unless it
+ * already exists. Slots are addressed as 2*from + to.
+ */
+void createCntLookupTable()
+{
+	if(cntLookupTable)
+		return;
+	cntLookupTable = (CONNECT **)ckalloc((3*num_ctg+1)*sizeof(CONNECT *));
+}
+
+/* Free the connection lookup table, if any, and clear the global pointer. */
+void deleteCntLookupTable()
+{
+	if(!cntLookupTable)
+		return;
+	free((void *)cntLookupTable);
+	cntLookupTable = NULL;
+}
+
+/*
+ * Insert cnt (a connection starting at from_c) at the head of the chain in
+ * lookup-table slot 2*from_c + cnt->contigID. A NULL cnt or a missing table is
+ * silently ignored.
+ */
+void putCnt2LookupTable(unsigned int from_c,CONNECT *cnt)
+{
+	unsigned int slot;
+
+	if(cnt&&cntLookupTable){
+		slot = 2*from_c + cnt->contigID;
+		cnt->nextInLookupTable = cntLookupTable[slot];
+		cntLookupTable[slot] = cnt;
+	}
+}
+
+/*
+ * Walk the chain in lookup-table slot 2*from_c + to_c and return the first
+ * connection whose target is to_c, or NULL if there is none.
+ * The caller must make sure the table exists.
+ */
+static CONNECT *getCntInLookupTable(unsigned int from_c,unsigned int to_c)
+{
+	CONNECT *cur;
+
+	for(cur=cntLookupTable[2*from_c + to_c];cur;cur=cur->nextInLookupTable)
+		if(cur->contigID==to_c)
+			return cur;
+	return NULL;
+}
+
+/*
+ * getCntBetween - find the connection from_c -> to_c.
+ * Uses the lookup table when it exists; otherwise scans from_c's
+ * downwardConnect list. Returns NULL when no such connection is found.
+ */
+CONNECT *getCntBetween(unsigned int from_c, unsigned int to_c)
+{
+	CONNECT *cur;
+
+	if(cntLookupTable)
+		return getCntInLookupTable(from_c,to_c);
+
+	for(cur=contig_array[from_c].downwardConnect;cur;cur=cur->next)
+		if(cur->contigID==to_c)
+			return cur;
+	return NULL;
+}
+/*
+void removeCntInLookupTable(unsigned int from_c,unsigned int to_c)
+{
+ unsigned int index = 2*from_c + to_c;
+ CONNECT *ite_cnt = cntLookupTable[index];
+ CONNECT *cnt;
+
+ if(!ite_cnt){
+ printf("removeCntInLookupTable: not found A\n");
+ return;
+ }
+ if(ite_cnt->contigID==to_c){
+ cntLookupTable[index] = ite_cnt->nextInLookupTable;
+ return;
+ }
+
+ while(ite_cnt->nextInLookupTable&&ite_cnt->nextInLookupTable->contigID!=to_c)
+ ite_cnt = ite_cnt->nextInLookupTable;
+
+ if(ite_cnt->nextInLookupTable){
+ cnt = ite_cnt->nextInLookupTable;
+ ite_cnt->nextInLookupTable = cnt->nextInLookupTable;
+ return;
+ }
+ printf("removeCntInLookupTable: not found B\n");
+ return;
+}
+*/
diff --git a/fusion/darray.c b/fusion/darray.c
new file mode 100755
index 0000000..5d6a789
--- /dev/null
+++ b/fusion/darray.c
@@ -0,0 +1,56 @@
+#include "darray.h"
+#include "check.h"
+
+/*
+ * Create a dynamic array with initial capacity for num_items items of
+ * unit_size bytes each. The item count starts at 0.
+ *
+ * Returns the new descriptor, or NULL if the descriptor itself could not
+ * be allocated. Release it with freeDarray().
+ */
+DARRAY *createDarray(int num_items,size_t unit_size)
+{
+	DARRAY *newDarray = (DARRAY *)malloc(sizeof(DARRAY));
+
+	/* malloc can fail; never dereference a NULL descriptor */
+	if(!newDarray)
+		return NULL;
+
+	newDarray->array_size = num_items;
+	newDarray->item_size = unit_size;
+	newDarray->item_c = 0;
+	newDarray->array = (void *)ckalloc(num_items*unit_size);
+	return newDarray;
+}
+
+/*
+ * Return the address of slot `index`, growing the array when needed.
+ *
+ * item_c is raised to index+1 when the slot lies beyond the current item
+ * count. If index is outside the current capacity, the storage is
+ * reallocated to the smallest integer multiple (>= 2) of the old capacity
+ * that strictly exceeds index.
+ *
+ * Returns NULL (after printing a message) for a negative index.
+ */
+void *darrayPut(DARRAY *darray,long long index)
+{
+	long long new_size;
+	long long factor = 2;
+
+	if(index<0){
+		printf("array write index %lld out of range\n",index);
+		return NULL;
+	}
+	if(index+1>darray->item_c)
+		darray->item_c = index + 1;
+	if(index<darray->array_size)
+		return (void *)((char *)darray->array + darray->item_size*index);
+
+	if(darray->array_size>0){
+		/* ">=": a capacity of exactly `index` items would still leave slot `index` out of bounds */
+		while(index>=factor*darray->array_size)
+			factor++;
+		new_size = factor*darray->array_size;
+	}else{
+		/* an empty array cannot be grown by multiplication */
+		new_size = index + 1;
+	}
+
+	darray->array = (void *)ckrealloc(darray->array,new_size*darray->item_size
+			,darray->array_size*darray->item_size);
+	darray->array_size = new_size;
+	/* byte arithmetic through char *: arithmetic on void * is a GNU extension */
+	return (void *)((char *)darray->array + darray->item_size*index);
+}
+
+/*
+ * Return the address of slot `index` without growing the array.
+ *
+ * The index is checked against the allocated capacity (array_size), not
+ * against item_c. An index outside [0, array_size) prints a message and
+ * returns NULL.
+ */
+void *darrayGet(DARRAY *darray, long long index)
+{
+	if(index>=0&&index<darray->array_size)
+		return (void *)((char *)darray->array + darray->item_size*index);
+	printf("array read index %lld out of range %lld\n",index,darray->array_size);
+	return NULL;
+}
+
+
+/*
+ * Logically empty the array: reset the item count to 0.
+ * The storage and its capacity are left untouched.
+ */
+void emptyDarray(DARRAY *darray)
+{
+	darray->item_c = 0;
+}
+
+/*
+ * Free a dynamic array: first its storage, then the descriptor.
+ * A NULL darray is accepted and ignored.
+ */
+void freeDarray(DARRAY *darray)
+{
+	if(darray==NULL)
+		return;
+
+	free((void *)darray->array);	/* free(NULL) is a no-op */
+	free((void *)darray);
+}
+
diff --git a/fusion/fib.c b/fusion/fib.c
new file mode 100755
index 0000000..33f36a2
--- /dev/null
+++ b/fusion/fib.c
@@ -0,0 +1,640 @@
+/*
+Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
+
+ This file is part of Velvet.
+
+ Velvet is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ Velvet is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with Velvet; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*/
+/*-
+ * Copyright 1997-2003 John-Mark Gurney.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id: fib.c,v 1.10 2007/10/19 13:09:26 zerbino Exp $
+ *
+ */
+#include <limits.h>
+#include <stdlib.h>
+#include "fib.h"
+#include "fibpriv.h"
+#include "extfunc2.h"
+
+#define HEAPBLOCKSIZE 10000
+
+static int fh_comparedata(FibHeap * h, Coordinate key, unsigned int data, FibHeapNode * b);
+unsigned int fh_replacekeydata(FibHeap * h, FibHeapNode * x,Coordinate key, unsigned int data);
+
+/* Take one node-sized item from the heap's node memory manager. */
+static FibHeapNode *allocateFibHeapEl(FibHeap * heap)
+{
+	FibHeapNode *node = (FibHeapNode *)getItem(heap->nodeMemory);
+
+	return node;
+}
+
+/* Hand node a back to the heap's node memory manager. */
+static void deallocateFibHeapEl(FibHeapNode * a, FibHeap * heap)
+{
+	returnItem(heap->nodeMemory, a);
+}
+
+/*
+ * swap(type, a, b): exchange the values of lvalues a and b through a
+ * temporary of the given type. The temporary is named c, so neither
+ * argument may itself be spelled c. Arguments are expanded without
+ * parentheses. The trailing backslash on the last line continues the
+ * macro onto the following (blank) line.
+ */
+#define swap(type, a, b) \
+ do { \
+ type c; \
+ c = a; \
+ a = b; \
+ b = c; \
+ } while (0) \
+
+#define INT_BITS (sizeof(IDnum) * 8)
+
+/*
+ * Return ceil(log2(a)) for a >= 1.
+ *
+ * The loop halves a shift width b (starting at half the bit width of IDnum)
+ * and records in i, one bit per round, whether a was at least 2^b; the
+ * accumulated i is floor(log2(a)). The final test returns i when the
+ * original value is exactly 2^i and i + 1 otherwise. For a == 0 the
+ * function returns 1.
+ */
+static inline IDnum ceillog2(IDnum a)
+{
+ IDnum oa;
+ IDnum i;
+ IDnum b;
+ IDnum cons;
+
+ oa = a;
+ b = INT_BITS / 2;
+ i = 0;
+ while (b) {
+ i = (i << 1);
+ cons = ((IDnum) 1) << b;
+ if (a >= cons) {
+ /* a has a bit at or above position b: shift it down and record the step */
+ a /= cons;
+ i = i | 1;
+ } else
+ a &= cons - 1;
+ b /= 2;
+ }
+ /* exact power of two: floor and ceiling coincide */
+ if ((((IDnum) 1 << i)) == oa)
+ return i;
+ else
+ return i + 1;
+}
+
+/*
+ * Private Heap Functions
+ */
+/*
+ * Put a freshly allocated heap into its empty state and create the memory
+ * manager its nodes are drawn from. fh_Dl == -1 marks "no consolidation
+ * array allocated yet".
+ */
+static void fh_initheap(FibHeap * heap)
+{
+	heap->nodeMemory = createMem_manager(sizeof(FibHeapNode), HEAPBLOCKSIZE);
+
+	heap->fh_cmp_fnct = NULL;
+	heap->fh_neginf = 0;
+	heap->fh_keys = 0;
+
+	heap->fh_n = 0;
+	heap->fh_min = NULL;
+	heap->fh_root = NULL;
+
+	heap->fh_Dl = -1;
+	heap->fh_cons = NULL;
+}
+
+/*
+ * Free the heap descriptor and its consolidation array.
+ * The node memory manager (nodeMemory) is not touched here.
+ */
+static void fh_destroyheap(FibHeap * h)
+{
+	free(h->fh_cons);	/* free(NULL) is a no-op */
+	h->fh_cons = NULL;
+	h->fh_cmp_fnct = NULL;
+	h->fh_neginf = 0;
+	free(h);
+}
+
+/*
+ * Public Heap Functions
+ */
+/*
+ * Allocate and initialise a heap ordered by node keys (fh_keys = 1).
+ * Returns NULL if the descriptor cannot be allocated.
+ */
+FibHeap *fh_makekeyheap()
+{
+	FibHeap *heap = malloc(sizeof *heap);
+
+	if (heap != NULL) {
+		fh_initheap(heap);
+		heap->fh_keys = 1;
+	}
+
+	return heap;
+}
+
+/*
+ * Allocate and initialise a heap in non-key mode (fh_keys stays 0).
+ * Returns NULL if the descriptor cannot be allocated.
+ */
+FibHeap *fh_makeheap()
+{
+	FibHeap *heap = malloc(sizeof *heap);
+
+	if (heap != NULL)
+		fh_initheap(heap);
+
+	return heap;
+}
+
+/* Install fnct as the heap's comparison callback; returns the previous one. */
+voidcmp fh_setcmp(FibHeap * h, voidcmp fnct)
+{
+	voidcmp previous = h->fh_cmp_fnct;
+
+	h->fh_cmp_fnct = fnct;
+	return previous;
+}
+
+/* Set the heap's "negative infinity" data value; returns the previous one. */
+unsigned int fh_setneginf(FibHeap * h, unsigned int data)
+{
+	unsigned int previous = h->fh_neginf;
+
+	h->fh_neginf = data;
+	return previous;
+}
+
+/*
+ * Merge heap hb into heap ha and return the surviving heap.
+ *
+ * If either heap has an empty root list, that heap's descriptor is
+ * destroyed and the other heap is returned unchanged. Otherwise the two
+ * circular root lists are spliced together, the node counts are added,
+ * fh_min is updated, and hb's descriptor is destroyed.
+ *
+ * NOTE(review): fh_destroyheap does not release nodeMemory, so the memory
+ * manager of the discarded descriptor is never freed here. When both heaps
+ * are non-empty, hb's nodes stay linked into ha, so that pool cannot simply
+ * be freed either -- confirm the intended ownership.
+ */
+FibHeap *fh_union(FibHeap * ha, FibHeap * hb)
+{
+ FibHeapNode *x;
+
+ if (ha->fh_root == NULL || hb->fh_root == NULL) {
+ /* either one or both are empty */
+ if (ha->fh_root == NULL) {
+ fh_destroyheap(ha);
+ return hb;
+ } else {
+ fh_destroyheap(hb);
+ return ha;
+ }
+ }
+ /* splice the two circular root lists: cross-link each list's tail to the other's head */
+ ha->fh_root->fhe_left->fhe_right = hb->fh_root;
+ hb->fh_root->fhe_left->fhe_right = ha->fh_root;
+ x = ha->fh_root->fhe_left;
+ ha->fh_root->fhe_left = hb->fh_root->fhe_left;
+ hb->fh_root->fhe_left = x;
+ ha->fh_n += hb->fh_n;
+ /*
+ * we probably should also keep stats on number of unions
+ */
+
+ /* set fh_min if necessary */
+ if (fh_compare(ha, hb->fh_min, ha->fh_min) < 0)
+ ha->fh_min = hb->fh_min;
+
+ fh_destroyheap(hb);
+ return ha;
+}
+
+/*
+ * Destroy a heap completely: release the node memory manager first, then
+ * the descriptor and its consolidation array.
+ */
+void fh_deleteheap(FibHeap * h)
+{
+	freeMem_manager(h->nodeMemory);
+	h->nodeMemory = NULL;
+
+	fh_destroyheap(h);
+}
+
+/*
+ * Public Key Heap Functions
+ */
+/*
+ * Insert a new node carrying (key, data) into the heap.
+ * Returns the node as a handle for later operations, or NULL if no node
+ * could be allocated.
+ */
+FibHeapNode *fh_insertkey(FibHeap * h, Coordinate key, unsigned int data)
+{
+	FibHeapNode *node = fhe_newelem(h);
+
+	if (node == NULL)
+		return NULL;
+
+	node->fhe_key = key;
+	node->fhe_data = data;
+
+	/* goes onto the root list; fh_insertel updates fh_min when needed */
+	fh_insertel(h, node);
+
+	return node;
+}
+
+/* Return 1 when the heap has no minimum node (i.e. is empty), else 0. */
+boolean fh_isempty(FibHeap *h)
+{
+	return h->fh_min == NULL;
+}
+
+/* Return the smallest key in the heap, or INT_MIN when the heap is empty. */
+Coordinate fh_minkey(FibHeap * h)
+{
+	return (h->fh_min != NULL) ? h->fh_min->fhe_key : INT_MIN;
+}
+
+
+/*
+ * Replace the key and data of node x (decrease-key) and return x's
+ * previous data value.
+ *
+ * Only decreasing or equal keys are supported: if the new key compares
+ * greater than x's current key, the process is aborted. After the update
+ * x is cut from its parent when it no longer compares greater than it
+ * (followed by a cascading cut), and fh_min is moved to x when x compares
+ * less than or equal to the current minimum.
+ */
+unsigned int fh_replacekeydata(FibHeap * h, FibHeapNode * x,
+ Coordinate key, unsigned int data)
+{
+ unsigned int odata;
+ Coordinate okey;
+ FibHeapNode *y;
+ int r;
+
+ odata = x->fhe_data;
+ okey = x->fhe_key;
+
+ /*
+ * we can increase a key by deleting and reinserting, that
+ * requires O(lgn) time.
+ */
+ if ((r = fh_comparedata(h, key, data, x)) > 0) {
+ /* XXX - bad code! */
+ abort();
+ }
+
+ x->fhe_data = data;
+ x->fhe_key = key;
+
+ /* because they are equal, we don't have to do anything */
+ if (r == 0)
+ return odata;
+
+ y = x->fhe_p;
+
+ /* NOTE(review): fh_compare looks at keys only, so equal keys already returned at r == 0 above */
+ if (h->fh_keys && okey == key)
+ return odata;
+
+ if (y != NULL && fh_compare(h, x, y) <= 0) {
+ fh_cut(h, x, y);
+ fh_cascading_cut(h, y);
+ }
+
+ /*
+ * the = is so that the call from fh_delete will delete the proper
+ * element.
+ */
+ if (fh_compare(h, x, h->fh_min) <= 0)
+ h->fh_min = x;
+
+ return odata;
+}
+
+/*
+ * Replace the key of node x while keeping its data; returns the old key.
+ * The restrictions of fh_replacekeydata apply.
+ */
+Coordinate fh_replacekey(FibHeap * h, FibHeapNode * x, Coordinate key)
+{
+	Coordinate old_key = x->fhe_key;
+
+	(void) fh_replacekeydata(h, x, key, x->fhe_data);
+	return old_key;
+}
+
+/*
+ * Public void * Heap Functions
+ */
+/*
+ * this will return these values:
+ * NULL failed for some reason
+ * ptr token to use for manipulation of data
+ */
+/*
+ * Insert a node carrying only data (its key is left unset).
+ * Returns the node as a handle, or NULL if no node could be allocated.
+ */
+FibHeapNode *fh_insert(FibHeap * h, unsigned int data)
+{
+	FibHeapNode *node = fhe_newelem(h);
+
+	if (node == NULL)
+		return NULL;
+
+	node->fhe_data = data;
+
+	/* goes onto the root list; fh_insertel updates fh_min when needed */
+	fh_insertel(h, node);
+
+	return node;
+}
+
+/* Return the data of the minimum node, or 0 when the heap is empty. */
+unsigned int fh_min(FibHeap * h)
+{
+	return (h->fh_min != NULL) ? h->fh_min->fhe_data : 0;
+}
+
+/*
+ * Remove the minimum node and return its data; returns 0 when the heap is
+ * empty. Unless NO_FREE is defined, the node is handed back to the heap's
+ * memory manager.
+ */
+unsigned int fh_extractmin(FibHeap * h)
+{
+	FibHeapNode *min_node;
+	unsigned int data;
+
+	if (h->fh_min == NULL)
+		return 0;
+
+	min_node = fh_extractminel(h);
+	data = min_node->fhe_data;
+#ifndef NO_FREE
+	deallocateFibHeapEl(min_node, h);
+#endif
+
+	return data;
+}
+
+/*
+ * Overwrite the data stored in node x and return the previous value.
+ * The node's position in the heap is not changed.
+ */
+unsigned int fh_replacedata(FibHeapNode * x, unsigned int data)
+{
+	unsigned int previous;
+
+	previous = x->fhe_data;
+	x->fhe_data = data;
+
+	return previous;
+}
+
+/*
+ * Remove node x from the heap and return the data it carried.
+ *
+ * In key mode x's key is lowered to the smallest Coordinate value so that
+ * x becomes the minimum, then the minimum is extracted. LLONG_MIN is used
+ * because Coordinate is a long long: with INT_MIN, a node whose key was
+ * already below INT_MIN would make fh_replacekeydata abort.
+ *
+ * NOTE(review): in non-key mode only x's data is overwritten, which does
+ * not move x to the minimum position before fh_extractmin runs -- confirm
+ * that path is unused or intended.
+ */
+unsigned int fh_delete(FibHeap * h, FibHeapNode * x)
+{
+	unsigned int k;
+
+	k = x->fhe_data;
+	if (!h->fh_keys)
+		fh_replacedata(x, h->fh_neginf);
+	else
+		fh_replacekey(h, x, LLONG_MIN);
+	fh_extractmin(h);
+
+	return k;
+}
+
+/*
+ * beginning of private element functions
+ */
+/*
+ * Detach and return the current minimum node.
+ *
+ * All children of the minimum are moved onto the root list, the minimum is
+ * removed from the root list and the node count is decremented. If nodes
+ * remain, the heap is consolidated, which also recomputes fh_min.
+ * The caller must ensure h->fh_min is non-NULL; the returned node is not
+ * freed here.
+ */
+static FibHeapNode *fh_extractminel(FibHeap * h)
+{
+ FibHeapNode *ret;
+ FibHeapNode *x, *y, *orig;
+
+ ret = h->fh_min;
+
+ orig = NULL;
+ /* put all the children on the root list */
+ /* for true consistency, we should use fhe_remove */
+ for (x = ret->fhe_child; x != orig && x != NULL;) {
+ /* remember the first child so the circular walk stops after one lap */
+ if (orig == NULL)
+ orig = x;
+ /* save the successor first: fh_insertrootlist rewrites x's sibling links */
+ y = x->fhe_right;
+ x->fhe_p = NULL;
+ fh_insertrootlist(h, x);
+ x = y;
+ }
+ /* remove minimum from root list */
+ fh_removerootlist(h, ret);
+ h->fh_n--;
+
+ /* if we aren't empty, consolidate the heap */
+ if (h->fh_n == 0)
+ h->fh_min = NULL;
+ else {
+ h->fh_min = ret->fhe_right;
+ fh_consolidate(h);
+ }
+
+ return ret;
+}
+
+/*
+ * Add node x to the heap's circular root list. When the list is empty, x
+ * becomes the root and is linked to itself; otherwise it is inserted right
+ * after the current root.
+ */
+static void fh_insertrootlist(FibHeap * h, FibHeapNode * x)
+{
+	if (h->fh_root != NULL) {
+		fhe_insertafter(h->fh_root, x);
+		return;
+	}
+
+	h->fh_root = x;
+	x->fhe_left = x;
+	x->fhe_right = x;
+}
+
+/*
+ * Unlink node x from the root list. The root pointer becomes NULL when x
+ * was the only root, otherwise the node fhe_remove reports (x's former
+ * left neighbour).
+ */
+static void fh_removerootlist(FibHeap * h, FibHeapNode * x)
+{
+	h->fh_root = (x->fhe_left == x) ? NULL : fhe_remove(x);
+}
+
+/*
+ * Consolidate the root list so that no two roots have the same degree,
+ * then rebuild the root list and recompute fh_min.
+ *
+ * Roots are taken off the list one by one and filed in the scratch array
+ * h->fh_cons by degree; whenever the slot is occupied the two trees are
+ * linked (the larger-keyed root becomes a child of the other) and the
+ * result is retried at the next degree.
+ *
+ * NOTE(review): nothing here enforces d < D (see the XXX below); confirm
+ * that fh_checkcons sizes the array for the largest degree that can occur.
+ */
+static void fh_consolidate(FibHeap * h)
+{
+ FibHeapNode **a;
+ FibHeapNode *w;
+ FibHeapNode *y;
+ FibHeapNode *x;
+ IDnum i;
+ IDnum d;
+ IDnum D;
+
+ /* make sure the scratch array exists and is large enough for fh_n nodes */
+ fh_checkcons(h);
+
+ /* assign a the value of h->fh_cons so I don't have to rewrite code */
+ D = h->fh_Dl + 1;
+ a = h->fh_cons;
+
+ for (i = 0; i < D; i++)
+ a[i] = NULL;
+
+ while ((w = h->fh_root) != NULL) {
+ x = w;
+ fh_removerootlist(h, w);
+ d = x->fhe_degree;
+ /* XXX - assert that d < D */
+ while (a[d] != NULL) {
+ y = a[d];
+ /* keep the smaller key in x so it ends up as the parent */
+ if (fh_compare(h, x, y) > 0)
+ swap(FibHeapNode *, x, y);
+ fh_heaplink(h, y, x);
+ a[d] = NULL;
+ d++;
+ }
+ a[d] = x;
+ }
+ /* rebuild the root list from the array and find the new minimum */
+ h->fh_min = NULL;
+ for (i = 0; i < D; i++)
+ if (a[i] != NULL) {
+ fh_insertrootlist(h, a[i]);
+ if (h->fh_min == NULL
+ || fh_compare(h, a[i], h->fh_min) < 0)
+ h->fh_min = a[i];
+ }
+}
+
+/*
+ * Make node y a child of node x: y is added to x's child list (or becomes
+ * the only child), its parent pointer is set, x's degree grows by one and
+ * y's mark is cleared.
+ */
+static void fh_heaplink(FibHeap * h, FibHeapNode * y, FibHeapNode * x)
+{
+	if (x->fhe_child != NULL)
+		fhe_insertbefore(x->fhe_child, y);
+	else
+		x->fhe_child = y;
+
+	y->fhe_p = x;
+	y->fhe_mark = 0;
+	x->fhe_degree++;
+}
+
+/*
+ * Cut node x away from its parent y and move it to the root list, with its
+ * mark cleared.
+ */
+static void fh_cut(FibHeap * h, FibHeapNode * x, FibHeapNode * y)
+{
+ /* fhe_remove also repairs y's child pointer if it pointed at x */
+ fhe_remove(x);
+ y->fhe_degree--;
+ fh_insertrootlist(h, x);
+ x->fhe_p = NULL;
+ x->fhe_mark = 0;
+}
+
+/*
+ * Walk upward from y. The first unmarked non-root node met is marked and
+ * the walk stops; every marked node on the way is cut from its parent and
+ * the walk continues with that parent. Root nodes end the walk.
+ */
+static void fh_cascading_cut(FibHeap * h, FibHeapNode * y)
+{
+	FibHeapNode *parent;
+
+	for (parent = y->fhe_p; parent != NULL; parent = y->fhe_p) {
+		if (y->fhe_mark == 0) {
+			y->fhe_mark = 1;
+			return;
+		}
+		fh_cut(h, y, parent);
+		y = parent;
+	}
+}
+
+/*
+ * beginning of handling elements of fibheap
+ */
+/*
+ * Allocate a node from the heap's memory manager and initialise it.
+ * Returns NULL if no node could be allocated.
+ */
+static FibHeapNode *fhe_newelem(FibHeap * h)
+{
+	FibHeapNode *node = allocateFibHeapEl(h);
+
+	if (node != NULL)
+		fhe_initelem(node);
+
+	return node;
+}
+
+/*
+ * Reset node e to a detached single node: no parent, no children, linked
+ * to itself in both directions, degree 0, unmarked, data 0.
+ * The key field is not touched.
+ */
+static void fhe_initelem(FibHeapNode * e)
+{
+	e->fhe_p = NULL;
+	e->fhe_child = NULL;
+	e->fhe_left = e;
+	e->fhe_right = e;
+
+	e->fhe_degree = 0;
+	e->fhe_mark = 0;
+	e->fhe_data = 0;
+}
+
+/*
+ * Insert node b immediately to the right of node a in a's circular list.
+ * When a is alone in its list (a == a->fhe_right) the same four link
+ * updates produce a two-node ring, so no special case is needed.
+ */
+static void fhe_insertafter(FibHeapNode * a, FibHeapNode * b)
+{
+	FibHeapNode *successor = a->fhe_right;
+
+	b->fhe_right = successor;
+	successor->fhe_left = b;
+	a->fhe_right = b;
+	b->fhe_left = a;
+}
+
+/* Insert node b immediately to the left of node a in a's circular list. */
+static inline void fhe_insertbefore(FibHeapNode * a, FibHeapNode * b)
+{
+	FibHeapNode *predecessor = a->fhe_left;
+
+	fhe_insertafter(predecessor, b);
+}
+
+/*
+ * Unlink node x from its circular sibling list and detach it from its
+ * parent, leaving x as a self-linked single node.
+ *
+ * Returns x's former left neighbour, or NULL if x was alone in its list.
+ * If x was its parent's designated child, the parent's child pointer is
+ * redirected to that return value. The parent's degree is not adjusted.
+ */
+static FibHeapNode *fhe_remove(FibHeapNode * x)
+{
+ FibHeapNode *ret;
+
+ if (x == x->fhe_left)
+ ret = NULL;
+ else
+ ret = x->fhe_left;
+
+ /* fix the parent pointer */
+ if (x->fhe_p != NULL && x->fhe_p->fhe_child == x)
+ x->fhe_p->fhe_child = ret;
+
+ /* bridge x's neighbours over it */
+ x->fhe_right->fhe_left = x->fhe_left;
+ x->fhe_left->fhe_right = x->fhe_right;
+
+ /* clear out hanging pointers */
+ x->fhe_p = NULL;
+ x->fhe_left = x;
+ x->fhe_right = x;
+
+ return ret;
+}
+
+/*
+ * Make sure the consolidation scratch array h->fh_cons is allocated and
+ * large enough for the current node count.
+ *
+ * The array is (re)sized when none exists yet (fh_Dl == -1) or when the
+ * node count exceeds 2^fh_Dl. The new fh_Dl is ceillog2(fh_n) + 1 with a
+ * floor of 8, and the array holds fh_Dl + 1 slots. Aborts if the
+ * allocation fails.
+ */
+static void fh_checkcons(FibHeap * h)
+{
+	IDnum oDl;
+
+	/*
+	 * Shift in IDnum width, as ceillog2 does: a plain int "1 << fh_Dl"
+	 * is undefined once fh_Dl reaches the width of int.
+	 */
+	if (h->fh_Dl != -1 && h->fh_n <= ((IDnum) 1 << h->fh_Dl))
+		return;
+
+	/* make sure we have enough memory allocated to "reorganize" */
+	oDl = h->fh_Dl;
+	h->fh_Dl = ceillog2(h->fh_n) + 1;
+	if (h->fh_Dl < 8)
+		h->fh_Dl = 8;
+
+	if (oDl != h->fh_Dl)
+		h->fh_cons = (FibHeapNode **) realloc(h->fh_cons,
+				sizeof *h->fh_cons * (h->fh_Dl + 1));
+	if (h->fh_cons == NULL)
+		abort();
+}
+
+/*
+ * Three-way comparison of two nodes by key: -1 if a's key is smaller,
+ * 0 if the keys are equal, 1 otherwise. The heap argument is unused.
+ */
+static int fh_compare(FibHeap * h, FibHeapNode * a, FibHeapNode * b)
+{
+	if (a->fhe_key == b->fhe_key)
+		return 0;
+
+	return (a->fhe_key < b->fhe_key) ? -1 : 1;
+}
+
+/*
+ * Compare a (key, data) pair against node b by wrapping the pair in a
+ * temporary node and delegating to fh_compare.
+ */
+static int
+fh_comparedata(FibHeap * h, Coordinate key, unsigned int data, FibHeapNode * b)
+{
+	FibHeapNode probe;
+
+	probe.fhe_data = data;
+	probe.fhe_key = key;
+
+	return fh_compare(h, &probe, b);
+}
+
+/*
+ * Put node x on the root list, make it the minimum if it beats the current
+ * one, and bump the node count. In key mode keys are compared directly;
+ * otherwise the heap's comparison callback is applied to the data values.
+ */
+static void fh_insertel(FibHeap * h, FibHeapNode * x)
+{
+	int is_new_min;
+
+	fh_insertrootlist(h, x);
+
+	if (h->fh_min == NULL)
+		is_new_min = 1;
+	else if (h->fh_keys)
+		is_new_min = x->fhe_key < h->fh_min->fhe_key;
+	else
+		is_new_min = h->fh_cmp_fnct(x->fhe_data, h->fh_min->fhe_data) < 0;
+
+	if (is_new_min)
+		h->fh_min = x;
+
+	h->fh_n++;
+}
diff --git a/fusion/fibHeap.c b/fusion/fibHeap.c
new file mode 100755
index 0000000..8235ee2
--- /dev/null
+++ b/fusion/fibHeap.c
@@ -0,0 +1,77 @@
+/*
+Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
+
+ This file is part of Velvet.
+
+ Velvet is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ Velvet is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with Velvet; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*/
+#include "fib.h"
+
+// Constructor: creates a key-ordered heap via fh_makekeyheap()
+// and returns whatever that call returns.
+FibHeap *newFibHeap()
+{
+	FibHeap *heap = fh_makekeyheap();
+
+	return heap;
+}
+
+// Insert a (key, node id) pair into the heap; returns the heap node created
+// for it by fh_insertkey().
+FibHeapNode *insertNodeIntoHeap(FibHeap * heap, Coordinate key,
+		unsigned int node)
+{
+	FibHeapNode *handle = fh_insertkey(heap, key, node);
+
+	return handle;
+}
+
+// Smallest key currently in the heap, as reported by fh_minkey()
+Coordinate minKeyOfHeap(FibHeap * heap)
+{
+	Coordinate smallest = fh_minkey(heap);
+
+	return smallest;
+}
+
+// Give the heap node a new key; returns the value of fh_replacekey()
+Coordinate replaceKeyInHeap(FibHeap * heap, FibHeapNode * node,
+		Coordinate newKey)
+{
+	Coordinate previous = fh_replacekey(heap, node, newKey);
+
+	return previous;
+}
+
+// Extract the entry with the smallest key and return the value stored with it
+unsigned int removeNextNodeFromHeap(FibHeap * heap)
+{
+	unsigned int value = (unsigned int) fh_extractmin(heap);
+
+	return value;
+}
+
+// Reports whether the heap is empty, as decided by fh_isempty()
+boolean IsHeapEmpty(FibHeap *heap)
+{
+	boolean empty = fh_isempty(heap);
+
+	return empty;
+}
+
+// Destructor: releases the heap through fh_deleteheap()
+void destroyHeap(FibHeap * heap)
+{
+	fh_deleteheap(heap);
+	return;
+}
+
+// Overwrite the value stored in a heap node; the previous value is discarded
+void replaceValueInHeap(FibHeapNode * node, unsigned int newValue)
+{
+	(void) fh_replacedata(node, newValue);
+}
+
+// Remove an arbitrary node from the heap; the value it carried is discarded
+void destroyNodeInHeap(FibHeapNode * node, FibHeap * heap)
+{
+	(void) fh_delete(heap, node);
+}
diff --git a/fusion/finalFusion b/fusion/finalFusion
new file mode 100755
index 0000000..c5c46b8
Binary files /dev/null and b/fusion/finalFusion differ
diff --git a/fusion/hashFunction.c b/fusion/hashFunction.c
new file mode 100755
index 0000000..f2424fd
--- /dev/null
+++ b/fusion/hashFunction.c
@@ -0,0 +1,83 @@
+#include
+
+
+#define KMER_HASH_MASK 0x0000000000ffffffL
+#define KMER_HASH_BUCKETS 16777216 // 4^12
+
+static int crc_table[256] = {
+ 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419,
+ 0x706af48f, 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4,
+ 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07,
+ 0x90bf1d91, 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de,
+ 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, 0x136c9856,
+ 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
+ 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4,
+ 0xa2677172, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,
+ 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3,
+ 0x45df5c75, 0xdcd60dcf, 0xabd13d59, 0x26d930ac, 0x51de003a,
+ 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 0xcfba9599,
+ 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
+ 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190,
+ 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f,
+ 0x9fbfe4a5, 0xe8b8d433, 0x7807c9a2, 0x0f00f934, 0x9609a88e,
+ 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01,
+ 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed,
+ 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950,
+ 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3,
+ 0xfbd44c65, 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2,
+ 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a,
+ 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5,
+ 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, 0xbe0b1010,
+ 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
+ 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17,
+ 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6,
+ 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af, 0x04db2615,
+ 0x73dc1683, 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8,
+ 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, 0xf00f9344,
+ 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
+ 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a,
+ 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5,
+ 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1,
+ 0xa6bc5767, 0x3fb506dd, 0x48b2364b, 0xd80d2bda, 0xaf0a1b4c,
+ 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, 0x316e8eef,
+ 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
+ 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe,
+ 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31,
+ 0x2cd99e8b, 0x5bdeae1d, 0x9b64c2b0, 0xec63f226, 0x756aa39c,
+ 0x026d930a, 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713,
+ 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b,
+ 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242,
+ 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1,
+ 0x18b74777, 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c,
+ 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, 0xa00ae278,
+ 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7,
+ 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, 0x40df0b66,
+ 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
+ 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605,
+ 0xcdd70693, 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8,
+ 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b,
+ 0x2d02ef8d
+};
+
+/*
+ * Table-driven CRC-style checksum over len bytes of buf, starting from crc.
+ * Returns 0 when buf is NULL.
+ *
+ * NOTE(review): crc is a signed int, so "crc >> 8" on a negative value is
+ * implementation-defined; where that is an arithmetic shift the result will
+ * differ from the standard unsigned CRC-32. Switching to unsigned
+ * arithmetic would change every hash value, so it is left as is.
+ */
+static int crc32(int crc, const char *buf, int len)
+{
+ if (buf == NULL)
+ return 0;
+
+ /* invert all bits before processing */
+ crc = crc ^ 0xffffffff;
+ while (len--) {
+ /* the low byte of (crc XOR next input byte) selects the table entry */
+ crc =
+ crc_table[((int) crc ^ (*buf++)) & 0xff] ^ (crc >> 8);
+ }
+
+ /* invert all bits again for the final value */
+ return crc ^ 0xffffffff;
+}
+
+/*
+ * Hash a k-mer: run crc32 over the raw bytes of the Kmer value and keep
+ * the bits selected by KMER_HASH_MASK.
+ */
+Kmer hash_kmer(Kmer kmer)
+{
+	Kmer hash = crc32(0, (char *) &kmer, sizeof(Kmer));
+
+	return hash & KMER_HASH_MASK;
+}
diff --git a/fusion/inc/check.h b/fusion/inc/check.h
new file mode 100755
index 0000000..db8f1ca
--- /dev/null
+++ b/fusion/inc/check.h
@@ -0,0 +1,5 @@
+
+/* Checked allocation and file helpers; the implementations live elsewhere in the project. */
+/* Allocate `amount` bytes. NOTE(review): presumably exits on failure and returns zeroed memory -- confirm against the implementation. */
+extern void *ckalloc(unsigned long long amount);
+/* Resize block p from old_size to new_size bytes. NOTE(review): failure behaviour is not visible from this header -- confirm. */
+extern void *ckrealloc(void *p, size_t new_size, size_t old_size);
+/* Open file `name` with stdio mode string `mode`. NOTE(review): presumably exits instead of returning NULL -- confirm. */
+extern FILE *ckopen(char *name, char *mode);
+
diff --git a/fusion/inc/darray.h b/fusion/inc/darray.h
new file mode 100755
index 0000000..9b8e4f4
--- /dev/null
+++ b/fusion/inc/darray.h
@@ -0,0 +1,23 @@
+#ifndef __DARRAY__
+#define __DARRAY__
+
+#include
+#include
+#include
+
+/* Growable array of fixed-size items; see darray.c for the operations. */
+typedef struct dynamic_array
+{
+ void *array; /* item storage */
+ long long array_size; /* allocated capacity, counted in items */
+ size_t item_size; /* size of one item in bytes */
+ long long item_c; /* 1 + highest index passed to darrayPut since creation or emptyDarray */
+}DARRAY;
+
+void *darrayPut(DARRAY *darray,long long index);
+void *darrayGet(DARRAY *darray,long long index);
+DARRAY *createDarray(int num_items,size_t unit_size);
+void freeDarray(DARRAY *darray);
+void emptyDarray(DARRAY *darray);
+
+#endif
+
diff --git a/fusion/inc/def.h b/fusion/inc/def.h
new file mode 100755
index 0000000..9c4d5b9
--- /dev/null
+++ b/fusion/inc/def.h
@@ -0,0 +1,296 @@
+/* this file provides some datatype definition */
+#ifndef _DEF
+#define _DEF
+
+#include "def2.h"
+#include "types.h"
+#include "stack.h"
+#include "darray.h"
+
+#define EDGE_BIT_SIZE 6
+#define word_len 12
+#define taskMask 0xf //the low 4 bits
+
+#define MaxEdgeCov 16000
+
+// ASCII nucleotide -> 2-bit code, taken from bits 1-2 of the character: A->0, C->1, T->2, G->3
+#define base2int(base) (char)(((base)&0x06)>>1)
+// 2-bit code -> nucleotide character (inverse of base2int)
+#define int2base(seq) "ACTG"[seq]
+// 2-bit code -> complementary nucleotide character
+#define int2compbase(seq) "TGAC"[seq]
+// 2-bit code -> code of the complementary base (A<->T, C<->G); note seq is expanded unparenthesized
+#define int_comp(seq) (char)(seq^0x02) //(char)((0x4E>>((seq)<<1))&0x03)
+
+int b_ban;
+
+typedef unsigned long long Kmer;
+
+typedef struct edon
+{
+ Kmer kmer;
+ unsigned int ctgLen:1;
+ unsigned int twin:1;
+ unsigned int pos:30;
+ unsigned int ctgID;
+ struct edon *left;
+ struct edon *right;
+}EDON;
+
+struct node_pt;
+
+typedef struct node
+{
+ Kmer kmer;
+ unsigned char links;
+ unsigned char linksB;
+ unsigned char cvg;
+ unsigned char linear:1;
+ unsigned char deleted:1;
+ unsigned char mark:1;
+ unsigned int to_end; // the edge no. it belongs to
+ struct node *left;
+ struct node *right;
+}NODE;
+
+typedef struct node_pt
+{
+ NODE *node;
+ Kmer kmer;
+ boolean isSmaller;
+ struct node_pt *next;
+}NODE_PT;
+
+typedef struct preedge
+{
+ Kmer from_node;
+ Kmer to_node;
+ char *seq;
+ int length;
+ unsigned short cvg;
+ unsigned short bal_edge:2; //indicate whether it's bal_edge is the previous edge, next edge or itself
+}preEDGE;
+
+typedef struct readinterval
+{
+ int readid;
+ unsigned int edgeid;
+ int start;
+ struct readinterval *bal_rv;
+ struct readinterval *nextOnEdge;
+ struct readinterval *prevOnEdge;
+ struct readinterval *nextInRead;
+ struct readinterval *prevInRead;
+}READINTERVAL;
+
+struct arc;
+typedef struct edge
+{
+ unsigned int from_vt;
+ unsigned int to_vt;
+ int length;
+ unsigned short cvg:14;
+ unsigned short bal_edge:2;
+ unsigned short multi:14;
+ unsigned short deleted : 1;
+ unsigned short flag : 1;
+ char *seq;
+ READINTERVAL *rv;
+ struct arc *arcs;
+ long long *markers;
+}EDGE;
+
+typedef struct edge_pt
+{
+ EDGE *edge;
+ struct edge_pt *next;
+}EDGE_PT;
+
+typedef struct vertex
+{
+ Kmer kmer;
+}VERTEX;
+
+typedef struct connection
+{
+ unsigned int contigID;
+ int gapLen;
+
+ unsigned short maxGap;
+ unsigned char minGap;
+ unsigned char bySmall:1;
+ unsigned char weakPoint:1;
+
+ unsigned char weightNotInherit;
+ unsigned char weight;
+ unsigned char maxSingleWeight;
+ unsigned char mask : 1;
+ unsigned char used : 1;
+ unsigned char weak : 1;
+ unsigned char deleted : 1;
+ unsigned char prevInScaf : 1;
+ unsigned char inherit : 1;
+ unsigned char checking : 1;
+ unsigned char singleInScaf : 1;
+ struct connection *nextInScaf;
+ struct connection *next;
+ struct connection *nextInLookupTable;
+ int *PE;
+}CONNECT;
+
+typedef struct prearc
+{
+ unsigned int to_ed;
+ unsigned int multiplicity;
+ struct prearc *next;
+}preARC;
+
+typedef struct contig
+{
+ unsigned int from_vt;
+ unsigned int to_vt;
+ unsigned int length;
+ unsigned short indexInScaf;
+ unsigned char cvg;
+ unsigned char bal_edge:2; // 0, 1 or 2
+ unsigned char mask : 1;
+ unsigned char flag : 1;
+ unsigned char multi: 1;
+ unsigned char inSubGraph: 1;
+ char *seq;
+ CONNECT *downwardConnect;
+ preARC *arcs;
+ STACK *closeReads;
+}CONTIG;
+
+typedef struct read_nearby
+{
+ int len;
+ int dis; // dis to nearby contig or scaffold's start position
+ long long seqStarter; //sequence start position in dynamic array
+}READNEARBY;
+
+typedef struct annotation
+{
+ unsigned long long readID;
+ unsigned int contigID;
+ int pos;
+}ANNOTATION;
+
+typedef struct parameter
+{
+ unsigned char threadID;
+ void **hash_table;
+ unsigned char *mainSignal;
+ unsigned char *selfSignal;
+}PARAMETER;
+
+typedef struct lightannot
+{
+ int contigID;
+ int pos;
+}LIGHTANNOT;
+
+typedef struct edgepatch
+{
+ Kmer from_kmer,to_kmer;
+ unsigned int length;
+ char bal_edge;
+}EDGEPATCH;
+
+typedef struct lightctg
+{
+ unsigned int index;
+ int length;
+ char *seq;
+}LIGHTCTG;
+
+
+typedef struct arc
+{
+ unsigned int to_ed;
+ unsigned int multiplicity;
+ struct arc *prev;
+ struct arc *next;
+ struct arc *bal_arc;
+ struct arc *nextInLookupTable;
+}ARC;
+
+typedef struct arcexist
+{
+ Kmer kmer;
+ struct arcexist *left;
+ struct arcexist *right;
+}ARCEXIST;
+
+typedef struct lib_info
+{
+ int min_ins;
+ int max_ins;
+ int avg_ins;
+ int rd_len_cutoff;
+ int reverse;
+ int asm_flag;
+ int map_len;
+ int pair_num_cut;
+ int rank;
+ //indicate which file is next to be read
+ int curr_type;
+ int curr_index;
+
+ //file handlers to opened files
+ FILE *fp1;
+ FILE *fp2;
+ boolean f1_start;
+ boolean f2_start;
+ //whether last read is read1 in pair
+ int paired; // 0 -- single; 1 -- read1; 2 -- read2;
+
+//type1
+ char **a1_fname;
+ char **a2_fname;
+ int num_a1_file;
+ int num_a2_file;
+
+//type2
+ char **q1_fname;
+ char **q2_fname;
+ int num_q1_file;
+ int num_q2_file;
+
+//type3
+ char **p_fname;
+ int num_p_file; //fasta only
+
+//type4 &5
+ char **s_a_fname;
+ int num_s_a_file;
+ char **s_q_fname;
+ int num_s_q_file;
+
+}LIB_INFO;
+
+typedef struct ctg4heap{
+ unsigned int ctgID;
+ int dis;
+ unsigned char ds_shut4dheap:1; // ignore downstream connections
+ unsigned char us_shut4dheap:1; // ignore upstream connections
+ unsigned char ds_shut4uheap:1; // ignore downstream connections
+ unsigned char us_shut4uheap:1; // ignore upstream connections
+}CTGinHEAP;
+
+typedef struct ctg4scaf{
+ unsigned int ctgID;
+ int start;
+ int end; //position in scaff
+ unsigned int cutHead : 8; //
+ unsigned int cutTail : 7; //
+ unsigned int scaftig_start : 1; //is it a scaftig starter
+ unsigned int mask : 1; // is it masked for further operations
+ unsigned int gapSeqLen:15;
+ int gapSeqOffset;
+}CTGinSCAF;
+
+typedef struct pe_info{
+ int insertS;
+ long long PE_bound;
+ int rank;
+ int pair_num_cut;
+}PE_INFO;
+#endif
diff --git a/fusion/inc/def2.h b/fusion/inc/def2.h
new file mode 100755
index 0000000..677002f
--- /dev/null
+++ b/fusion/inc/def2.h
@@ -0,0 +1,43 @@
+#ifndef _DEF2
+#define _DEF2
+typedef char boolean;
+typedef long long IDnum;
+typedef double Time;
+typedef long long Coordinate;
+// Fibonacci heaps used mainly in Tour Bus
+typedef struct fibheap FibHeap;
+typedef struct fibheap_el FibHeapNode;
+typedef struct dfibheap DFibHeap;
+typedef struct dfibheap_el DFibHeapNode;
+//Memory manager
+typedef struct block_start
+{
+ struct block_start *next;
+}BLOCK_START;
+
+typedef struct recycle_mark
+{
+ struct recycle_mark *next;
+}RECYCLE_MARK;
+
+typedef struct mem_manager
+{
+ BLOCK_START *block_list;
+ int index_in_block;
+ int items_per_block;
+ size_t item_size;
+ RECYCLE_MARK *recycle_list;
+ unsigned long long counter;
+}MEM_MANAGER;
+
+struct dfibheap_el {
+ int dfhe_degree;
+ boolean dfhe_mark;
+ DFibHeapNode *dfhe_p;
+ DFibHeapNode *dfhe_child;
+ DFibHeapNode *dfhe_left;
+ DFibHeapNode *dfhe_right;
+ Time dfhe_key;
+ unsigned int dfhe_data;//void *dfhe_data;
+};
+#endif
diff --git a/fusion/inc/dfib.h b/fusion/inc/dfib.h
new file mode 100755
index 0000000..fa96304
--- /dev/null
+++ b/fusion/inc/dfib.h
@@ -0,0 +1,72 @@
+/*
+Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
+
+ This file is part of Velvet.
+
+ Velvet is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ Velvet is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with Velvet; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*/
+/*-
+ * Copyright 1997, 1998-2003 John-Mark Gurney.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id: dfib.h,v 1.8 2007/04/24 12:16:41 zerbino Exp $
+ *
+ */
+
+#ifndef _DFIB_H_
+#define _DFIB_H_
+
+#include
+#include "def2.h" //#include "globals.h"
+
+/* functions for key heaps */
+DFibHeap *dfh_makekeyheap(void);
+DFibHeapNode *dfh_insertkey(DFibHeap *, Time, unsigned int);
+Time dfh_replacekey(DFibHeap *, DFibHeapNode *, Time);
+unsigned int dfh_replacekeydata(DFibHeap *, DFibHeapNode *, Time, unsigned int);
+
+unsigned int dfh_extractmin(DFibHeap *);
+unsigned int dfh_replacedata(DFibHeapNode *, unsigned int);
+unsigned int dfh_delete(DFibHeap *, DFibHeapNode *);
+void dfh_deleteheap(DFibHeap *);
+
+// DEBUG
+IDnum dfibheap_getSize(DFibHeap *);
+Time dfibheap_el_getKey(DFibHeapNode *);
+// END DEBUG
+
+#endif /* _DFIB_H_ */
diff --git a/fusion/inc/dfibHeap.h b/fusion/inc/dfibHeap.h
new file mode 100755
index 0000000..120252c
--- /dev/null
+++ b/fusion/inc/dfibHeap.h
@@ -0,0 +1,43 @@
+/*
+Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
+
+ This file is part of Velvet.
+
+ Velvet is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ Velvet is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with Velvet; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*/
+#ifndef _DFIBHEAP_H_
+#define _DFIBHEAP_H_
+
+DFibHeap *newDFibHeap();
+
+DFibHeapNode *insertNodeIntoDHeap(DFibHeap * heap, Time key, unsigned int node);
+
+Time replaceKeyInDHeap(DFibHeap * heap, DFibHeapNode * node, Time newKey);
+
+unsigned int removeNextNodeFromDHeap(DFibHeap * heap);
+
+void destroyDHeap(DFibHeap * heap);
+
+boolean HasMin(DFibHeap *h);
+
+void replaceValueInDHeap(DFibHeapNode * node, unsigned int newValue);
+
+void *destroyNodeInDHeap(DFibHeapNode * node, DFibHeap * heap);
+
+IDnum getDFibHeapSize(DFibHeap * heap);
+
+Time getKey(DFibHeapNode * node);
+#endif
diff --git a/fusion/inc/dfibpriv.h b/fusion/inc/dfibpriv.h
new file mode 100755
index 0000000..fb0d5b3
--- /dev/null
+++ b/fusion/inc/dfibpriv.h
@@ -0,0 +1,96 @@
+/*
+Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
+
+ This file is part of Velvet.
+
+ Velvet is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ Velvet is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with Velvet; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*/
+/*-
+ * Copyright 1997, 1999-2003 John-Mark Gurney.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id: dfibpriv.h,v 1.8 2007/10/09 09:56:46 zerbino Exp $
+ *
+ */
+
+#ifndef _DFIBPRIV_H_
+#define _DFIBPRIV_H_
+
+//#include "globals.h"
+#include "def2.h"
+
+/*
+ * specific node operations
+ */
+
+static DFibHeapNode *dfhe_newelem(DFibHeap *);
+static void dfhe_insertafter(DFibHeapNode * a, DFibHeapNode * b);
+static inline void dfhe_insertbefore(DFibHeapNode * a, DFibHeapNode * b);
+static DFibHeapNode *dfhe_remove(DFibHeapNode * a);
+
+/*
+ * global heap operations
+ */
+struct dfibheap { // heap-wide state for the heap whose keys are of type Time (see dfh_comparedata); presumably typedef'd as DFibHeap in def2.h - confirm
+ MEM_MANAGER *nodeMemory; // presumably the pool DFibHeapNode objects are allocated from (see dfhe_newelem) - TODO confirm
+ IDnum dfh_n; // presumably the number of nodes currently in the heap - confirm in the implementation
+ IDnum dfh_Dl; // NOTE(review): likely the size/degree bound of dfh_cons - confirm
+ DFibHeapNode **dfh_cons; // array of node pointers; by name, scratch space for dfh_consolidate - confirm
+ DFibHeapNode *dfh_min; // presumably the node holding the minimum key
+ DFibHeapNode *dfh_root; // presumably an entry point into the root list (see dfh_insertrootlist / dfh_removerootlist)
+};
+
+static void dfh_insertrootlist(DFibHeap *, DFibHeapNode *);
+static void dfh_removerootlist(DFibHeap *, DFibHeapNode *);
+static void dfh_consolidate(DFibHeap *);
+static void dfh_heaplink(DFibHeap * h, DFibHeapNode * y, DFibHeapNode * x);
+static void dfh_cut(DFibHeap *, DFibHeapNode *, DFibHeapNode *);
+static void dfh_cascading_cut(DFibHeap *, DFibHeapNode *);
+static DFibHeapNode *dfh_extractminel(DFibHeap *);
+static void dfh_checkcons(DFibHeap * h);
+static int dfh_compare(DFibHeap * h, DFibHeapNode * a, DFibHeapNode * b);
+static int dfh_comparedata(DFibHeap * h, Time key,
+ unsigned int data, DFibHeapNode * b);
+static void dfh_insertel(DFibHeap * h, DFibHeapNode * x);
+
+
+/*
+ * general functions
+ */
+static inline IDnum ceillog2(IDnum a);
+
+#endif /* _DFIBPRIV_H_ */
diff --git a/fusion/inc/extfunc.h b/fusion/inc/extfunc.h
new file mode 100755
index 0000000..72052f2
--- /dev/null
+++ b/fusion/inc/extfunc.h
@@ -0,0 +1,209 @@
+/***************************************************************************
+ * Title: extfunc.h
+ * Author: Haixu Tang
+ * Created: Jun. 2002
+ * Last modified: May. 2004
+ *
+ * Copyright (c) 2001-2004 The Regents of the University of California
+ * All Rights Reserved
+ * See file LICENSE for details.
+ ***************************************************************************/
+#include "check.h"
+#include "extfunc2.h"
+extern NODE **seq2nodes_with_pair(char *seqfile,char *outfile);
+extern NODE **prlSeq2nodes_with_pair(char *seqfile,char *outfile);
+extern void readseq1by1(char *src_seq, char *src_name,int *len_seq, FILE *fp,long long num_seq);
+extern void readseqPbyP(char *src_seq, char *src_name,int *insertS,int *len_seq, FILE *fp,long long num_seq);
+extern void nodes2edges_with_pair(NODE **hash_table,EDGE_PT **edge_list,char *outfile);
+extern int findOrInsertOccurenceInNodeTree(Kmer kmer, NODE ** T);
+extern NODE *SplayNodeTree(NODE * T,Kmer kmer);
+extern Kmer reverseComplement(Kmer word,int overlap);
+extern Kmer hash_kmer(Kmer kmer);
+extern void link2next(NODE *node,char ch);
+extern unsigned char check_link2next(NODE *node,char ch);
+extern void unlink2next(NODE *node,char ch);
+extern void link2prev(NODE *node,char ch);
+extern unsigned char check_link2prev(NODE *node,char ch);
+extern void unlink2prev(NODE *node,char ch);
+extern int count_link2next(NODE *node);
+extern int count_link2prev(NODE *node);
+extern Kmer nextKmer(Kmer prev,char ch);
+extern Kmer prevKmer(Kmer next,char ch);
+extern long long readseqpar(int *max_len,int *min_leg,int *max_name_len,FILE *fp);
+extern void destroyNodeHash(NODE **hash_table);
+extern void free_edge_list(EDGE_PT *el);
+extern void reverseComplementSeq(char *seq, int len,char *bal_seq);
+extern void free_node_list(NODE_PT *np);
+extern NODE *SplayNodeTree_FILTER(NODE *T,Kmer kmer);
+extern NODE *allocateNode_cvg(Kmer kmer);
+extern int findOrInsertOccurenceInNodeTree_cvg(Kmer kmer, NODE **T);
+extern void free_edge_array(EDGE *ed_array,int ed_num);
+extern void free_lightctg_array(LIGHTCTG *ed_array,int ed_num);
+extern char getCharInTightString(char *tightSeq,int pos);
+extern void writeChar2tightSting(char nt,char *tightSeq,int pos);
+extern void short_reads_sum();
+extern void read_one_sequence(FILE *fp,long long *T,char **X);
+extern void output_edges(preEDGE *ed_array,int ed_num,char *outfile);
+extern void read2edge(char *seqfile,NODE **hash_table,char *outfile);
+extern void loadVertex(char *graphfile);
+extern int kmer2vt(Kmer kmer);
+extern void loadEdge(char *graphfile);
+extern boolean loadPath(char *graphfile);
+extern READINTERVAL *allocateRV(int readid,int edgeid);
+extern void createRVmemo();
+extern void dismissRV(READINTERVAL *rv);
+extern void destroyReadIntervMem();
+extern void destroyConnectMem();
+extern void u2uConcatenate();
+extern void unlink2all(NODE *node,NODE **hash_table);
+extern void cutTip(NODE **hash_table);
+extern void output_contig(EDGE *ed_array,unsigned int ed_num,char *outfile,int cut_len);
+extern void printTightString(char *tightSeq,int len);
+extern int roughUniqueness(unsigned int edgeno,char ignore_cvg,char *ignored);
+extern void outputReadPos(char *graphfile,int min_len);
+extern NODE *reverseComplementNode(NODE *node1,NODE **hash_table);
+extern void testSearch();
+extern void print_kmer(FILE *fp,Kmer kmer,char c);
+extern void allpathConcatenate();
+extern void output_updated_edges(char *outfile);
+extern void output_updated_vertex(char *outfile);
+extern void loadUpdatedEdges(char *graphfile);
+extern void loadUpdatedVertex(char *graphfile);
+extern void connectByPE(char *infile);
+extern void output_cntGVZ(char *outfile);
+extern void output_graph(char *outfile);
+extern void removeUnreliable(NODE **hash_talbe);
+extern void testLinearC2C();
+extern void output_contig_graph(char *outfile);
+extern void scaffolding(unsigned int cut_len,char *outfile);
+extern int cmp_int(const void *a,const void *b);
+extern CONNECT *allocateCN(unsigned int contigId, int gap);
+extern int recoverRep();
+extern void loadPEgrads(char *infile);
+extern int putInsertS(long long readid,int size,int *currGrads);
+extern int getInsertS(long long readid,int *readlen);
+extern int connectByPE_grad(FILE *fp,int peGrad,char *line);
+extern void PEgradsScaf(char *infile);
+extern void reorderAnnotation(char *infile,char *outfile);
+extern int count_ends(NODE **hash_table);
+extern void output_1edge(preEDGE *edge, FILE *fp);
+extern void prlRead2edge(char *libfile,char *outfile);
+extern int count_edges(NODE **hash_table);
+extern int prlFindOrInsertOccurenceInNodeTree_cvg(Kmer kmer, NODE ** T,MEM_MANAGER *node_mem_manager);
+extern void prlDestroyNodeHash(NODE **hash_table);
+extern void annotFileTrans(char *infile,char *outfile);
+extern void prlLoadPath(char *graphfile);
+extern void misCheck(char *infile,char *outfile);
+extern int uniqueLenSearch(unsigned int *len_array,unsigned int *flag_array,int num,unsigned int target);
+extern int cmp_vertex(const void *a,const void *b);
+extern void linkContig2Vts();
+extern int bisearch(VERTEX *vts,int num,Kmer target);
+extern int connectByPE_gradPatch(FILE *fp1,FILE *fp2,int peGrad,char *line1,char *line2);
+extern void scaftiging(char *graphfile,int len_cut);
+extern void gapFilling(char *graphfile,int cut_len);
+extern ARC *getArcBetween(unsigned int from_ed, unsigned int to_ed);
+extern void bubblePinch(double simiCutoff,char *outfile,int M);
+extern void linearConcatenate();
+extern unsigned char setArcMulti(unsigned int from_ed,unsigned int to_ed,unsigned char value);
+extern ARC *allocateArc(unsigned int edgeid);
+extern void cutTipsInGraph(int cutLen, boolean strict);
+extern ARC *deleteArc(ARC *arc_list,ARC *arc);
+extern void compactEdgeArray();
+extern void dismissArc(ARC *arc);
+extern void createArcMemo();
+extern ARC *getArcBetween(unsigned int from_ed, unsigned int to_ed);
+extern ARC *allocateArc(unsigned int edgeid);
+extern void unlink2prevUncertain(NODE *node,char ch,boolean smaller);
+extern char firstCharInKmer(Kmer kmer);
+extern void writeChar2tightString(char nt,char *tightSeq,int pos);
+extern Kmer reverseComplementVerbose(Kmer word,int overlap);
+extern Kmer KmerPlus(Kmer prev,char ch);
+extern void output_heavyArcs(char *outfile);
+extern preARC *allocatePreArc(unsigned int edgeid);
+extern void destroyPreArcMem();
+extern void traceAlongArc(unsigned int destE,unsigned int currE,int max_steps,int min,int max,int index,int len,int *num_route);
+extern void freeContig_array();
+extern void output_scafSeq(char *graphfile,int len_cut);
+extern void putArcInHash(unsigned int from_ed,unsigned int to_ed);
+extern boolean DoesArcExist(unsigned int from_ed,unsigned int to_ed);
+extern void recordArcInHash();
+extern void destroyArcHash();
+extern void removeWeakEdges(int lenCutoff,unsigned int multiCutoff);
+extern void createArcLookupTable();
+extern void deleteArcLookupTable();
+extern void putArc2LookupTable(unsigned int from_ed,ARC *arc);
+extern void removeArcInLookupTable(unsigned int from_ed,unsigned int to_ed);
+extern ARC *arcCount(unsigned int edgeid,unsigned int *num);
+extern void mapFileTrans(char *infile);
+extern void solveReps();
+extern void removeDeadArcs();
+extern void destroyArcMem();
+extern int count_link2prevB(NODE *node);
+extern int count_link2nextB(NODE *node);
+extern void getCntsInFile(char *infile);
+extern void scafByCntInfo(char *infile);
+extern CONNECT *add1Connect(unsigned int e1, unsigned int e2, int gap, int weight,boolean inherit);
+extern void getScaff(char *infile);
+extern void traceAlongMaskedCnt(unsigned int destE,unsigned int currE,int max_steps,int min,int max,int index,int len,int *num_route);
+extern void createPreArcMemManager();
+extern boolean loadPathBin(char *graphfile);
+extern void analyzeTips(NODE **hash_table, char *graphfile);
+extern void recordArcsInLookupTable();
+extern FILE *multiFileRead1seq(char *src_seq, char *src_name, int *len_seq, FILE *fp,FILE *freads);
+extern void multiFileSeqpar(FILE *fp);
+extern long long multiFileParse(int *max_leg, int *min_leg,int *max_name_leg, FILE *fp);
+extern CONNECT *getCntBetween(unsigned int from_ed, unsigned int to_ed);
+extern void createCntMemManager();
+extern void destroyConnectMem();
+extern void createCntLookupTable();
+extern void deleteCntLookupTable();
+extern void putCnt2LookupTable(unsigned int from_c,CONNECT *cnt);
+extern int prlFindOrInsertOccurenceInEdonTree(Kmer kmer, EDON ** T,MEM_MANAGER *node_mem_manager);
+extern EDON *SplayEdonTree(EDON * T,Kmer kmer);
+extern void prlDestroyEdonHash(EDON **hash_table);
+extern void prlRead2Ctg(char *seqfile,char *outfile);
+extern void prlLongRead2Ctg(char *libfile,char *outfile);
+extern boolean prlContig2nodes(char *grapfile,int len_cut);
+extern void scan_libInfo(char *libfile);
+extern int getMaxLongReadLen(int num_libs);
+extern void free_libs();
+extern boolean read1seqInLib(char *src_seq, char *src_name, int *len_seq,
+ int *libNo,boolean pair,unsigned char purpose);
+extern NODE **prlEdge2nodes(char *grapfile);
+extern void prlRead2graph(char *libfile,NODE **hash_table,char *outfile);
+extern void save4laterSolve();
+extern void solveRepsAfter();
+extern void free_pe_mem();
+extern void alloc_pe_mem(int gradsCounter);
+extern NODE *searchNodeTree(NODE * T,Kmer kmer);
+extern EDON *searchEdonTree(EDON * T,Kmer kmer);
+extern void prlDestroyPreArcMem();
+extern preARC *prlAllocatePreArc(unsigned int edgeid,MEM_MANAGER *manager);
+extern boolean prlRead2HashTable(char *libfile,char *outfile);
+extern void free_allSets();
+extern void removeSingleTips();
+extern void removeMinorTips();
+extern void kmer2edges(char *outfile);
+extern void output_vertex(char *outfile);
+extern boolean prlRead2HashTable(char *libfile,char *outfile);
+extern void Links2Scaf(char *infile);
+extern void PE2Links(char *infile);
+extern void basicContigInfo(char *infile);
+extern unsigned int getTwinCtg(unsigned int ctg);
+extern boolean isSmallerThanTwin(unsigned int ctg);
+extern boolean isLargerThanTwin(unsigned int ctg);
+extern boolean isSameAsTwin(unsigned int ctg);
+extern boolean loadMarkerBin(char *graphfile);
+extern void readsCloseGap(char *graphfile);
+extern void prlReadsCloseGap(char *graphfile);
+extern void locateReadOnScaf(char *graphfile);
+extern unsigned int getTwinEdge(unsigned int edge);
+extern boolean EdSmallerThanTwin(unsigned int edge);
+extern boolean EdLargerThanTwin(unsigned int edge);
+extern boolean EdSameAsTwin(unsigned int edge);
+extern void removeLowCovEdges(int lenCutoff,unsigned short covCutoff);
+extern int localGraph(READNEARBY *rdArray,int num,CTGinSCAF *ctg1,CTGinSCAF *ctg2,
+ int origOverlap,Kmer *kmerCtg1,Kmer *kmerCtg2,
+ int overlap,DARRAY *gapSeqArray,char *seqCtg1,char *seqCtg2,char *seqGap);
+
+
diff --git a/fusion/inc/extfunc2.h b/fusion/inc/extfunc2.h
new file mode 100755
index 0000000..cf64e20
--- /dev/null
+++ b/fusion/inc/extfunc2.h
@@ -0,0 +1,7 @@
+#ifndef _MEM_MANAGER
+#define _MEM_MANAGER
+extern MEM_MANAGER *createMem_manager(int num_items,size_t unit_size);
+extern void *getItem(MEM_MANAGER *mem_Manager);
+extern void returnItem(MEM_MANAGER *mem_Manager,void *);
+extern void freeMem_manager(MEM_MANAGER *mem_Manager);
+#endif
diff --git a/fusion/inc/extvab.h b/fusion/inc/extvab.h
new file mode 100755
index 0000000..a0baca8
--- /dev/null
+++ b/fusion/inc/extvab.h
@@ -0,0 +1,92 @@
+/***************************************************************************
+ * Title: extvab.h
+ * Author: Hongmei Zhu
+ * Created: Jun. 2007
+ * Last modified: May. 2009
+ *
+ * All Rights Reserved
+ * See file LICENSE for details.
+ ***************************************************************************/
+/*** global variables ****/
+extern int overlaplen;
+extern int inGraph;
+extern long long n_ban;
+extern Kmer WORDFILTER;
+extern boolean globalFlag;
+extern int thrd_num;
+
+extern int verbosity;
+extern char verboseStr[verboseBufSize];
+
+/**** reads info *****/
+extern long long n_solexa;
+extern long long prevNum;
+extern int ins_size_var;
+extern PE_INFO *pes;
+extern int maxReadLen;
+extern int maxReadLen4all;
+extern int minReadLen;
+extern int maxNameLen;
+extern int num_libs;
+extern LIB_INFO *lib_array;
+extern int libNo;
+extern long long readNumBack;
+extern int gradsCounter;
+/*** used for pregraph *****/
+extern MEM_MANAGER *prearc_mem_manager; //also used in scaffolding
+extern MEM_MANAGER **preArc_mem_managers;
+extern boolean deLowKmer;
+extern boolean deLowEdge;
+extern KmerSet **KmerSets; // also used in mapping
+extern KmerSet **KmerSetsPatch;
+
+extern spcKmerSet *spcSet;
+
+/**** used for contiging ****/
+extern boolean repsTie;
+extern long long arcCounter;
+extern unsigned int num_ed;
+extern unsigned int num_ed_limit;
+extern unsigned int extraEdgeNum;
+extern EDGE *edge_array;
+extern VERTEX *vt_array;
+extern MEM_MANAGER *rv_mem_manager;
+extern MEM_MANAGER *arc_mem_manager;
+extern unsigned int num_vt;
+extern int len_bar;
+extern ARC **arcLookupTable;
+extern long long *markersArray;
+/***** used for scaffolding *****/
+extern MEM_MANAGER *cn_mem_manager;
+extern unsigned int num_ctg;
+extern unsigned int *index_array;
+extern CONTIG *contig_array;
+extern int lineLen;
+extern int weakPE;
+extern long long newCntCounter;
+extern CONNECT **cntLookupTable;
+extern unsigned int ctg_short;
+extern int cvgAvg;
+extern boolean orig2new;
+/**** used for gapFilling ****/
+extern DARRAY *readSeqInGap;
+extern DARRAY *gapSeqDarray;
+extern DARRAY **darrayBuf;
+extern int fillGap;
+/**** used for searchPath *****/
+extern int maxSteps;
+extern int num_trace;
+extern unsigned int**found_routes;
+extern unsigned int*so_far;
+extern int max_n_routes;
+extern boolean maskRep;
+extern int GLDiff;
+extern int initKmerSetSize;
+extern char *shortrdsfile;
+extern char *graphfile;
+extern double OverlapPercent ;
+extern double ConflPercent ;
+extern double close_threshold;
+extern int bund_threshold;
+extern char *ctg_file;
+//extern boolean large_kmer;
diff --git a/fusion/inc/fib.h b/fusion/inc/fib.h
new file mode 100755
index 0000000..40ac9d3
--- /dev/null
+++ b/fusion/inc/fib.h
@@ -0,0 +1,81 @@
+/*
+Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
+
+ This file is part of Velvet.
+
+ Velvet is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ Velvet is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with Velvet; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*/
+/*-
+ * Copyright 1997, 1998-2003 John-Mark Gurney.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id: fib.h,v 1.9 2007/04/24 12:16:41 zerbino Exp $
+ *
+ */
+
+#ifndef _FIB_H_
+#define _FIB_H_
+
+//#include "globals.h"
+#include
+#include "def2.h"
+
+typedef Coordinate(*voidcmp) (unsigned int , unsigned int);
+
+/* functions for key heaps */
+boolean fh_isempty(FibHeap *);
+FibHeap *fh_makekeyheap(void);
+FibHeapNode *fh_insertkey(FibHeap *, Coordinate, unsigned int);
+Coordinate fh_minkey(FibHeap *);
+Coordinate fh_replacekey(FibHeap *, FibHeapNode *, Coordinate);
+unsigned int fh_replacekeydata(FibHeap *, FibHeapNode *, Coordinate, unsigned int);
+
+/* functions for unsigned int * heaps */
+FibHeap *fh_makeheap(void);
+voidcmp fh_setcmp(FibHeap *, voidcmp);
+unsigned int fh_setneginf(FibHeap *, unsigned int);
+FibHeapNode *fh_insert(FibHeap *, unsigned int);
+
+/* shared functions */
+unsigned int fh_extractmin(FibHeap *);
+unsigned int fh_min(FibHeap *);
+unsigned int fh_replacedata(FibHeapNode *, unsigned int);
+unsigned int fh_delete(FibHeap *, FibHeapNode *);
+void fh_deleteheap(FibHeap *);
+FibHeap *fh_union(FibHeap *, FibHeap *);
+
+#endif /* _FIB_H_ */
diff --git a/fusion/inc/fibHeap.h b/fusion/inc/fibHeap.h
new file mode 100755
index 0000000..e4adbb3
--- /dev/null
+++ b/fusion/inc/fibHeap.h
@@ -0,0 +1,43 @@
+/*
+Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
+
+ This file is part of Velvet.
+
+ Velvet is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ Velvet is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with Velvet; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*/
+#ifndef _FIBHEAP_H_
+#define _FIBHEAP_H_
+
+FibHeap *newFibHeap();
+
+FibHeapNode *insertNodeIntoHeap(FibHeap * heap, Coordinate key,
+ unsigned int node);
+
+Coordinate minKeyOfHeap(FibHeap * heap);
+
+Coordinate replaceKeyInHeap(FibHeap * heap, FibHeapNode * node,
+ Coordinate newKey);
+
+void replaceValueInHeap(FibHeapNode * node, unsigned int newValue);
+
+unsigned int removeNextNodeFromHeap(FibHeap * heap);
+
+void *destroyNodeInHeap(FibHeapNode * node, FibHeap * heap);
+
+void destroyHeap(FibHeap * heap);
+
+boolean IsHeapEmpty(FibHeap *heap);
+#endif
diff --git a/fusion/inc/fibpriv.h b/fusion/inc/fibpriv.h
new file mode 100755
index 0000000..651a3da
--- /dev/null
+++ b/fusion/inc/fibpriv.h
@@ -0,0 +1,110 @@
+/*
+Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
+
+ This file is part of Velvet.
+
+ Velvet is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ Velvet is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with Velvet; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*/
+/*-
+ * Copyright 1997, 1999-2003 John-Mark Gurney.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id: fibpriv.h,v 1.10 2007/10/09 09:56:46 zerbino Exp $
+ *
+ */
+
+#ifndef _FIBPRIV_H_
+#define _FIBPRIV_H_
+
+#include "def2.h"
+
+/*
+ * specific node operations
+ */
+struct fibheap_el { // one heap node; presumably typedef'd as FibHeapNode in def2.h - confirm
+ int fhe_degree; // presumably the number of children of this node - confirm in the implementation
+ boolean fhe_mark; // presumably the mark consulted by fh_cascading_cut - confirm
+ FibHeapNode *fhe_p; // presumably the parent node
+ FibHeapNode *fhe_child; // presumably one of this node's children
+ FibHeapNode *fhe_left; // fhe_left / fhe_right: sibling links; presumably a circular doubly linked list - confirm
+ FibHeapNode *fhe_right;
+ Coordinate fhe_key; // key of type Coordinate, the key type taken by fh_insertkey / fh_replacekey in fib.h
+ unsigned int fhe_data; // unsigned int payload stored alongside the key
+};
+
+static FibHeapNode *fhe_newelem(struct fibheap *);
+static void fhe_initelem(FibHeapNode *);
+static void fhe_insertafter(FibHeapNode * a, FibHeapNode * b);
+static inline void fhe_insertbefore(FibHeapNode * a, FibHeapNode * b);
+static FibHeapNode *fhe_remove(FibHeapNode * a);
+
+/*
+ * global heap operations
+ */
+struct fibheap { // heap-wide state; presumably typedef'd as FibHeap in def2.h - confirm
+ Coordinate(*fh_cmp_fnct) (unsigned int, unsigned int); // comparison callback over two data values; same signature as voidcmp in fib.h
+ MEM_MANAGER *nodeMemory; // presumably the pool nodes are allocated from (see fhe_newelem) - TODO confirm
+ IDnum fh_n; // presumably the number of nodes currently in the heap
+ IDnum fh_Dl; // NOTE(review): likely the size/degree bound of fh_cons - confirm in the implementation
+ FibHeapNode **fh_cons; // array of node pointers; by name, scratch space for fh_consolidate - confirm
+ FibHeapNode *fh_min; // presumably the current minimum node
+ FibHeapNode *fh_root; // presumably an entry point into the root list (see fh_insertrootlist / fh_removerootlist)
+ unsigned int fh_neginf; // presumably the data value treated as "negative infinity", set via fh_setneginf - confirm
+ boolean fh_keys:1; // 1-bit flag; presumably tells key heaps (fh_makekeyheap) from data heaps (fh_makeheap) - confirm
+};
+static void fh_initheap(FibHeap *);
+static void fh_insertrootlist(FibHeap *, FibHeapNode *);
+static void fh_removerootlist(FibHeap *, FibHeapNode *);
+static void fh_consolidate(FibHeap *);
+static void fh_heaplink(FibHeap * h, FibHeapNode * y, FibHeapNode * x);
+static void fh_cut(FibHeap *, FibHeapNode *, FibHeapNode *);
+static void fh_cascading_cut(FibHeap *, FibHeapNode *);
+static FibHeapNode *fh_extractminel(FibHeap *);
+static void fh_checkcons(FibHeap * h);
+static void fh_destroyheap(FibHeap * h);
+static int fh_compare(FibHeap * h, FibHeapNode * a, FibHeapNode * b);
+static int fh_comparedata(FibHeap * h, Coordinate key,
+ unsigned int data, FibHeapNode * b);
+static void fh_insertel(FibHeap * h, FibHeapNode * x);
+
+/*
+ * general functions
+ */
+static inline IDnum ceillog2(IDnum a);
+
+#endif /* _FIBPRIV_H_ */
diff --git a/fusion/inc/general.h b/fusion/inc/general.h
new file mode 100755
index 0000000..ba52eb5
--- /dev/null
+++ b/fusion/inc/general.h
@@ -0,0 +1,89 @@
+/*
+ * Filename: general.h
+ *
+ *
+ * Description:
+ * Basic functions
+ *
+ * Created on: Feb 8, 2010
+ * Author: Ruibang Luo, BGI
+ *
+ * History:
+ * 1.
+ */
+
+#pragma once
+#ifndef GENERAL_H_AQUA_
+#define GENERAL_H_AQUA_
+
+#include
+
+//Useful Variables*************************************************************
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#define FN_SIZE 2048
+//*****************************************************************************
+
+//Types************************************************************************
+typedef unsigned int uint;
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+typedef unsigned long ulong;
+typedef unsigned long long ulonglong;
+
+typedef unsigned char BYTE;
+typedef unsigned short WORD;
+typedef unsigned int DWORD;
+
+typedef unsigned char u8_t;
+typedef unsigned short u16_t;
+typedef unsigned int u32_t;
+typedef unsigned long long u64_t;
+
+typedef char * chptr;
+
+//*****************************************************************************
+
+//Debugging********************************************************************
+//Verbose system
+//Verbosity should be separated into 4 levels: 0, 1, 2, 3
+#define VERBOSITY_BOTTOM 0
+#define VERBOSITY_TOP 4
+int ModifyVerbosity(const int);
+#define verboseBufSize 16384
+
+#define ModVerboseStrAndVerbose(level, ...) /* Log to stderr when (verbosity >> level) is non-zero; the variadic args are a printf-style format plus its values. */ \
+ {\
+ if(verbosity >> (level)) /* level is parenthesized so an expression argument (e.g. a & 1) keeps its meaning */ \
+ {\
+ snprintf(verboseStr, verboseBufSize, ##__VA_ARGS__); /* format into the shared verboseStr buffer, bounded by verboseBufSize */ \
+ fprintf(stderr,"[%s]:%s\n",__FUNCTION__,verboseStr); /* prefix the message with the calling function's name */ \
+ }\
+ } /* NOTE: expands to a bare brace block, so "if (c) M(...); else ..." does not compile; brace the if-body at such call sites */
+#define mvnv(level, ...) ModVerboseStrAndVerbose(level, ##__VA_ARGS__) /* short alias for ModVerboseStrAndVerbose */
+#define die(...) /* Log the printf-style message at level 0, print "Program terminated." and exit with EXIT_FAILURE. */ \
+ {\
+ ModVerboseStrAndVerbose(0, ##__VA_ARGS__); /* level 0: the message is printed whenever verbosity is non-zero */ \
+ fprintf(stderr,"Program terminated.\n");\
+ exit(EXIT_FAILURE);\
+ }
+#define sigdie(sig, ...) /* Log the printf-style message at level 0, print "Program terminated." and exit with status sig. */ \
+ {\
+ ModVerboseStrAndVerbose(0, ##__VA_ARGS__); /* level 0: the message is printed whenever verbosity is non-zero */ \
+ fprintf(stderr,"Program terminated.\n");\
+ exit(sig); /* sig is passed straight through as the process exit status */ \
+ }
+#define perrdie(...) /* Log the printf-style message at level 0, print the errno description via perror(""), then exit with EXIT_FAILURE. */ \
+ {\
+ ModVerboseStrAndVerbose(0, ##__VA_ARGS__); /* level 0: the message is printed whenever verbosity is non-zero */ \
+ perror(""); /* NOTE(review): the logging calls above run first and may overwrite errno - confirm the reported error is the intended one */ \
+ fprintf(stderr,"Program terminated.\n");\
+ exit(EXIT_FAILURE);\
+ }
+#define mk /* Debug marker: prints the enclosing function name and the current line number to stderr. */ \
+{\
+ fprintf(stderr, "DBG Marker @ %s:%d\n", __FUNCTION__, __LINE__);\
+}
+
+#endif
diff --git a/fusion/inc/global.h b/fusion/inc/global.h
new file mode 100755
index 0000000..5e7c71d
--- /dev/null
+++ b/fusion/inc/global.h
@@ -0,0 +1,74 @@
+int overlaplen=25;
+int verbosity=3;
+char verboseStr[verboseBufSize];
+int inGraph;
+long long n_ban;
+long long n_solexa=0;
+long long prevNum=0;
+int ins_size_var=20;
+PE_INFO *pes=NULL;
+MEM_MANAGER *rv_mem_manager=NULL;
+MEM_MANAGER *cn_mem_manager=NULL;
+MEM_MANAGER *arc_mem_manager=NULL;
+unsigned int num_vt=0;
+unsigned int **found_routes=NULL;
+unsigned int *so_far=NULL;
+int max_n_routes = 10;
+int num_trace;
+Kmer WORDFILTER;
+unsigned int num_ed=0;
+unsigned int num_ctg=0;
+unsigned int num_ed_limit;
+unsigned int extraEdgeNum;
+EDGE *edge_array=NULL;
+VERTEX *vt_array=NULL;
+unsigned int *index_array=NULL;
+CONTIG *contig_array=NULL;
+int lineLen;
+int len_bar=100;
+int weakPE=3;
+int fillGap=0;
+boolean globalFlag;
+long long arcCounter;
+MEM_MANAGER *prearc_mem_manager=NULL;
+MEM_MANAGER **preArc_mem_managers=NULL;
+int maxReadLen=0;
+int maxReadLen4all=0;
+int minReadLen=0;
+int maxNameLen=0;
+ARC **arcLookupTable=NULL;
+long long *markersArray=NULL;
+boolean deLowKmer=0;
+boolean deLowEdge=1;
+long long newCntCounter;
+boolean repsTie=0;
+CONNECT **cntLookupTable=NULL;
+int num_libs=0;
+LIB_INFO *lib_array=NULL;
+int libNo=0;
+long long readNumBack;
+int gradsCounter;
+unsigned int ctg_short=0;
+int thrd_num=8;
+int cvgAvg=0;
+KmerSet **KmerSets=NULL;
+KmerSet **KmerSetsPatch=NULL;
+
+spcKmerSet *spcSet = NULL;
+
+DARRAY *readSeqInGap=NULL;
+DARRAY *gapSeqDarray=NULL;
+DARRAY **darrayBuf;
+boolean orig2new;
+int maxSteps;
+boolean maskRep=1;
+int GLDiff=50;
+int initKmerSetSize = 0;
+char *shortrdsfile;
+char *graphfile;
+double OverlapPercent = 0.05;
+double ConflPercent = 0.05;
+double close_threshold = 0.1;
+int bund_threshold=5;
+char *ctg_file=NULL;
+//boolean large_kmer=0;
diff --git a/fusion/inc/newhash.h b/fusion/inc/newhash.h
new file mode 100644
index 0000000..6a1fd1d
--- /dev/null
+++ b/fusion/inc/newhash.h
@@ -0,0 +1,122 @@
+#ifndef __NEW_HASH_RJ
+#define __NEW_HASH_RJ
+
+#ifndef K_LOAD_FACTOR
+#define K_LOAD_FACTOR 0.75
+#endif
+
+#define MAX_KMER_COV 63 /* same value as EDGE_XOR_MASK: the largest a 6-bit counter can hold */
+#define EDGE_BIT_SIZE 6 /* width in bits of one link counter */
+#define EDGE_XOR_MASK 0x3FU /* mask selecting one EDGE_BIT_SIZE-bit counter */
+#define LINKS_BITS 0x00FFFFFFU /* low 24 bits, i.e. 4 * EDGE_BIT_SIZE */
+/* Read / write the seq member of a struct passed by value (not by pointer). */
+#define get_kmer_seq(mer) ((mer).seq)
+#define set_kmer_seq(mer, val) ((mer).seq = val)
+/* l_links packs four EDGE_BIT_SIZE-bit counters; idx 0..3 selects one, *_covs sums all four. */
+#define get_kmer_left_cov(mer, idx) (((mer).l_links>>((idx)*EDGE_BIT_SIZE))&EDGE_XOR_MASK)
+#define set_kmer_left_cov(mer, idx, val) ((mer).l_links = ((mer).l_links&(~(EDGE_XOR_MASK<<((idx)*EDGE_BIT_SIZE)))) | (((val)&EDGE_XOR_MASK)<<((idx)*EDGE_BIT_SIZE)) )
+#define get_kmer_left_covs(mer) (get_kmer_left_cov(mer, 0) + get_kmer_left_cov(mer, 1) + get_kmer_left_cov(mer, 2) + get_kmer_left_cov(mer, 3))
+/* Same layout and accessors for r_links; val is truncated to EDGE_BIT_SIZE bits on write. */
+#define get_kmer_right_cov(mer, idx) (((mer).r_links>>((idx)*EDGE_BIT_SIZE))&EDGE_XOR_MASK)
+#define set_kmer_right_cov(mer, idx, val) ((mer).r_links = ((mer).r_links&(~(EDGE_XOR_MASK<<((idx)*EDGE_BIT_SIZE)))) | (((val)&EDGE_XOR_MASK)<<((idx)*EDGE_BIT_SIZE)) )
+#define get_kmer_right_covs(mer) (get_kmer_right_cov(mer, 0) + get_kmer_right_cov(mer, 1) + get_kmer_right_cov(mer, 2) + get_kmer_right_cov(mer, 3))
+/* flags[] stores two bits per entry, 16 entries per array element (idx>>4 picks the
+ * element, (idx&0x0f)<<1 the bit offset): bit 0 = "null", bit 1 = "del"; "exists" = neither set. */
+#define is_kmer_entity_null(flags, idx) ((flags)[(idx)>>4]>>(((idx)&0x0f)<<1)&0x01)
+#define is_kmer_entity_del(flags, idx) ((flags)[(idx)>>4]>>(((idx)&0x0f)<<1)&0x02)
+#define set_kmer_entity_null(flags, idx) ((flags)[(idx)>>4] |= (0x01u<<(((idx)&0x0f)<<1)))
+#define set_kmer_entity_del(flags, idx) ((flags)[(idx)>>4] |= (0x02u<<(((idx)&0x0f)<<1)))
+#define clear_kmer_entity_null(flags, idx) ((flags)[(idx)>>4] &= ~(0x01u<<(((idx)&0x0f)<<1)))
+#define clear_kmer_entity_del(flags, idx) ((flags)[(idx)>>4] &= ~(0x02u<<(((idx)&0x0f)<<1)))
+#define exists_kmer_entity(flags, idx) (!((flags)[(idx)>>4]>>(((idx)&0x0f)<<1)&0x03))
+
+
+typedef struct kmer_st // one k-mer node stored in a KmerSet
+{
+ Kmer seq; // the k-mer word itself (accessed via get_kmer_seq/set_kmer_seq)
+ ubyte4 l_links; // serves as edgeID since make_edge; before that, four 6-bit counters (get_kmer_left_cov)
+ ubyte4 r_links:4*EDGE_BIT_SIZE; // four 6-bit counters (get_kmer_right_cov); 24 bits, the flags below fill the other 8
+ ubyte4 linear:1; // kmerSet_mark() sets this when exactly one left and one right counter are non-zero
+ ubyte4 deleted:1; // localAsm.c reuses this as a scratch "already seen on this route" mark
+ ubyte4 checked:1; // set by maskRepeatNode() in localAsm.c
+ ubyte4 single:1;
+ ubyte4 twin:2; // searchFgap() stores 2 for a dead-end start; traceAlongDBgraph() stops on values > 1
+ ubyte4 inEdge:2; // singleKmer() merges its flag argument here: 0 takes the flag, 1 and 2 combine to 3
+} kmer_t;
+
+typedef struct kmerSet_st // hash set of kmer_t nodes
+{
+ kmer_t *array; // slot storage; kmerSet_mark() visits array[0..size-1]
+ ubyte4 *flags; // two state bits per slot, read through the *_kmer_entity_* macros
+ ubyte8 size; // number of slots (upper bound of the iter_ptr walk)
+ ubyte8 count; // presumably the number of occupied slots -- TODO confirm in the hash implementation
+ ubyte8 max; // presumably the growth threshold derived from load_factor -- TODO confirm
+ double load_factor;
+ ubyte8 iter_ptr; // iteration cursor; kmerSet_mark() resets it to 0 and advances it to size
+
+ ubyte8 searchCnt; // the counters below look like lookup statistics; their updates are not visible here
+ ubyte8 foundCnt;
+ ubyte8 delCnt;
+ ubyte8 searchSpcSeedCnt;
+ ubyte8 getSpcSeedCnt;
+ ubyte8 levelGet[3];
+
+} KmerSet;
+
+typedef struct kmer_pt // list element pairing a hash node with a k-mer word
+{
+ kmer_t *node; // node in a KmerSet
+ Kmer kmer; // k-mer word kept alongside the node
+ boolean isSmaller; // NOTE(review): elsewhere "isSmaller" means word < its reverse complement -- confirm it is the same here
+ struct kmer_pt *next; // next element of the singly linked list
+}KMER_PT;
+
+//////////////////////////////////////////////////////////////// spaced seed
+
+typedef struct spaced_base // list element hanging off a spcKmer (see spaced_kmer.start)
+{
+ ubyte2 spaced_bases:14; // 14-bit value; put_spckmerset() takes a ubyte2 of the same name
+ //ubyte2 repeat:1;
+ //ubyte4 edgeID;
+ kmer_t *large_kmer; // node this entry points to; put_spckmerset() takes a kmer_t* -- presumably this one, TODO confirm
+ struct spaced_base *next; // next element of the singly linked list
+}spcBase;
+
+typedef struct spaced_kmer // entry of a spcKmerSet
+{
+ Kmer seq; // key word of this entry
+ struct spaced_base *start; // head of the spcBase list attached to this entry
+ ubyte4 spaced_base_num; // presumably the length of that list -- TODO confirm where it is updated
+}spcKmer;
+
+typedef struct spaced_kmer_set // same field layout as the first six members of KmerSet, over spcKmer entries
+{
+ spcKmer *array; // slot storage
+ ubyte4 *flags; // per-slot state bits; presumably read with the *_kmer_entity_* macros -- TODO confirm
+ ubyte8 size; // presumably the number of slots, as in KmerSet -- TODO confirm
+ ubyte8 count;
+ ubyte8 max;
+ double load_factor; // note: init_spckmerset() takes a float, which is widened when stored here
+} spcKmerSet;
+
+extern spcKmerSet* init_spckmerset(ubyte8 init_size, float load_factor);
+extern void buildSpcKmerSet(KmerSet *set, spcKmerSet *spaced_kset);
+extern int search_spckmerset(spcKmerSet *set, ubyte8 seq, spcKmer **rs);
+extern int put_spckmerset(spcKmerSet *set, Kmer spc_kmer, ubyte2 spaced_bases, kmer_t *node);
+
+//////////////////////////////////////////////////////////////// spaced seed END
+
+extern KmerSet* init_kmerset(ubyte8 init_size, float load_factor);
+extern int search_kmerset(KmerSet *set, ubyte8 seq, kmer_t **rs);
+extern int put_kmerset(KmerSet *set, ubyte8 seq, ubyte left, ubyte right,kmer_t **kmer_p);
+extern byte8 count_kmerset(KmerSet *set);
+extern void free_Sets(KmerSet **KmerSets,int num);
+extern void free_kmerset(KmerSet *set);
+extern void dislink2nextUncertain(kmer_t *node,char ch,boolean smaller);
+extern void dislink2prevUncertain(kmer_t *node,char ch,boolean smaller);
+
+extern int count_branch2prev(kmer_t *node);
+extern int count_branch2next(kmer_t *node);
+extern char firstCharInKmer(Kmer kmer);
+
+#endif
diff --git a/fusion/inc/nuc.h b/fusion/inc/nuc.h
new file mode 100755
index 0000000..fdfe10b
--- /dev/null
+++ b/fusion/inc/nuc.h
@@ -0,0 +1,13 @@
+/***************************************************************************
+ * Title: nuc.h
+ * Author: Haixu Tang
+ * Created: Jun. 2002
+ * Last modified: May. 2004
+ *
+ * Copyright (c) 2001-2004 The Regents of the University of California
+ * All Rights Reserved
+ * See file LICENSE for details.
+ ***************************************************************************/
+int total_nuc = 16; /* matches the 16 initializers listed for na_name below */
+char na_name[17] = {'g', 'a', 't', 'c', /* 17 slots, 16 initialized: the last element is zero-filled */
+ 'n', 'r', 'y', 'w', 's', 'm', 'k', 'h', 'b', 'v', 'd', 'x'};
diff --git a/fusion/inc/stack.h b/fusion/inc/stack.h
new file mode 100755
index 0000000..c09ed5f
--- /dev/null
+++ b/fusion/inc/stack.h
@@ -0,0 +1,35 @@
+#ifndef __STACK__
+#define __STACK__
+
+#include
+#include
+#include
+
+typedef struct block_starter // link header carrying pointers to two neighbouring blocks
+{
+ struct block_starter *prev; // previous block in the list
+ struct block_starter *next; // next block in the list
+}BLOCK_STARTER;
+
+typedef struct stack // stack state; field meanings below are inferred from names -- TODO confirm in stack.c
+{
+ BLOCK_STARTER *block_list; // a block of the BLOCK_STARTER list (presumably the current one)
+ int index_in_block; // presumably the next free item position inside that block
+ int items_per_block;
+ int item_c; // item count
+ size_t item_size; // createStack() takes a matching unit_size argument
+ BLOCK_STARTER *block_backup; // the three *_backup fields presumably hold what stackBackup() saves for stackRecover()
+ int index_backup;
+ int item_c_backup;
+}STACK;
+
+void stackBackup(STACK *astack);
+void stackRecover(STACK *astack);
+void *stackPush(STACK *astack);
+void *stackPop(STACK *astack);
+void freeStack(STACK *astack);
+void emptyStack(STACK *astack);
+STACK *createStack(int num_items,size_t unit_size);
+
+
+#endif
diff --git a/fusion/inc/stdinc.h b/fusion/inc/stdinc.h
new file mode 100755
index 0000000..9700d5d
--- /dev/null
+++ b/fusion/inc/stdinc.h
@@ -0,0 +1,40 @@
+/***************************************************************************
+
+ * Title: stdinc.h
+
+ * Author: Haixu Tang
+
+ * Created: Jun. 2002
+
+ * Last modified: May. 2004
+
+ *
+
+ * Copyright (c) 2001-2004 The Regents of the University of California
+
+ * All Rights Reserved
+
+ * See file LICENSE for details.
+
+ ***************************************************************************/
+
+#include
+
+#include
+
+#include
+
+#include
+
+#include
+
+#include
+
+#include
+
+#include
+
+#include "def.h"
+
+#include "general.h"
+
diff --git a/fusion/inc/types.h b/fusion/inc/types.h
new file mode 100755
index 0000000..fdeb4f6
--- /dev/null
+++ b/fusion/inc/types.h
@@ -0,0 +1,14 @@
+#ifndef __TYPES_RJ
+#define __TYPES_RJ
+
+typedef unsigned long long ubyte8; // unsigned aliases; the digit is the intended byte width
+typedef unsigned int ubyte4; // assumes a 4-byte int -- not guaranteed by the C standard
+typedef unsigned short ubyte2;
+typedef unsigned char ubyte;
+
+typedef long long byte8; // signed counterparts of the aliases above
+typedef int byte4;
+typedef short byte2;
+typedef char byte; // note: signedness of plain char is implementation-defined
+
+#endif
diff --git a/fusion/kmer.c b/fusion/kmer.c
new file mode 100755
index 0000000..37c7da1
--- /dev/null
+++ b/fusion/kmer.c
@@ -0,0 +1,135 @@
+#include "stdinc.h"
+#include "newhash.h"
+#include "extfunc.h"
+#include "extvab.h"
+
+/* filter_array[i] holds the single-bit mask 1<<i for bit positions 0..7. */
+static unsigned char filter_array[8] = {0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80};
+
+/* Set the bit for `ch` in node->links; when that bit is already set there,
+ * set the same bit in node->linksB instead. */
+void link2next(NODE *node,char ch)
+{
+    unsigned char bit = filter_array[(int)ch];
+    if(node->links & bit) node->linksB |= bit;
+    else node->links |= bit;
+}
+
+/* Non-zero iff the bit for `ch` is set in node->linksB. */
+unsigned char check_linkB2next(NODE *node,char ch){
+    return node->linksB & filter_array[(int)ch];
+}
+
+/* Non-zero iff the bit for `ch` is set in node->links. */
+unsigned char check_link2next(NODE *node,char ch){
+    return node->links & filter_array[(int)ch];
+}
+
+/* Clear the bit for `ch` in node->links; node->linksB is not touched. */
+void unlink2next(NODE *node,char ch){
+    node->links &= ~filter_array[(int)ch];
+}
+
+
+/* Set bit ch+4 in node->links, or in node->linksB when links already has it. */
+void link2prev(NODE *node,char ch)
+{
+    unsigned char bit = filter_array[ch+4];
+    if(node->links & bit) node->linksB |= bit;
+    else node->links |= bit;
+}
+
+/* Non-zero iff bit ch+4 is set in node->linksB. */
+unsigned char check_linkB2prev(NODE *node,char ch){
+    return node->linksB & filter_array[ch+4];
+}
+
+/* Non-zero iff bit ch+4 is set in node->links. */
+unsigned char check_link2prev(NODE *node,char ch){
+    return node->links & filter_array[ch+4];
+}
+
+/* Clear bit ch+4 in node->links; node->linksB is not touched. */
+void unlink2prev(NODE *node,char ch){
+    node->links &= ~filter_array[ch+4];
+}
+
+/* Count the set bits among bits 0..3 of node->links. */
+int count_link2next(NODE *node)
+{
+    unsigned char bits = node->links;
+    int total = 0;
+    int pos;
+
+    for(pos=0;pos<4;pos++)
+        total += (bits >> pos) & 0x01;
+    return total;
+}
+
+/* Count the set bits among bits 0..3 of node->linksB. */
+int count_link2nextB(NODE *node)
+{
+    unsigned char bits = node->linksB;
+    int total = 0;
+    int pos;
+
+    for(pos=0;pos<4;pos++)
+        total += (bits >> pos) & 0x01;
+    return total;
+}
+
+/* Count the set bits among bits 4..7 of node->linksB. */
+int count_link2prevB(NODE *node)
+{
+    unsigned char bits = node->linksB;
+    int total = 0;
+    int pos;
+
+    /* only the upper nibble of the byte is inspected */
+    for(pos=4;pos<8;pos++)
+        total += (bits >> pos) & 0x01;
+    return total;
+}
+
+/* Count the set bits among bits 4..7 of node->links. */
+int count_link2prev(NODE *node)
+{
+    unsigned char bits = node->links;
+    int total = 0;
+    int pos;
+
+    /* only the upper nibble of the byte is inspected */
+    for(pos=4;pos<8;pos++)
+        total += (bits >> pos) & 0x01;
+    return total;
+}
+
+/* Shift `prev` left by two bits and add `ch`. Unlike nextKmer(), the
+ * WORDFILTER mask is not applied to the shifted value. */
+Kmer KmerPlus(Kmer prev,char ch)
+{
+    Kmer shifted = prev << 2;
+    return shifted + ch;
+}
+/* Shift `prev` left by two bits, mask the result with the global
+ * WORDFILTER, then add `ch`. */
+Kmer nextKmer(Kmer prev,char ch)
+{
+    Kmer masked = (prev << 2) & WORDFILTER;
+
+    return masked + ch;
+}
+
+/* Shift `next` right by two bits and add `ch` shifted left by
+ * 2*(overlaplen-1) bits. */
+Kmer prevKmer(Kmer next,char ch)
+{
+    Kmer lead = ((Kmer)ch) << 2*(overlaplen-1);
+    return (next >> 2) + lead;
+}
+
+/* kmer shifted right by 2*(overlaplen-1) bits, truncated to char; no "& 3" mask is applied. */
+char firstCharInKmer(Kmer kmer){
+    return (char)(kmer >> 2*(overlaplen-1));
+}
+
diff --git a/fusion/lib.c b/fusion/lib.c
new file mode 100755
index 0000000..ee40348
--- /dev/null
+++ b/fusion/lib.c
@@ -0,0 +1,329 @@
+#include "stdinc.h"
+#include "newhash.h"
+#include "extfunc.h"
+#include "extvab.h"
+
+static char tabs[2][1024];
+
+static boolean splitColumn(char *line)
+{
+ int len = strlen(line);
+ int i=0,j;
+ int tabs_n = 0;
+
+ while(i=32&&line[i]<=126&&line[i]!='='){
+ j=0;
+ while(i=32&&line[i]<=126&&line[i]!='='){
+ tabs[tabs_n][j++] = line[i];
+ i++;
+ }
+ tabs[tabs_n][j] = '\0';
+ tabs_n++;
+ if(tabs_n==2)
+ return 1;
+ }
+ i++;
+ }
+ if(tabs_n==2)
+ return 1;
+ else
+ return 0;
+}
+
+/* qsort-style comparator over LIB_INFO records, keyed on avg_ins:
+ * yields 1 when a's key is greater, 0 when the keys are equal, else -1. */
+static int cmp_lib(const void *a,const void *b)
+{
+    const LIB_INFO *lhs = (const LIB_INFO *)a;
+    const LIB_INFO *rhs = (const LIB_INFO *)b;
+
+    if(lhs->avg_ins>rhs->avg_ins)
+        return 1;
+    if(lhs->avg_ins==rhs->avg_ins)
+        return 0;
+    return -1;
+}
+
+void scan_libInfo(char *libfile)
+{
+ FILE *fp;
+ char line[1024],ch;
+ int i,j,index;
+ int libCounter;
+ boolean flag;
+
+ fp = ckopen(libfile,"r");
+ num_libs = 0;
+ while(fgets(line,1024,fp)){
+ ch = line[5];
+ line[5] = '\0';
+ if(strcmp(line,"[LIB]")==0)
+ num_libs++;
+ if(!num_libs){
+ line[5] = ch;
+ flag = splitColumn(line);
+ if(!flag)
+ continue;
+ if(strcmp(tabs[0],"max_rd_len")==0)
+ maxReadLen = atoi(tabs[1]);
+ }
+ }
+//count file numbers of each type
+ lib_array = (LIB_INFO *)ckalloc(num_libs*sizeof(LIB_INFO));
+ for(i=0;i0 ? maxLong:maxReadLen;
+}
+
+void free_libs()
+{
+
+ if(!lib_array)
+ return;
+
+ int i,j;
+ for(i=0;i*B)
+ return 1;
+ else if(*A==*B)
+ return 0;
+ else
+ return -1;
+}
+/* Binary-search `target` in the ascending len_array[1..num]. On a hit, return
+ * the current value of flag_array at that slot and post-increment it; -1 if absent. */
+int uniqueLenSearch(unsigned int *len_array,unsigned int *flag_array,int num,unsigned int target)
+{
+    int lo = 1, hi = num, probe;
+
+    while(lo<=hi){
+        probe = (lo+hi)/2;
+        if(len_array[probe]<target){
+            lo = probe+1;
+            continue;
+        }
+        if(len_array[probe]>target){
+            hi = probe-1;
+            continue;
+        }
+        /* exact hit: hand out the stored counter, then bump it */
+        return flag_array[probe]++;
+    }
+    return -1;
+}
+
+/* Claim the next unused slot whose length equals `target` in the ascending,
+ * 1-based len_array[1..num]; flag_array[] marks slots already handed out.
+ * Returns the claimed index, or -1 if target is absent or every match is taken. */
+int lengthSearch(unsigned int *len_array,unsigned int *flag_array,int num,unsigned int target)
+{
+    int mid = 0, low = 1, high = num, i;
+    while(low<=high){
+        mid = (low+high)/2;
+        if(len_array[mid]==target)
+            break;
+        else if(target>len_array[mid])
+            low = mid+1;
+        else
+            high = mid-1;
+    }
+    if(low>high)
+        return -1;
+    if(!flag_array[mid]){
+        /* walk down to the lowest unflagged slot contiguous with the hit */
+        for(i=mid-1;i>0;i--){
+            if(len_array[i]!=target||flag_array[i])
+                break;
+        }
+        i++;
+    }else{
+        /* walk up past flagged slots, but only inside the equal-length run */
+        for(i=mid+1;i<=num&&len_array[i]==target&&flag_array[i];i++)
+            ;
+        /* run exhausted: the old code flagged whatever came next, even
+         * a slot of another length or one past the end of the array */
+        if(i>num||len_array[i]!=target)
+            return -1;
+    }
+    flag_array[i] = 1;
+    return i;
+}
+
+void quick_sort_int(unsigned int *length_array, int low, int high)
+{
+ int i, j;
+ Kmer pivot;
+ if (low < high)
+ {
+ pivot=length_array[low];
+ i=low;
+ j=high;
+
+ while(i=pivot)
+ j--;
+ if(i'){
+ sscanf(line+7,"%d",&length);
+ index_array[++index] = length;
+ length_array[++i] = length;
+ }
+ }
+ num_ctg = index;
+ orig2new = 1;
+ //quick_sort_int(length_array,1,num_ctg);
+ qsort(&(length_array[1]),num_ctg,sizeof(length_array[0]),cmp_int);
+ //extract unique length
+ diff_len = 0;
+ for(i=1;i<=num_ctg;i++){
+ for(j=i+1;j<=num_ctg;j++)
+ if(length_array[j]!=length_array[i])
+ break;
+ length_array[++diff_len] = length_array[i];
+ flag_array[diff_len] = i;
+ i = j - 1;
+ }
+ /*
+ for(i=1;i<=num_ctg;i++)
+ flag_array[i] = 0;
+ */
+ contig_array = (CONTIG *)ckalloc((num_ctg+1)*sizeof(CONTIG));
+
+ //load edges
+ index = 0;
+ rewind(fp);
+ while(fgets(line,sizeof(line),fp)!=NULL){
+ if(line[0]=='>'){
+// if(overlaplen<=31)
+// sscanf(line,">length %u,%llx,%llx,%d,%d",&length,&from_kmer,&to_kmer,&bal_ed,&cvg);
+// else
+ sscanf(line,">length %u,%d,%d",&length,&bal_ed,&cvg);
+ newIndex = uniqueLenSearch(length_array,flag_array,diff_len,length);
+ index_array[++index]=newIndex;
+
+ contig_array[newIndex].length = length;
+ contig_array[newIndex].bal_edge = bal_ed + 1;
+ contig_array[newIndex].downwardConnect = NULL;
+ contig_array[newIndex].mask = 0;
+ contig_array[newIndex].flag = 0;
+ contig_array[newIndex].arcs = NULL;
+ contig_array[newIndex].seq = NULL;
+ contig_array[newIndex].multi = 0;
+ contig_array[newIndex].inSubGraph = 0;
+ contig_array[newIndex].cvg = cvg/10;
+ if(cvg){
+ counter += length;
+ cvgSum += cvg*length;
+ }
+ fprintf(out_fp,"%d %d %d\n",index,newIndex,contig_array[newIndex].bal_edge);
+ }
+ }
+ if(counter)
+ //cvgAvg = cvgSum/counter > 2 ? cvgSum/counter : 3;
+ cvgAvg = cvgSum/counter/10 > 2 ? cvgSum/counter/10 : 3;
+
+ //mark repeats
+ int bal_i;
+ /*if(maskRep){
+ counter = 0;
+ for(i=1;i<=num_ctg;i++){
+ bal_i = getTwinCtg(i);
+ if((contig_array[i].cvg+contig_array[bal_i].cvg)>4*cvgAvg){
+ contig_array[i].mask = 1;
+ contig_array[bal_i].mask = 1;
+ counter += 2;
+ }
+ if(isSmallerThanTwin(i))
+ i++;
+ }
+ printf("average contig coverage : %d. Number of contig(s) masked because of high coverage: %llx\n",
+ cvgAvg,counter);
+ }*/
+
+ counter = 0;
+ for(i=1;i<=num_ctg;i++){
+ if(contig_array[i].mask)
+ continue;
+ bal_i = getTwinCtg(i);
+ if(contig_array[i].lengthmultiplicity = weight;
+ parc->next = contig_array[from_c].arcs;
+ contig_array[from_c].arcs = parc;
+}*/
+
+/*void loadArcs(char *graphfile)
+{
+ FILE *fp;
+ char name[256],line[1024];
+ unsigned int target,weight;
+ unsigned int from_ed;
+ char *seg;
+
+ sprintf(name,"%s.Arc",graphfile);
+ fp = ckopen(name,"r");
+
+ createPreArcMemManager();
+ arcCounter = 0;
+ while(fgets(line,sizeof(line),fp)!=NULL){
+ seg = strtok(line," ");
+ from_ed = atoi(seg);
+ //printf("%d\n",from_ed);
+ while((seg=strtok(NULL," "))!=NULL){
+ target = atoi(seg);
+ seg = strtok(NULL," ");
+ weight = atoi(seg);
+ add1Arc(from_ed,target,weight);
+
+ }
+ }
+ printf("%lld arcs loaded\n",arcCounter);
+ fclose(fp);
+}*/
+
+void loadContig(char *graphfile)
+{
+ //fprintf(stderr,"[%s]entering this function\n",__FUNCTION__);
+ char c,name[256],line[1024],*tightSeq=NULL;
+ FILE *fp;
+ int n=0,length,index=-1,edgeno;
+ unsigned int i;
+ unsigned int newIndex;
+
+ sprintf(name,"%s.contig",graphfile);
+ fp = ckopen(name,"r");
+
+ while(fgets(line,sizeof(line),fp)!=NULL){
+ if(line[0]=='>'){
+ if(index>=0){
+ newIndex = index_array[edgeno];
+ contig_array[newIndex].seq = tightSeq;
+ }
+ n=0;
+ index++;
+ sscanf(line+1,"%d %s %d",&edgeno,name,&length);
+ //printf("contig %d, length %d\n",edgeno,length);
+ tightSeq = (char *)ckalloc((length/4+1)*sizeof(char));
+ //fprintf(stderr,"[%s]loaded %d.\n",__FUNCTION__,edgeno);
+ }else{
+ int tmp_len=strlen(line);
+ for(i=0;i='a' && line[i]<='z'){
+ c = base2int(line[i]-'a'+'A');
+ writeChar2tightString(c,tightSeq,n++);
+ }
+ else if(line[i]>='A' && line[i]<='Z'){
+ c = base2int(line[i]);
+ writeChar2tightString(c,tightSeq,n++);
+ }
+ }
+ }
+
+ }
+ if(index>=0){
+ newIndex = index_array[edgeno];
+ contig_array[newIndex].seq = tightSeq;
+ }
+ printf("[%s]input %d contigs\n",__FUNCTION__,index+1);
+ fclose(fp);
+
+ //printf("the %dth contig with index 107\n",index);
+}
+/* Free each seq buffer and closeReads stack in contig_array[1..num_ctg], then the array; no-op when NULL. */
+void freeContig_array()
+{
+    unsigned int idx;
+    CONTIG *ctg;
+    if(contig_array==NULL)
+        return;
+    for(idx=1;idx<=num_ctg;idx++){
+        ctg = &contig_array[idx];
+        if(ctg->seq)
+            free((void *)ctg->seq);
+        if(ctg->closeReads)
+            freeStack(ctg->closeReads);
+    }
+    free((void *)contig_array);
+    contig_array = NULL;
+}
+/*
+void loadCvg(char *graphfile)
+{
+ char name[256],line[1024];
+ FILE *fp;
+ int cvg;
+ unsigned int newIndex,edgeno,bal_ctg;
+
+ sprintf(name,"%s.contigCVG",graphfile);
+ fp = fopen(name,"r");
+ if(!fp){
+ printf("contig coverage file %s is not found!\n",name);
+ return;
+ }
+
+ while(fgets(line,sizeof(line),fp)!=NULL){
+ if(line[0]=='>'){
+ sscanf(line+1,"%d %d",&edgeno,&cvg);
+ newIndex = index_array[edgeno];
+ cvg = cvg <= 255 ? cvg:255;
+ contig_array[newIndex].multi = cvg;
+ bal_ctg = getTwinCtg(newIndex);
+ contig_array[bal_ctg].multi= cvg;
+ }
+ }
+ fclose(fp);
+}
+*/
diff --git a/fusion/localAsm.c b/fusion/localAsm.c
new file mode 100755
index 0000000..becf882
--- /dev/null
+++ b/fusion/localAsm.c
@@ -0,0 +1,1629 @@
+#include "stdinc.h"
+#include "newhash.h"
+#include "extfunc.h"
+#include "extvab.h"
+
+#define CTGendLen 35 // shouldn't larger than max_read_len
+#define UPlimit 5000
+#define MaxRouteNum 10
+
+static Kmer pubKmer = 0x1b4d65165b;
+
+static void kmerSet_mark(KmerSet *set);
+static void trace4Repeat(Kmer currW,int steps,int min,int max,int *num_route,
+ KmerSet *kset,Kmer kmerDest,int overlap,Kmer WORDF,
+ int *traceCounter,int maxRoute,kmer_t **soFarNode,short *multiOccu1,short *multiOccu2,
+ int *routeLens,char **foundRoutes,char *soFarSeq,
+ long long *soFarLinks,double *avgLinks);
+
+/* Shift `next` right by two bits and add `ch` shifted left by 2*(overlap-1) bits. */
+static Kmer prevKmerLocal(Kmer next,char ch,int overlap)
+{
+    Kmer shifted = next >> 2;
+    Kmer lead = ((Kmer)ch) << 2*(overlap-1);
+    return shifted + lead;
+}
+/* Shift `prev` left by two bits, mask with WordFilter, then add `ch`. */
+static Kmer nextKmerLocal(Kmer prev,char ch,Kmer WordFilter)
+{
+    Kmer masked = (prev << 2) & WordFilter;
+
+    /* ch is added only after the mask has been applied */
+    return masked + ch;
+}
+/* Insert kmerBuffer[t] with its prev/next chars into kset, then merge `flag`
+ * into the node's inEdge tag: an equal tag is kept, 0 takes `flag`, and
+ * 1 meeting 2 (either way round) becomes 3. */
+static void singleKmer(int t, KmerSet *kset,int flag,Kmer *kmerBuffer,char *prevcBuffer,char *nextcBuffer)
+{
+    kmer_t *entry;
+
+    put_kmerset(kset, kmerBuffer[t], prevcBuffer[t],nextcBuffer[t],&entry);
+    if(entry->inEdge==flag)
+        return;
+    if(entry->inEdge==0)
+        entry->inEdge = flag;
+    else if((entry->inEdge==1&&flag==2)||(entry->inEdge==2&&flag==1))
+        entry->inEdge = 3;
+}
+
+static void putKmer2DBgraph(KmerSet *kset,int flag,int kmer_c,Kmer *kmerBuffer,char *prevcBuffer,char *nextcBuffer)
+{
+ int t;
+ for(t=0;t0)
+ prevcBuffer[index] = bal_seq[bal_j-1];
+ else
+ prevcBuffer[index] = InvalidCh;
+ nextcBuffer[index++] = bal_seq[bal_j+overlap];
+ //printf("%dth: %p with %p\n",kmer_c-1,bal_word,hashBanBuffer[kmer_c-1]);
+ }
+ }
+ *kmer_c = index;
+}
+
+static void headTightStr(char *tightStr,int length,int start,int headLen,int revS,char *src_seq)
+{
+ int i,index=0;
+
+ if(!revS){
+ for(i=start;i=length-headLen-start;i--)
+ src_seq[index++] = int_comp(getCharInTightString(tightStr,i));
+ }
+}
+
+/* Extract up to `len` characters from one end of a scaffold contig into
+ * src_seq via headTightStr(). The working length is the contig length plus
+ * originOverlap and `len` is clamped to it; the start offset is 0 when
+ * fromHead is set, otherwise length-len. A contig without its own seq uses
+ * its twin's with revS=1. Returns the clamped len, or 0 if length < 1. */
+static int getSeqFromCtg(CTGinSCAF *ctg,boolean fromHead,unsigned int len,int originOverlap,char *src_seq)
+{
+    unsigned int self = ctg->ctgID;
+    unsigned int twin = getTwinCtg(self);
+    unsigned int total, begin;
+
+    if(contig_array[self].length<1)
+        return 0;
+    total = contig_array[self].length + originOverlap;
+    if(len>=total)
+        len = total;
+    begin = fromHead ? 0 : total-len;
+    if(contig_array[self].seq)
+        headTightStr(contig_array[self].seq,total,begin,len,0,src_seq);
+    else
+        headTightStr(contig_array[twin].seq,total,begin,len,1,src_seq);
+    return len;
+}
+
+
+static KmerSet *readsInGap2DBgraph(READNEARBY *rdArray, int num, CTGinSCAF *ctg1,CTGinSCAF *ctg2,int originOverlap,
+ Kmer *kmerCtg1,Kmer *kmerCtg2,int overlap,Kmer WordFilter)
+{
+ int kmer_c;
+ Kmer *kmerBuffer;
+ char *nextcBuffer,*prevcBuffer;
+ int i;
+ int buffer_size=maxReadLen > CTGendLen ? maxReadLen:CTGendLen;
+ KmerSet *kmerS=NULL;
+ int lenCtg1;
+ int lenCtg2;
+ char *bal_seq;
+ char *src_seq;
+
+ src_seq = (char *)ckalloc(buffer_size*sizeof(char));
+ bal_seq = (char *)ckalloc(buffer_size*sizeof(char));
+
+ kmerBuffer = (Kmer *)ckalloc(buffer_size*sizeof(Kmer));
+ prevcBuffer = (char *)ckalloc(buffer_size*sizeof(char));
+ nextcBuffer = (char *)ckalloc(buffer_size*sizeof(char));
+
+ kmerS = init_kmerset(1024,0.77f);
+
+ for(i=0;ictgID==3733&&ctg2->ctgID==3067){
+ for(i=0;i=0;i--){
+ ch = kmer&3;
+ kmer >>= 2;
+ kmerSeq[i] = ch;
+ }
+ for(i=0;iiter_ptr = 0;
+ while(set->iter_ptr < set->size){
+ if(!is_kmer_entity_null(set->flags, set->iter_ptr)){
+ in_num = out_num = 0;
+ rs = set->array + set->iter_ptr;
+ word = rs->seq;
+ for(i=0;i<4;i++){
+ cvgSingle = get_kmer_left_cov(*rs,i);
+ if(cvgSingle>0){
+ in_num++;
+ }
+ cvgSingle = get_kmer_right_cov(*rs,i);
+ if(cvgSingle>0){
+ out_num++;
+ }
+ }
+
+ if(rs->single){
+ counter++;
+ }
+ if(in_num==1&&out_num==1){
+ rs->linear = 1;
+ linear++;
+ }
+ }
+ set->iter_ptr ++;
+ }
+ //printf("Allocated %ld node, %ld single nodes, %ld linear\n",(long)count_kmerset(set),counter,linear);
+}
+
+static kmer_t *searchNode(Kmer word,KmerSet *kset,int overlap)
+{
+ Kmer bal_word = reverseComplement(word,overlap);
+ kmer_t *node;
+ boolean found;
+ if(wordUPlimit){
+ /*
+ if(overlap==19&&kmerDest[0]==pubKmer)
+ printf("UPlimit\n");
+ */
+ return;
+ }
+ if(steps>max||*num_route>=maxRoute){
+ /*
+ if(overlap==19&&kmerDest[0]==pubKmer)
+ printf("max steps/maxRoute\n");
+ */
+ return;
+ }
+ Kmer word = reverseComplement(currW,overlap);
+ boolean isSmaller = currW < word;
+ int i;
+ char ch;
+ unsigned char links;
+ if(isSmaller)
+ word = currW;
+
+ kmer_t *node;
+ boolean found = search_kmerset(kset,word,&node);
+ if(!found){
+ printf("Trace: can't find kmer %llx (rc %llx, input %llx) at step %d\n",word,
+ reverseComplement(word,overlap),currW,steps);
+ return;
+ }
+
+ if(node->twin>1)
+ return;
+ if(soFarNode)
+ soFarNode[steps] = node;
+
+ if(steps>0)
+ soFarSeq[steps-1] = currW&0x03;
+
+ int index,end;
+ int linkCounter = *soFarLinks;
+ if(steps>=min&&node->inEdge>1&&(end=searchKmerOnCtg(currW,kmerDest,num))>=0){
+ index = *num_route;
+ if(steps>0)
+ avgLinks[index] = (double)linkCounter/steps;
+ else
+ avgLinks[index] = 0;
+ //find node that appears more than once in the path
+ multiOccu[index] = 0;
+ for(i=0;ideleted = 0;
+ for(i=0;ideleted){
+ multiOccu[index] = 1;
+ break;
+ }
+ soFarNode[i]->deleted = 1;
+ }
+
+ routeEndOnCtg2[index] = end;
+ routeLens[index] = steps;
+ char *array = foundRoutes[index];
+ for(i=0;i0;i--){
+ ch = nPick1(array,i);
+ links = get_kmer_right_cov(*node,ch);
+ if(!links)
+ continue;
+ *soFarLinks = linkCounter + links;
+ word = nextKmerLocal(currW,ch,WORDF);
+ traceAlongDBgraph(word,steps,min,max,num_route,
+ kset,kmerDest,num,overlap,WORDF,
+ foundRoutes,routeEndOnCtg2,routeLens,soFarSeq,
+ traceCounter,maxRoute,soFarNode,multiOccu,
+ soFarLinks,avgLinks);
+ }
+ }else{
+ int array[] = {0,1,2,3};
+ for(i=4;i>0;i--){
+ ch = nPick1(array,i);
+ links = get_kmer_left_cov(*node,ch);
+ if(!links)
+ continue;
+ *soFarLinks = linkCounter + links;
+ word = nextKmerLocal(currW,int_comp(ch),WORDF);
+ traceAlongDBgraph(word,steps,min,max,num_route,
+ kset,kmerDest,num,overlap,WORDF,
+ foundRoutes,routeEndOnCtg2,routeLens,soFarSeq,
+ traceCounter,maxRoute,soFarNode,multiOccu,
+ soFarLinks,avgLinks);
+ }
+ }
+}
+
+static int searchFgap(KmerSet *kset,CTGinSCAF *ctg1,CTGinSCAF *ctg2,Kmer *kmerCtg1,
+ Kmer *kmerCtg2,unsigned int origOverlap,int overlap,DARRAY *gapSeqArray,
+ int len1,int len2,Kmer WordFilter,int *offset1,int *offset2,char *seqGap,int *cut1,int *cut2)
+{
+
+ int i;
+ int ret = 0;
+ kmer_t *node,**soFarNode;
+ int num_route;
+ int gapLen = ctg2->start - ctg1->end - origOverlap + overlap;
+ int min = gapLen-GLDiff>0 ? gapLen-GLDiff:0; //0531
+ int max = gapLen + GLDiff < 10 ? 10 : gapLen + GLDiff;
+ char **foundRoutes;
+ char *soFarSeq;
+ int traceCounter;
+ int *routeEndOnCtg2;
+ int *routeLens;
+ boolean *multiOccu;
+ long long soFarLinks;
+ double *avgLinks;
+
+ //mask linear internal linear kmer on contig1 end
+ routeEndOnCtg2 = (int *)ckalloc(MaxRouteNum*sizeof(int));
+ routeLens = (int *)ckalloc(MaxRouteNum*sizeof(int));
+ multiOccu = (boolean *)ckalloc(MaxRouteNum*sizeof(boolean));
+ short *MULTI1 = (short *)ckalloc(MaxRouteNum*sizeof(short));
+ short *MULTI2 = (short *)ckalloc(MaxRouteNum*sizeof(short));
+ soFarSeq = (char *)ckalloc(max*sizeof(char));
+ soFarNode = (kmer_t **)ckalloc((max+1)*sizeof(kmer_t *));
+ foundRoutes = (char **)ckalloc(MaxRouteNum*sizeof(char *));;
+ avgLinks = (double *)ckalloc(MaxRouteNum*sizeof(double));;
+ for(i=0;i=0;i--){
+
+ num_route = traceCounter = soFarLinks = 0;
+ int steps=0;
+ traceAlongDBgraph(kmerCtg1[i],steps,min,max,&num_route,
+ kset,kmerCtg2,len2,overlap,WordFilter,
+ foundRoutes,routeEndOnCtg2,routeLens,soFarSeq,
+ &traceCounter,MaxRouteNum,soFarNode,multiOccu,
+ &soFarLinks,avgLinks);
+ if(num_route>0){
+ int m,minEnd=routeEndOnCtg2[0];
+ for(m=0;m1){
+ for(m=0;m3)
+ break;
+ printf("%c",int2base((int)foundRoutes[m][j]));
+ }
+ printf(": %4.2f\n",avgLinks[m]);
+ }
+ } */
+
+ num_route = traceCounter = soFarLinks = 0;
+ steps=0;
+ trace4Repeat(kmerCtg1[i],steps,min,max,&num_route,
+ kset,kmerCtg2[minEnd],overlap,WordFilter,
+ &traceCounter,MaxRouteNum,soFarNode,MULTI1,MULTI2,
+ routeLens,foundRoutes,soFarSeq,&soFarLinks,avgLinks);
+ int j,best=0;
+ int maxLen=routeLens[0];
+ double maxLink = avgLinks[0];
+ char *pt;
+ boolean repeat=0,sameLen=1;
+ int leftMost=max,rightMost=max;
+ if(num_route<1){
+ fprintf(stderr,"After trace4Repeat: non route was found\n");
+ continue;
+ }
+ if(num_route>1){
+ // if multi paths are found, we check on the repeatative occurrences and links/length
+ for(m=0;m=0&&MULTI2[m]>=0){
+ repeat = 1;
+ leftMost = leftMost>MULTI1[m] ? MULTI1[m]:leftMost;
+ rightMost = rightMost>MULTI2[m] ? MULTI2[m]:rightMost;
+ }
+ if(routeLens[m]!=maxLen)
+ sameLen = 0;
+ if(routeLens[m]maxLink){
+ maxLink = avgLinks[m];
+ best = m;
+ }
+ }
+ }
+
+ if(repeat){
+ *offset1 = *offset2 = *cut1 = *cut2 = 0;
+ int index=0;
+ char ch;
+ for(j=0;j0||*offset2>0){
+ *cut1 = len1-i-1;
+ *cut2 = minEnd;
+ //fprintf(stderr,"\n");
+ for(m=0;m3)
+ break;
+ //fprintf(stderr,"%c",int2base((int)foundRoutes[m][j]));
+ }
+ //fprintf(stderr,": %4.2f\n",avgLinks[m]);
+ }
+ /*
+ fprintf(stderr,">Gap (%d + %d) (%d + %d)\n",*offset1,*offset2,*cut1,*cut2);
+ for(index=0;index<*offset1+*offset2;index++)
+ fprintf(stderr,"%c",int2base(seqGap[index]));
+ fprintf(stderr,"\n"); */
+ }
+
+ ret = 3;
+ break;
+ }
+
+ if(overlap+(len1-i-1)+minEnd-routeLens[best]>(int)origOverlap)
+ continue;
+
+ ctg1->gapSeqOffset = gapSeqArray->item_c;
+ ctg1->gapSeqLen = routeLens[best];
+ if(!darrayPut(gapSeqArray,ctg1->gapSeqOffset+maxLen/4))
+ continue;
+ pt = (char *)darrayPut(gapSeqArray,ctg1->gapSeqOffset);
+ /*
+ printKmer(stderr,kmerCtg1[i],overlap);
+ fprintf(stderr,"-");
+ */
+ for(j=0;j3)
+ break;
+ writeChar2tightString(foundRoutes[best][j],pt,j);
+ //fprintf(stderr,"%c",int2base((int)foundRoutes[best][j]));
+ }
+ //fprintf(stderr,": GAPSEQ %d + %d, avglink %4.2f\n",len1-i-1,minEnd,avgLinks[best]);
+ ctg1->cutTail = len1-i-1;
+ ctg2->cutHead = overlap + minEnd;
+ ctg2->scaftig_start = 0;
+
+ ret = 1;
+ break;
+ /* }if(num_route>1){
+ ret = 2;
+ break; */
+ }else{ //mark node which leads to dead end
+ node = searchNode(kmerCtg1[i],kset,overlap);
+ if(node)
+ node->twin = 2;
+ }
+
+ }
+ for(i=0;iUPlimit)
+ return;
+ if(steps>max||*num_route>=maxRoute)
+ return;
+ Kmer word = reverseComplement(currW,overlap);
+ boolean isSmaller = currW < word;
+ char ch;
+ unsigned char links;
+ int index,i;
+
+ if(isSmaller)
+ word = currW;
+
+ kmer_t *node;
+ boolean found = search_kmerset(kset,word,&node);
+ if(!found){
+ printf("Trace: can't find kmer %llx (rc %llx, input %llx) at step %d\n",word,
+ reverseComplement(word,overlap),currW,steps);
+ return;
+ }
+ if(soFarNode)
+ soFarNode[steps] = node;
+ if(soFarSeq&&steps>0)
+ soFarSeq[steps-1] = currW&0x03;
+ int linkCounter;
+ if(soFarLinks)
+ linkCounter = *soFarLinks;
+ if(steps>=min&&currW==kmerDest){
+ index = *num_route;
+ if(avgLinks&&steps>0)
+ avgLinks[index] = (double)linkCounter/steps;
+ else if(avgLinks)
+ avgLinks[index] = 0;
+ //find node that appears more than once in the path
+ if(multiOccu1&&multiOccu2){
+ for(i=0;ideleted = 0;
+ int rightMost=0;
+ boolean MULTI=0;
+ for(i=0;ideleted){
+ rightMost = rightMostdeleted = 1;
+ }
+ if(!MULTI)
+ multiOccu1[index] = multiOccu2[index] = -1;
+ else{
+ multiOccu2[index] = steps-2-rightMost<0 ? 0:steps-2-rightMost; //[0 steps-2]
+ for(i=0;ideleted = 0;
+ int leftMost=steps-2;
+ for(i=steps;i>=0;i--){
+ if(soFarNode[i]->deleted)
+ leftMost = leftMost>i-1 ? i-1:leftMost;
+ soFarNode[i]->deleted = 1;
+ }
+ multiOccu1[index] = leftMost<0 ? 0:leftMost; //[0 steps-2]
+ }
+ }
+ if(routeLens)
+ routeLens[index] = steps;
+ if(soFarSeq){
+ char *array = foundRoutes[index];
+ for(i=0;i0;i--){
+ ch = nPick1(array,i);
+ links = get_kmer_right_cov(*node,ch);
+ if(!links)
+ continue;
+ if(soFarLinks)
+ *soFarLinks = linkCounter + links;
+ word = nextKmerLocal(currW,ch,WORDF);
+ trace4Repeat(word,steps,min,max,num_route,
+ kset,kmerDest,overlap,WORDF,traceCounter,maxRoute,soFarNode,
+ multiOccu1,multiOccu2,routeLens,foundRoutes,soFarSeq,
+ soFarLinks,avgLinks);
+ }
+ }else{
+ int array[] = {0,1,2,3};
+ for(i=4;i>0;i--){
+ ch = nPick1(array,i);
+ links = get_kmer_left_cov(*node,ch);
+ if(!links)
+ continue;
+ if(soFarLinks)
+ *soFarLinks = linkCounter + links;
+ word = nextKmerLocal(currW,int_comp(ch),WORDF);
+ trace4Repeat(word,steps,min,max,num_route,
+ kset,kmerDest,overlap,WORDF,traceCounter,maxRoute,soFarNode,
+ multiOccu1,multiOccu2,routeLens,foundRoutes,soFarSeq,
+ soFarLinks,avgLinks);
+ }
+ }
+}
+
+//found repeat node on contig ends
+static void maskRepeatNode(KmerSet *kset,Kmer *kmerCtg1,
+ Kmer *kmerCtg2,int overlap,
+ int len1,int len2,int max,Kmer WordFilter)
+{
+ int i;
+ int num_route,steps;
+ int min = 1,maxRoute=1;
+ int traceCounter;
+ Kmer word,bal_word;
+ kmer_t *node;
+ boolean found;
+ int counter=0;
+ for(i=0;ibal_word)
+ word=bal_word;
+ found = search_kmerset(kset,word,&node);
+ if(!found||node->linear){
+ //printf("Found no node for kmer %llx\n",word);
+ continue;
+ }
+ num_route = traceCounter = 0;
+ steps=0;
+ trace4Repeat(word,steps,min,max,&num_route,
+ kset,word,overlap,WordFilter,
+ &traceCounter,maxRoute,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL);
+ if(num_route<1)
+ continue;
+ counter++;
+ node->checked = 1;
+ }
+ for(i=0;ibal_word)
+ word=bal_word;
+ found = search_kmerset(kset,word,&node);
+ if(!found||node->linear){
+ //printf("Found no node for kmer %llx\n",word);
+ continue;
+ }
+ num_route = traceCounter = 0;
+ steps=0;
+ trace4Repeat(word,steps,min,max,&num_route,
+ kset,word,overlap,WordFilter,
+ &traceCounter,maxRoute,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL);
+ if(num_route<1)
+ continue;
+ counter++;
+ node->checked = 1;
+ }
+ //printf("MR: %d(%d)\n",counter,len1+len2);
+}
+
+/*
+static boolean chopReadFillGap(int len_seq,int overlap,char *src_seq, char *bal_seq,
+ KmerSet *kset,Kmer WORDF,int *start,int *end,boolean *bal,
+ Kmer *KmerCtg1,int len1,Kmer *KmerCtg2,int len2,int *index1,int *index2)
+{
+ int index,j=0,bal_j;
+ Kmer word,bal_word;
+ int flag=0,bal_flag=0;
+ int ctg1start,bal_ctg1start,ctg2end,bal_ctg2end;
+ int seqStart,bal_start,seqEnd,bal_end;
+ kmer_t *node;
+ boolean found;
+
+ if(len_seqlinear&&!node->checked){
+ if(!flag&&node->inEdge==1){
+ ctg1start = searchKmerOnCtg(word,KmerCtg1,len1);
+ if(ctg1start>0){
+ flag = 1;
+ seqStart = j + overlap-1;
+ }
+ }
+ if(!bal_flag&&node->inEdge==2){
+ bal_ctg2end = searchKmerOnCtg(bal_word,KmerCtg2,len2);
+ if(bal_ctg2end>0){
+ bal_flag = 2;
+ bal_end = bal_j+overlap-1;
+ }
+ }
+ }
+
+ for(j = 1; j <= len_seq - overlap; j ++) {
+ word = nextKmerLocal(word,src_seq[j-1+overlap],WORDF);
+ bal_j = len_seq-j-overlap; // j;
+ bal_word = prevKmerLocal(bal_word,bal_seq[bal_j],overlap);
+
+ if(wordlinear&&!node->checked){
+ if(!flag&&node->inEdge==1){
+ ctg1start = searchKmerOnCtg(word,KmerCtg1,len1);
+ if(ctg1start>0){
+ flag = 1;
+ seqStart = j + overlap-1;
+ }
+ }else if(flag==1&&node->inEdge==1){
+ index = searchKmerOnCtg(word,KmerCtg1,len1);
+ if(index>ctg1start){ // choose hit closer to gap
+ ctg1start = index;
+ seqStart = j + overlap-1;
+ }
+ }else if(flag==1&&node->inEdge==2){
+ ctg2end = searchKmerOnCtg(word,KmerCtg2,len2);
+ if(ctg2end>0){
+ flag = 3;
+ seqEnd = j+overlap-1;
+ break;
+ }
+ }
+
+ if(!bal_flag&&node->inEdge==2){
+ bal_ctg2end = searchKmerOnCtg(bal_word,KmerCtg2,len2);
+ if(bal_ctg2end>0){
+ bal_flag = 2;
+ bal_end = bal_j+overlap-1;
+ }
+ }else if(bal_flag==2&&node->inEdge==2){
+ index = searchKmerOnCtg(bal_word,KmerCtg2,len2);
+ if(indexinEdge==1){
+ bal_ctg1start = searchKmerOnCtg(bal_word,KmerCtg1,len1);
+ if(bal_ctg1start>0){
+ bal_flag = 3;
+ bal_start = bal_j+overlap-1;
+ break;
+ }
+ }
+ }
+ }
+ if(flag==3){
+ *start = seqStart;
+ *end = seqEnd;
+ *bal = 0;
+ *index1 = ctg1start;
+ *index2 = ctg2end;
+ return 1;
+ }else if(bal_flag==3){
+ *start = bal_start;
+ *end = bal_end;
+ *bal = 1;
+ *index1 = bal_ctg1start;
+ *index2 = bal_ctg2end;
+ return 1;
+ }
+ return 0;
+}
+
+static boolean readsCrossGap(READNEARBY *rdArray, int num, int originOverlap,DARRAY *gapSeqArray,
+ Kmer *kmerCtg1,Kmer *kmerCtg2,int overlap,int len1,int len2,
+ CTGinSCAF *ctg1,CTGinSCAF *ctg2,KmerSet *kmerS,Kmer WordFilter,int min,int max)
+{
+ int i,j,start,end,startOnCtg1,endOnCtg2;
+ char *bal_seq;
+ char *src_seq;
+ char *pt;
+ boolean bal,ret=0,FILL;
+
+ src_seq = (char *)ckalloc(maxReadLen*sizeof(char));
+ bal_seq = (char *)ckalloc(maxReadLen*sizeof(char));
+
+ for(i=0;imax)
+ continue;
+ fprintf(stderr,"Read across\n");
+ //printf("Filled: K %d, ctg1 %d ctg2 %d,start %d end %d\n",overlap,startOnCtg1,endOnCtg2,start,end);
+ if(overlap+(len1-startOnCtg1-1)+endOnCtg2-(end-start)>(int)originOverlap)
+ continue; // contig1 and contig2 could not overlap more than origOverlap bases
+
+ ctg1->gapSeqOffset = gapSeqArray->item_c;
+ ctg1->gapSeqLen = end-start;
+ if(!darrayPut(gapSeqArray,ctg1->gapSeqOffset+(end-start)/4))
+ continue;
+ pt = (char *)darrayPut(gapSeqArray,ctg1->gapSeqOffset);
+ for(j=start+1;j<=end;j++){
+ if(bal)
+ writeChar2tightString(bal_seq[j],pt,j-start-1);
+ else
+ writeChar2tightString(src_seq[j],pt,j-start-1);
+
+ }
+ ctg1->cutTail = len1-startOnCtg1-1;
+ ctg2->cutHead = overlap + endOnCtg2;
+ ctg2->scaftig_start = 0;
+
+ ret = 1;
+ break;
+ }
+
+ free((void*)src_seq);
+ free((void*)bal_seq);
+ return ret;
+}
+*/
+static void kmerSet_markTandem(KmerSet *set,Kmer WordFilter,int overlap);
+static boolean readsCrossGap(READNEARBY *rdArray, int num, int originOverlap,DARRAY *gapSeqArray,
+ Kmer *kmerCtg1,Kmer *kmerCtg2,int overlap,
+ CTGinSCAF *ctg1,CTGinSCAF *ctg2,KmerSet *kmerS,Kmer WordFilter,int min,int max,
+ int offset1,int offset2,char *seqGap,char *seqCtg1,char *seqCtg2,int cut1,int cut2);
+
+/*
+ * Try to close the gap between ctg1 and ctg2 with a local k-mer graph.
+ *
+ * Steps, as visible below:
+ *   1. readsInGap2DBgraph() builds a KmerSet from the `num` reads in rdArray.
+ *   2. searchFgap() looks for paths between the contig ends; a result of 0
+ *      makes this function return 0 and a result of 1 makes it return 1.
+ *   3. For any other result, maskRepeatNode() marks repeat nodes and
+ *      readsCrossGap() is tried; its result is returned (0 when it fails).
+ * The KmerSet is released with free_kmerset() on every return path.
+ *
+ * `overlap` is the k-mer size used for the local graph (WordFilter keeps the
+ * low 2*overlap bits); `origOverlap` is passed through to the helpers.
+ * NOTE(review): the two len1/len2 assignments lost text during extraction
+ * (between '<' and '>'); presumably they are
+ * min(CTGendLen, contig length + origOverlap) -- confirm against the
+ * original sources.
+ */
+int localGraph(READNEARBY *rdArray,int num,CTGinSCAF *ctg1,CTGinSCAF *ctg2,
+ int origOverlap,Kmer *kmerCtg1,Kmer *kmerCtg2,
+ int overlap,DARRAY *gapSeqArray,char *seqCtg1,char *seqCtg2,char *seqGap)
+{
+ /**************** put kmer in DBgraph ****************/
+ KmerSet *kmerSet;
+ Kmer WordFilter = (((Kmer) 1) << (2*overlap)) - 1;
+/*
+ if(ctg1->ctgID==56410&&ctg2->ctgID==61741)
+ printf("Extract %d reads for gap [%d %d]\n",num,ctg1->ctgID,ctg2->ctgID);
+*/
+ kmerSet = readsInGap2DBgraph(rdArray,num,ctg1,ctg2,origOverlap,
+ kmerCtg1,kmerCtg2,overlap,WordFilter);
+ // tt only fed the srand48() call that is commented out below
+ time_t tt;
+ time(&tt);
+// srand48((int)tt);
+/*
+ int i,j;
+ for(i=0;i<2;i++){
+ int array[] = {0,1,2,3};
+ for(j=4;j>0;j--)
+ fprintf(stderr,"%d ", nPick1(array,j));
+ }
+ fprintf(stderr,"\n");
+*/
+ /***************** search path to connect contig ends ********/
+ int gapLen = ctg2->start - ctg1->end - origOverlap + overlap;
+ // accepted path length window: [gapLen-GLDiff, gapLen+GLDiff], floored at 0 and with max at least 10
+ int min = gapLen-GLDiff>0 ? gapLen-GLDiff:0;
+ int max = gapLen + GLDiff < 10 ? 10 : gapLen + GLDiff;
+ //count kmer number for contig1 and contig2 ends
+ int len1,len2;
+ len1 = CTGendLenctgID].length+origOverlap ?
+ CTGendLen:contig_array[ctg1->ctgID].length+origOverlap;
+ len2 = CTGendLenctgID].length+origOverlap ?
+ CTGendLen:contig_array[ctg2->ctgID].length+origOverlap;
+ // convert base counts into k-mer counts
+ len1 -= overlap-1;
+ len2 -= overlap-1;
+
+ //int pathNum = 2;
+ int offset1=0,offset2=0,cut1=0,cut2=0;
+ int pathNum = searchFgap(kmerSet,ctg1,ctg2,kmerCtg1,kmerCtg2,
+ origOverlap,overlap,gapSeqArray,
+ len1,len2,WordFilter,&offset1,&offset2,seqGap,&cut1,&cut2);
+
+ //printf("SF: %d K %d\n",pathNum,overlap);
+ if(pathNum==0){
+ free_kmerset(kmerSet);
+ return 0;
+ }else if(pathNum==1){
+ free_kmerset(kmerSet);
+ return 1;
+ }/*
+ else{
+ printf("ret %d\n",pathNum);
+ free_kmerset(kmerSet);
+ return 0;
+ } */
+
+ /******************* cross the gap by single reads *********/
+ //kmerSet_markTandem(kmerSet,WordFilter,overlap);
+ maskRepeatNode(kmerSet,kmerCtg1,kmerCtg2,overlap,
+ len1,len2,max,WordFilter);
+ boolean found = readsCrossGap(rdArray,num,origOverlap,gapSeqArray,
+ kmerCtg1,kmerCtg2,overlap,ctg1,ctg2,kmerSet,WordFilter,min,max,
+ offset1,offset2,seqGap,seqCtg1,seqCtg2,cut1,cut2);
+ if(found){
+ //fprintf(stderr,"read across\n");
+ free_kmerset(kmerSet);
+ return found;
+ }
+ else{
+ free_kmerset(kmerSet);
+ return 0;
+ }
+
+}
+
+/*
+ * Walk every occupied slot of `set`; for each k-mer whose inEdge is not
+ * positive, ask trace4Repeat() for a route of 1..overlap steps that starts
+ * and ends at that k-mer, and set `checked` on the node when one exists.
+ */
+static void kmerSet_markTandem(KmerSet *set,Kmer WordFilter,int overlap)
+{
+    const int minSteps = 1;
+    const int maxSteps = overlap;
+    const int routeLimit = 1;
+    kmer_t *entry;
+    int routes;
+    int traced;
+
+    for(set->iter_ptr = 0; set->iter_ptr < set->size; set->iter_ptr ++){
+        if(is_kmer_entity_null(set->flags, set->iter_ptr))
+            continue;
+
+        entry = set->array + set->iter_ptr;
+        if(entry->inEdge>0)
+            continue;
+
+        routes = 0;
+        traced = 0;
+        trace4Repeat(entry->seq,0,minSteps,maxSteps,&routes,
+                     set,entry->seq,overlap,WordFilter,
+                     &traced,routeLimit,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL);
+
+        if(routes>=1)
+            entry->checked = 1;
+    }
+}
+/******************* the following is for read-crossing gaps *************************/
+
+#define MAXREADLENGTH 100
+
+static const int INDEL = 0;
+static const int SIM[4][4] = {
+ {1, 0, 0, 0},
+ {0, 1, 0, 0},
+ {0, 0, 1, 0},
+ {0, 0, 0, 1}
+};
+static char fastSequence[MAXREADLENGTH];
+static char slowSequence[MAXREADLENGTH];
+
+static int Fmatrix[MAXREADLENGTH + 1][MAXREADLENGTH + 1];
+static int slowToFastMapping[MAXREADLENGTH + 1];
+static int fastToSlowMapping[MAXREADLENGTH + 1];
+
+/* Largest of three ints. */
+static int max(int A, int B, int C)
+{
+    int best = B;
+
+    if (A >= B)
+        best = A;
+    if (C > best)
+        best = C;
+    return best;
+}
+
+/*
+ * Fill the file-level Fmatrix with a dynamic-programming alignment of
+ * sequence1 (rows) against sequence2 (columns), scoring with SIM and INDEL,
+ * and return the bottom-right cell. Returns 0 without touching Fmatrix when
+ * either length is outside 1..MAXREADLENGTH.
+ * The sequence bytes are used directly as indices into SIM.
+ */
+static int compareSequences(char * sequence1, char * sequence2, int length1, int length2)
+{
+    int row, col;
+
+    if (length1 < 1 || length1 > MAXREADLENGTH)
+        return 0;
+    if (length2 < 1 || length2 > MAXREADLENGTH)
+        return 0;
+
+    /* first row and first column are all zero */
+    for (col = 0; col <= length2; col++)
+        Fmatrix[0][col] = 0;
+
+    for (row = 1; row <= length1; row++) {
+        Fmatrix[row][0] = 0;
+        for (col = 1; col <= length2; col++) {
+            int diagonal = Fmatrix[row - 1][col - 1] +
+                SIM[(int) sequence1[row-1]][(int) sequence2[col-1]];
+            int fromAbove = Fmatrix[row - 1][col] + INDEL;
+            int fromLeft = Fmatrix[row][col - 1] + INDEL;
+
+            Fmatrix[row][col] = max(diagonal, fromAbove, fromLeft);
+        }
+    }
+
+    return Fmatrix[length1][length2];
+}
+
+/*
+ * Trace back through Fmatrix (filled by compareSequences with the fast
+ * sequence on rows and the slow sequence on columns) and record, for every
+ * position of each sequence, the matching position in the other one:
+ *   slowToFastMapping[s] = position in fastSequence aligned with slow s
+ *   fastToSlowMapping[f] = position in slowSequence aligned with fast f
+ * Leading positions the traceback never reaches are set to -1, and the
+ * one-past-the-end entries map the two lengths onto each other.
+ *
+ * slowSeqLength / fastSeqLength: number of bases in slowSequence /
+ * fastSequence; both must be <= MAXREADLENGTH (size of the mapping arrays).
+ *
+ * Fix: when fastSeqLength is 0 the original stored slowIndex into
+ * fastToSlowMapping[0] *after* the countdown loop had driven it to -1.
+ * It now stores slowSeqLength, mirroring the slowSeqLength == 0 branch and
+ * the end-of-function rule fastToSlowMapping[fastSeqLength] = slowSeqLength.
+ */
+static void mapSlowOntoFast(int slowSeqLength,int fastSeqLength)
+{
+    int slowIndex = slowSeqLength;
+    int fastIndex = fastSeqLength;
+    int fastn, slown;
+
+    /* empty slow sequence: every fast position maps to slow position 0 */
+    if (slowIndex == 0) {
+        slowToFastMapping[0] = fastIndex;
+
+        while (fastIndex >= 0)
+            fastToSlowMapping[fastIndex--] = 0;
+
+        return;
+    }
+
+    /* empty fast sequence: every slow position maps to fast position 0 */
+    if (fastIndex == 0) {
+        while (slowIndex >= 0)
+            slowToFastMapping[slowIndex--] = 0;
+
+        fastToSlowMapping[0] = slowSeqLength;
+
+        return;
+    }
+
+    /* standard traceback: prefer the diagonal, then a gap in slow, then a gap in fast */
+    while (slowIndex > 0 && fastIndex > 0) {
+        fastn = (int) fastSequence[fastIndex-1];
+        slown = (int) slowSequence[slowIndex-1];
+
+        if (Fmatrix[fastIndex][slowIndex] ==
+            Fmatrix[fastIndex - 1][slowIndex - 1] +
+            SIM[fastn][slown]) {
+            fastToSlowMapping[--fastIndex] = --slowIndex;
+            slowToFastMapping[slowIndex] = fastIndex;
+        } else if (Fmatrix[fastIndex][slowIndex] ==
+                   Fmatrix[fastIndex - 1][slowIndex] + INDEL) {
+            fastToSlowMapping[--fastIndex] = slowIndex - 1;
+        } else if (Fmatrix[fastIndex][slowIndex] ==
+                   Fmatrix[fastIndex][slowIndex - 1] + INDEL) {
+            slowToFastMapping[--slowIndex] = fastIndex - 1;
+        } else {
+            /* the matrix is inconsistent with the scoring scheme */
+            printf("compareSequence: Error trace\n");
+            fflush(stdout);
+            abort();
+        }
+    }
+
+    /* whatever is left at the front of either sequence has no partner */
+    while (slowIndex > 0)
+        slowToFastMapping[--slowIndex] = -1;
+    while (fastIndex > 0)
+        fastToSlowMapping[--fastIndex] = -1;
+
+    slowToFastMapping[slowSeqLength] =
+        fastSeqLength;
+    fastToSlowMapping[fastSeqLength] =
+        slowSeqLength;
+}
+
+/*
+ * Scan one read (forward sequence src_seq and its reverse complement
+ * bal_seq, both len_seq long) k-mer by k-mer and look for a pair of anchors:
+ * first a k-mer found on contig 1's end (KmerCtg1, len1 entries) and later
+ * one found on contig 2's end (KmerCtg2, len2 entries).
+ *
+ * Forward strand state (`flag`):  0 -> 1 once a ctg1 anchor is found,
+ *                                 1 -> 3 once a ctg2 anchor follows.
+ * Reverse strand state (`bal_flag`): 0 -> 2 once a ctg2 anchor is found,
+ *                                    2 -> 3 once a ctg1 anchor follows.
+ * Only nodes with !checked are considered, and node->inEdge (1 or 2) selects
+ * which contig is searched.
+ *
+ * On success returns 1 and fills *start / *end (read positions of the two
+ * anchors), *bal (0 = forward strand, 1 = reverse strand) and
+ * *index1 / *index2 (anchor indices on KmerCtg1 / KmerCtg2). Returns 0 when
+ * neither strand reaches state 3.
+ *
+ * NOTE(review): three lines below (marked) lost text between '<' and '>'
+ * when this patch was extracted, including the initial k-mer setup and the
+ * k-mer set lookups -- restore from the original sources before compiling.
+ */
+static boolean chopReadFillGap(int len_seq,int overlap,char *src_seq, char *bal_seq,
+ KmerSet *kset,Kmer WORDF,int *start,int *end,boolean *bal,
+ Kmer *KmerCtg1,int len1,Kmer *KmerCtg2,int len2,int *index1,int *index2)
+{
+ int index,j=0,bal_j;
+ Kmer word,bal_word;
+ int flag=0,bal_flag=0;
+ int ctg1start,bal_ctg1start,ctg2end,bal_ctg2end;
+ int seqStart,bal_start,seqEnd,bal_end;
+ kmer_t *node;
+ boolean found;
+
+ // NOTE(review): truncated line -- handling of the first k-mer (j == 0)
+ if(len_seqlinear&&!node->checked){
+ if(!flag&&node->inEdge==1){
+ ctg1start = searchKmerOnCtg(word,KmerCtg1,len1);
+ if(ctg1start>=0){
+ flag = 1;
+ seqStart = j + overlap-1;
+ }
+ }
+ if(!bal_flag&&node->inEdge==2){
+ bal_ctg2end = searchKmerOnCtg(bal_word,KmerCtg2,len2);
+ if(bal_ctg2end>=0){
+ bal_flag = 2;
+ bal_end = bal_j+overlap-1;
+ }
+ }
+ }
+
+ // slide one base at a time; the reverse-strand k-mer is updated in step
+ for(j = 1; j <= len_seq - overlap; j ++) {
+ word = nextKmerLocal(word,src_seq[j-1+overlap],WORDF);
+ bal_j = len_seq-j-overlap; // j;
+ bal_word = prevKmerLocal(bal_word,bal_seq[bal_j],overlap);
+
+ // NOTE(review): truncated line -- canonical k-mer choice and set lookup
+ if(wordlinear&&!node->checked){
+ if(!flag&&node->inEdge==1){
+ ctg1start = searchKmerOnCtg(word,KmerCtg1,len1);
+ if(ctg1start>=0){
+ flag = 1;
+ seqStart = j + overlap-1;
+ }
+ }else if(flag==1&&node->inEdge==1){
+ index = searchKmerOnCtg(word,KmerCtg1,len1);
+ if(index>=0&&index>ctg1start){ // choose hit closer to gap
+ ctg1start = index;
+ seqStart = j + overlap-1;
+ }
+ }else if(flag==1&&node->inEdge==2){
+ ctg2end = searchKmerOnCtg(word,KmerCtg2,len2);
+ if(ctg2end>=0){
+ // forward strand has both anchors: stop scanning
+ flag = 3;
+ seqEnd = j+overlap-1;
+ break;
+ }
+ }
+
+ if(!bal_flag&&node->inEdge==2){
+ bal_ctg2end = searchKmerOnCtg(bal_word,KmerCtg2,len2);
+ if(bal_ctg2end>=0){
+ bal_flag = 2;
+ bal_end = bal_j+overlap-1;
+ }
+ }else if(bal_flag==2&&node->inEdge==2){
+ index = searchKmerOnCtg(bal_word,KmerCtg2,len2);
+ // NOTE(review): truncated line -- the ctg2 re-anchoring branch and the start of the ctg1 branch are missing
+ if(index>=0&&indexinEdge==1){
+ bal_ctg1start = searchKmerOnCtg(bal_word,KmerCtg1,len1);
+ if(bal_ctg1start>=0){
+ // reverse strand has both anchors: stop scanning
+ bal_flag = 3;
+ bal_start = bal_j+overlap-1;
+ break;
+ }
+ }
+ }
+ }
+ // forward-strand result takes precedence over the reverse-strand one
+ if(flag==3){
+ *start = seqStart;
+ *end = seqEnd;
+ *bal = 0;
+ *index1 = ctg1start;
+ *index2 = ctg2end;
+ return 1;
+ }else if(bal_flag==3){
+ *start = bal_start;
+ *end = bal_end;
+ *bal = 1;
+ *index1 = bal_ctg1start;
+ *index2 = bal_ctg2end;
+ return 1;
+ }
+ return 0;
+}
+
+
+/*
+ * Unpack bases start..end (inclusive, clamped to 0..length-1) of a packed
+ * sequence into src_seq, one base per byte.
+ * With revS set, the bases are read from the mirrored positions
+ * (length-1-start down to length-1-end) and passed through int_comp().
+ * Returns end-start+1 computed from the clamped bounds.
+ */
+static int cutSeqFromTightStr(char *tightStr,int length,int start,int end,int revS,char *src_seq)
+{
+    int pos;
+    int written = 0;
+
+    if(end >= length)
+        end = length-1;
+    if(start < 0)
+        start = 0;
+
+    if(revS){
+        for(pos=start;pos<=end;pos++)
+            src_seq[written++] = int_comp(getCharInTightString(tightStr,length-1-pos));
+    }
+    else{
+        for(pos=start;pos<=end;pos++)
+            src_seq[written++] = getCharInTightString(tightStr,pos);
+    }
+
+    return end-start+1;
+}
+
+/*
+ * Copy bases start..end of contig ctgID into `sequence`.
+ * The usable length is the contig length plus originOverlap. When the contig
+ * itself stores no sequence, the twin contig's sequence is read in reverse
+ * mode instead. Returns 0 for a contig whose length is below 1, otherwise
+ * whatever cutSeqFromTightStr() returns.
+ */
+static int cutSeqFromCtg(unsigned int ctgID,int start,int end, char *sequence,int originOverlap)
+{
+    unsigned int twinID = getTwinCtg(ctgID);
+    unsigned int sourceID;
+    int fullLen;
+    int reversed;
+
+    if(contig_array[ctgID].length<1)
+        return 0;
+
+    fullLen = contig_array[ctgID].length+originOverlap;
+    reversed = contig_array[ctgID].seq ? 0 : 1;
+    sourceID = reversed ? twinID : ctgID;
+
+    return cutSeqFromTightStr(contig_array[sourceID].seq,fullLen,start,end,reversed,sequence);
+}
+
+/*
+ * Copy src_seq[start..end] (inclusive) into `sequence` and return the number
+ * of positions covered (end-start+1). A diagnostic line is printed to stdout
+ * when the requested end lies at or beyond `length`.
+ * NOTE(review): the clamping line below lost text between '<' and '>' during
+ * extraction; presumably it was
+ *   end = end<length ? end:length-1;  start = start>=0 ? start:0;
+ * as in cutSeqFromTightStr -- confirm against the original sources.
+ */
+static int cutSeqFromRead(char *src_seq,int length,int start,int end,char *sequence)
+{
+ if(end>=length)
+ printf("******: end %d length %d\n",end,length);
+ // NOTE(review): truncated line, see header comment
+ end = end=0 ? start:0;
+ int i;
+ for(i=start;i<=end;i++)
+ sequence[i-start] = src_seq[i];
+ return end-start+1;
+}
+
+static void printSeq(FILE *fo,char *seq,int len)
+{
+ int i;
+ for(i=0;i 100 ? maxReadLen:100;
+ int length = contig_array[ctg1->ctgID].length+originOverlap;
+ if(buffer_size>offset1){
+ lenCtg1 = cutSeqFromCtg(ctg1->ctgID,length-cut1-(buffer_size-offset1),length-1-cut1,seqCtg1,originOverlap);
+ for(i=0;ictgID].length+originOverlap;
+ if(buffer_size>offset2){
+ lenCtg2 = cutSeqFromCtg(ctg2->ctgID,cut2,buffer_size-offset2-1+cut2,&(seqCtg2[offset2]),originOverlap);
+ for(i=0;i0||offset2>0){
+ for(i=0;imax)
+ continue;
+ if(overlap+(len1-startOnCtg1-1)+endOnCtg2-(end-start)>(int)originOverlap)
+ continue; // contig1 and contig2 could not overlap more than origOverlap bases
+ START[i] = start;
+ END[i] = end;
+ INDEX1[i] = startOnCtg1;
+ INDEX2[i] = endOnCtg2;
+ BAL[i] = bal;
+
+ int matchLen = 2*overlap<(end-start+overlap) ? 2*overlap:(end-start+overlap);
+ int match;
+ int alignLen = matchLen;
+ //compare the left of hit kmer on ctg1
+ //int ctgLeft = (contig_array[ctg1->ctgID].length+originOverlap)-(len1+overlap-1)+startOnCtg1;
+ int ctgLeft = (lenCtg1)-(len1+overlap-1)+startOnCtg1;
+ int readLeft = start-overlap+1;
+ int cmpLen = ctgLeftctgID,ctgLeft-cmpLen,ctgLeft-1,fastSequence,originOverlap);
+ cutSeqFromRead(seqCtg1,lenCtg1,ctgLeft-cmpLen,ctgLeft-1,fastSequence);
+ if(!bal)
+ cutSeqFromRead(src_seq,rdArray[i].len,readLeft-cmpLen,readLeft-1,slowSequence);
+ else
+ cutSeqFromRead(bal_seq,rdArray[i].len,readLeft-cmpLen,readLeft-1,slowSequence);
+ match = compareSequences(fastSequence,slowSequence, cmpLen, cmpLen);
+
+ alignLen += cmpLen;
+ matchLen += match;
+
+ //compare the right of hit kmer on ctg1
+ int ctgRight = len1-startOnCtg1-1;
+
+ cmpLen = ctgRight<(rdArray[i].len-start-1) ? ctgRight:(rdArray[i].len-start-1);
+ cmpLen = cmpLen<=MAXREADLENGTH ? cmpLen:MAXREADLENGTH;
+ //cutSeqFromCtg(ctg1->ctgID,ctgLeft+overlap,ctgLeft+overlap+cmpLen-1,fastSequence,originOverlap);
+ cutSeqFromRead(seqCtg1,lenCtg1,ctgLeft+overlap,ctgLeft+overlap+cmpLen-1,fastSequence);
+ if(!bal)
+ cutSeqFromRead(src_seq,rdArray[i].len,start+1,start+cmpLen,slowSequence);
+ else
+ cutSeqFromRead(bal_seq,rdArray[i].len,start+1,start+cmpLen,slowSequence);
+ match = compareSequences(fastSequence,slowSequence, cmpLen, cmpLen);
+ //fprintf(stderr,"%d -- %d\n",match,cmpLen);
+
+ alignLen += cmpLen;
+ matchLen += match;
+
+ //compare the left of hit kmer on ctg2
+ ctgLeft = endOnCtg2;
+ readLeft = end-overlap+1;
+ cmpLen = ctgLeftctgID,endOnCtg2-cmpLen,endOnCtg2-1,fastSequence,originOverlap);
+ cutSeqFromRead(seqCtg2,lenCtg2,endOnCtg2-cmpLen,endOnCtg2-1,fastSequence);
+ if(!bal)
+ cutSeqFromRead(src_seq,rdArray[i].len,readLeft-cmpLen,readLeft-1,slowSequence);
+ else
+ cutSeqFromRead(bal_seq,rdArray[i].len,readLeft-cmpLen,readLeft-1,slowSequence);
+ match = compareSequences(fastSequence,slowSequence, cmpLen, cmpLen);
+ alignLen += cmpLen;
+ matchLen += match;
+
+ //compare the right of hit kmer on ctg2
+ //ctgRight = contig_array[ctg2->ctgID].length+originOverlap-endOnCtg2-overlap;
+ ctgRight = lenCtg2-endOnCtg2-overlap;
+ cmpLen = ctgRight<(rdArray[i].len-end-1) ? ctgRight:(rdArray[i].len-end-1);
+ cmpLen = cmpLen<=MAXREADLENGTH ? cmpLen:MAXREADLENGTH;
+ //cutSeqFromCtg(ctg2->ctgID,endOnCtg2+overlap,endOnCtg2+overlap+cmpLen-1,fastSequence,originOverlap);
+ cutSeqFromRead(seqCtg2,lenCtg2,endOnCtg2+overlap,endOnCtg2+overlap+cmpLen-1,fastSequence);
+ if(!bal)
+ cutSeqFromRead(src_seq,rdArray[i].len,end+1,end+cmpLen,slowSequence);
+ else
+ cutSeqFromRead(bal_seq,rdArray[i].len,end+1,end+cmpLen,slowSequence);
+ match = compareSequences(fastSequence,slowSequence, cmpLen, cmpLen);
+ alignLen += cmpLen;
+ matchLen += match;
+ /*
+ if(cmpLen>0&&match!=cmpLen+overlap){
+ printSeq(stderr,fastSequence,cmpLen+overlap);
+ printSeq(stderr,slowSequence,cmpLen+overlap);
+ printKmer(stderr,kmerCtg2[endOnCtg2],overlap);
+ fprintf(stderr,": %d(%d)\n",bal,endOnCtg2);
+ }else if(cmpLen>0&&match==cmpLen+overlap)
+ fprintf(stderr,"Perfect\n");
+ */
+ double score = (double)matchLen/alignLen;
+ if(maxScore0.0)
+ fprintf(stderr,"SCORE: %4.2f\n",maxScore);
+ */
+ if(maxScore>0.9){
+ /*
+ for(i=0;i0 ? offset1-(len1-INDEX1[maxIndex]-1):0;
+ int rightRemain = offset2-(overlap+INDEX2[maxIndex])>0 ? offset2-(overlap+INDEX2[maxIndex]):0;
+
+ ctg1->gapSeqOffset = gapSeqArray->item_c;
+ ctg1->gapSeqLen = END[maxIndex]-START[maxIndex]+leftRemain+rightRemain;
+ if(darrayPut(gapSeqArray,ctg1->gapSeqOffset+(END[maxIndex]-START[maxIndex]+leftRemain+rightRemain)/4)){
+ pt = (char *)darrayPut(gapSeqArray,ctg1->gapSeqOffset);
+ for(j=0;jcutTail=len1-INDEX1[maxIndex]-1-offset1+cut1>cut1 ?len1-INDEX1[maxIndex]-1-offset1+cut1:cut1;
+ ctg2->cutHead=overlap+INDEX2[maxIndex]-offset2+cut2>cut2 ?overlap+INDEX2[maxIndex]-offset2+cut2:cut2;
+ ctg2->scaftig_start = 0;
+ ret = 1;
+ }
+ }
+ free((void*)START);
+ free((void*)END);
+ free((void*)INDEX1);
+ free((void*)INDEX2);
+ free((void*)SCORE);
+ free((void*)BAL);
+
+ free((void*)src_seq);
+ free((void*)bal_seq);
+ return ret;
+}
+
diff --git a/fusion/main.c b/fusion/main.c
new file mode 100755
index 0000000..ddd9ee6
--- /dev/null
+++ b/fusion/main.c
@@ -0,0 +1,163 @@
+#include "stdinc.h"
+#include "newhash.h"
+#include "extfunc.h"
+#include "global.h"
+
+
+extern int call_scaffold();
+extern int call_align();
+extern int call_bundle();
+extern int data_prepare();
+
+#define MAPPING 0
+#define SCAFF 1
+#define BUNDLE 2
+#define PREPARE 3
+#define POTENT 4
+static void usage();
+/*
+ * Duplicate a getopt() argument into a ckalloc'ed buffer.
+ * The buffer is never smaller than the 256 bytes the original code handed
+ * out, but grows to fit longer arguments instead of overflowing.
+ */
+static char *copyOptArg(const char *arg)
+{
+    size_t need = strlen(arg) + 1;
+    char *copy;
+
+    if (need < 256)
+        need = 256;
+    copy = (char *)ckalloc(need * sizeof(char));
+    strcpy(copy, arg);
+    return copy;
+}
+
+/*
+ * Entry point of SOAPdenovo-fusion.
+ *
+ * Exactly one mode flag selects the stage to run:
+ *   -D data prepare (needs -g and -c)   -M mapping (needs -s and -g)
+ *   -B bundling (needs -g)              -O potential analysis (needs -g)
+ *   -S scaffolding (needs -g)
+ * The remaining options fill the corresponding global parameters.
+ * Returns 0 on success; prints usage and exits with status 1 on a missing
+ * mode, a missing mandatory option or an unknown option. -h prints usage and
+ * returns 0.
+ *
+ * Changes against the first version:
+ *   - inpseq/outseq start at 0 (they were read uninitialized);
+ *   - "h" is part of the option string, so the -h case is reachable;
+ *   - getopt() is compared with -1 as POSIX specifies;
+ *   - file-name arguments can no longer overflow their 256-byte buffers;
+ *   - the -O check compares the int flag with 0 instead of NULL.
+ */
+int main(int argc, char **argv)
+{
+    int c = 0;
+    int inpseq = 0, outseq = 0;   /* set once -s / -g have been seen */
+    int mode = -1;                /* -1: no mode flag given */
+
+    printf("Mapping & Scaffolding module.\n");
+
+    if(argc==1){
+        usage();
+        return 0;
+    }
+
+    while((c=getopt(argc,argv,"s:g:p:L:t:i:u:c:P:K:MSBDOh"))!=-1){
+        switch(c){
+        case 'M':
+            mode=MAPPING;
+            break;
+        case 'S':
+            mode=SCAFF;
+            break;
+        case 'B':
+            mode=BUNDLE;
+            break;
+        case 'D':
+            mode=PREPARE;
+            break;
+        case 'O':
+            mode=POTENT;
+            break;
+        case 's':
+            inpseq = 1;
+            shortrdsfile = copyOptArg(optarg);
+            break;
+        case 'g':
+            outseq = 1;
+            graphfile = copyOptArg(optarg);
+            break;
+        case 'p':
+            thrd_num = atoi(optarg);
+            break;
+        case 'L':
+            ctg_short = atoi(optarg);
+            break;
+        case 'P':
+            OverlapPercent = atof (optarg);
+            break;
+        case 't':
+            close_threshold = atof (optarg);
+            break;
+        case 'i':
+            ins_size_var = atoi (optarg);
+            break;
+        case 'u':
+            bund_threshold = atoi (optarg);
+            break;
+        case 'c':
+            ctg_file = copyOptArg(optarg);
+            break;
+        case 'K':
+            overlaplen = atoi(optarg);
+            break;
+        case 'h':
+            usage();
+            return 0;
+        case '?':
+        default:
+            usage();
+            exit(1);
+        }
+    }
+
+    switch(mode){
+    case MAPPING:
+        printf("[%s]Mapping mode selected .\n",__FUNCTION__);
+        if(outseq==0||inpseq==0){
+            usage();
+            exit(1);
+        }
+        call_align();
+        break;
+    case SCAFF:
+        printf("[%s]Scaffolding mode selected .\n",__FUNCTION__);
+        if(outseq==0){
+            usage();
+            exit(1);
+        }
+        call_scaffold();
+        break;
+    case BUNDLE:
+        printf("[%s]Bundling mode selected .\n",__FUNCTION__);
+        if(outseq==0){
+            usage();
+            exit(1);
+        }
+        call_bundle();
+        break;
+    case PREPARE:
+        printf("[%s]Data prepare mode selected .\n",__FUNCTION__);
+        if(outseq==0||ctg_file==NULL){
+            usage();
+            exit(1);
+        }
+        data_prepare();
+        break;
+    case POTENT:
+        printf("[%s]Potential analysis mode selected .\n",__FUNCTION__) ;
+        if(outseq==0){
+            usage();
+            exit(1);
+        }
+        potential();
+        break;
+    default:
+        /* no mode flag on the command line */
+        usage();
+        exit(1);
+    }
+
+    return 0;
+}
+
+/* Print the command-line help for all modes to stdout. */
+static void usage(){
+    static const char *const helpText[] = {
+        "parameters:\n",
+        "global:\n",
+        "-s\tLibrary file.\n",
+        "-g\tPrefix of input files.\n",
+        "-p\tThreads.\n\n",
+        "Data prepare mode:\n",
+        "-D\tEnable this mode.\n",
+        "-K\tKmer.\n",
+        "-c\tInput contig file.(can't be name prefix.contig)\n\n",
+        "Mapping mode:\n",
+        "-M\tEnable this mode.\n\n",
+        "Bundling mode.\n",
+        "-B\tEnable this mode.\n",
+        "-u\tWeight threshold for outputting bundle file.(default 3)\n\n",
+        "Potential analysis mode.\n",
+        "-O\tEnable this mode.\n",
+        "Scaffolding mode:\n",
+        "-S\tEnable this mode.\n",
+        "-L\tthreshold for minimum length of contig(default K+2).\n",
+        "-P\tOverlap percent threshold for a subgraph(default 0.075).\n",
+        "-t\tOverlap percent threshold for a PE(default 0.2).\n",
+        "-i\tOverlap length threshold for remove transitive connect(default 20).\n"
+    };
+    unsigned int line;
+
+    for(line=0;line<sizeof(helpText)/sizeof(helpText[0]);line++)
+        printf("%s",helpText[line]);
+}
diff --git a/fusion/map.c b/fusion/map.c
new file mode 100755
index 0000000..8c46099
--- /dev/null
+++ b/fusion/map.c
@@ -0,0 +1,42 @@
+#include "stdinc.h"
+#include "newhash.h"
+#include "extfunc.h"
+#include "extvab.h"
+
+//static void initenv(int argc, char **argv);
+
+
+static void display_map_usage();
+
+/*
+ * Mapping stage: load contigs as graph nodes, map long reads, then map the
+ * remaining reads, release the k-mer sets and report the elapsed minutes.
+ * The contig length cutoff is reset to overlaplen+2 before loading.
+ * Always returns 0.
+ */
+int call_align()
+{
+    time_t start_t, stop_t;
+    time_t time_bef, time_aft;   /* per-stage stamps; only the commented-out reports used them */
+    int minutes;
+
+    start_t = time(NULL);
+
+    /* stage 1: contigs -> graph nodes */
+    time_bef = time(NULL);
+    ctg_short = overlaplen+2;
+    prlContig2nodes(graphfile,ctg_short);
+    time_aft = time(NULL);
+
+    /* stage 2: long reads */
+    time_bef = time(NULL);
+    prlLongRead2Ctg(shortrdsfile,graphfile);
+    time_aft = time(NULL);
+
+    /* stage 3: all other reads */
+    time_bef = time(NULL);
+    prlRead2Ctg(shortrdsfile,graphfile);
+    time_aft = time(NULL);
+
+    free_Sets(KmerSets,thrd_num);
+
+    stop_t = time(NULL);
+    minutes = (int)(stop_t-start_t)/60;
+    printf("[%s]total time on mapping reads to contig :%dm\n",__FUNCTION__,minutes);
+    return 0;
+}
diff --git a/fusion/mem_manager.c b/fusion/mem_manager.c
new file mode 100755
index 0000000..77f8024
--- /dev/null
+++ b/fusion/mem_manager.c
@@ -0,0 +1,89 @@
+#include "stdinc.h"
+#include "newhash.h"
+#include "extfunc.h"
+#include "extvab.h"
+
+/*
+ * Allocate a pool descriptor that will hand out items of unit_size bytes,
+ * num_items per block. No block is allocated yet; the block list and the
+ * recycle list start empty and the item counter at 0.
+ */
+MEM_MANAGER *createMem_manager(int num_items,size_t unit_size)
+{
+    MEM_MANAGER *pool = (MEM_MANAGER *)ckalloc(sizeof(MEM_MANAGER));
+
+    pool->items_per_block = num_items;
+    pool->item_size = unit_size;
+    pool->counter = 0;
+    pool->block_list = NULL;
+    pool->recycle_list = NULL;
+
+    return pool;
+}
+
+/* Release every block owned by the pool, then the pool itself. NULL is a no-op. */
+void freeMem_manager(MEM_MANAGER *mem_Manager)
+{
+    BLOCK_START *current;
+    BLOCK_START *following;
+
+    if(mem_Manager == NULL)
+        return;
+
+    for(current = mem_Manager->block_list; current != NULL; current = following){
+        /* read the link before the block disappears */
+        following = current->next;
+        free((void *)current);
+    }
+
+    free((void *)mem_Manager);
+}
+
+/*
+ * Hand out one item from the pool.
+ *
+ * Order of preference:
+ *   1. the head of the recycle list (items given back via returnItem);
+ *   2. the next unused slot of the newest block;
+ *   3. the first slot of a freshly allocated block, when there is no block
+ *      yet or the newest one is used up.
+ * `counter` is incremented for every item carved out of a block (cases 2
+ * and 3), not for recycled ones. Returns NULL when mem_Manager is NULL.
+ *
+ * Item addresses are computed with char* arithmetic; the original added
+ * byte offsets to a void*, which only compiles as a GNU extension.
+ */
+void *getItem(MEM_MANAGER *mem_Manager)
+{
+    RECYCLE_MARK *mark;     //this is the type of return value
+    BLOCK_START *block;
+    char *items;            // first item slot, right after the block header
+
+    if(!mem_Manager)
+        return NULL;
+
+    if(mem_Manager->recycle_list){
+        mark = mem_Manager->recycle_list;
+        mem_Manager->recycle_list = mark->next;
+        return mark;
+    }
+
+    mem_Manager->counter++;
+
+    if(!mem_Manager->block_list||mem_Manager->index_in_block==mem_Manager->items_per_block){
+        // block header followed by items_per_block item slots
+        block = (BLOCK_START *)ckalloc(sizeof(BLOCK_START)+mem_Manager->items_per_block*mem_Manager->item_size);
+        block->next = mem_Manager->block_list;
+        mem_Manager->block_list = block;
+        // slot 0 is returned right now, so the next free slot is 1
+        mem_Manager->index_in_block = 1;
+        items = (char *)block+sizeof(BLOCK_START);
+        return (RECYCLE_MARK *)items;
+    }
+
+    block = mem_Manager->block_list;
+    items = (char *)block+sizeof(BLOCK_START);
+    return (RECYCLE_MARK *)(items+mem_Manager->item_size*(mem_Manager->index_in_block++));
+}
+
+/*
+ * Give an item back to the pool by pushing it onto the recycle list; the
+ * item's own storage is reused to hold the list link.
+ * A NULL pool or NULL item is ignored, matching the NULL tolerance of
+ * getItem() and freeMem_manager() (getItem can itself return NULL).
+ */
+void returnItem(MEM_MANAGER *mem_Manager,void *item)
+{
+    RECYCLE_MARK *mark;
+
+    if(!mem_Manager||!item)
+        return;
+
+    mark = item;
+
+    mark->next = mem_Manager->recycle_list;
+    mem_Manager->recycle_list = mark;
+}
+
+/*
+void test_mem_manager()
+{
+ MEM_MANAGER *test_manager;
+ NODE *temp_node;
+
+ test_manager = createMem_manager(NODEBLOCKSIZE,sizeof(NODE));
+ temp_node = (NODE *)getItem(test_manager);
+ returnItem(test_manager,temp_node);
+
+ freeMem_manager(test_manager);
+}
+*/
+
diff --git a/fusion/newhash.c b/fusion/newhash.c
new file mode 100755
index 0000000..b568afd
--- /dev/null
+++ b/fusion/newhash.c
@@ -0,0 +1,465 @@
+#include "stdinc.h"
+#include "newhash.h"
+#include "extfunc.h"
+#include "extvab.h"
+
+#define PUBLIC_FUNC
+#define PROTECTED_FUNC
+
+static const kmer_t empty_kmer = {0, 0, 0, 0, 0, 0, 1, 0,0};
+
+/*
+ * Bump the coverage counters of an existing k-mer: the left-neighbour
+ * counter for base `left` and the right-neighbour counter for base `right`.
+ * A base value of 4 or more means "no neighbour on that side" and is
+ * skipped; counters stop growing at MAX_KMER_COV.
+ */
+static inline void update_kmer(kmer_t *mer, ubyte left, ubyte right){
+    ubyte4 seen;
+
+    if(left<4){
+        seen = get_kmer_left_cov(*mer, left);
+        if(seen < MAX_KMER_COV)
+            set_kmer_left_cov(*mer, left, seen + 1);
+    }
+
+    if(right<4){
+        seen = get_kmer_right_cov(*mer, right);
+        if(seen < MAX_KMER_COV)
+            set_kmer_right_cov(*mer, right, seen + 1);
+    }
+}
+
+/*
+ * Initialise a slot for a k-mer seen for the first time: start from
+ * empty_kmer, store the sequence, and give each valid neighbour base
+ * (value below 4) a coverage of 1.
+ */
+static inline void set_new_kmer(kmer_t *mer, ubyte8 seq, ubyte left, ubyte right){
+    *mer = empty_kmer;
+    set_kmer_seq(*mer, seq);
+
+    if(left<4){
+        set_kmer_left_cov(*mer, left, 1);
+    }
+    if(right<4){
+        set_kmer_right_cov(*mer, right, 1);
+    }
+}
+
+
+static inline int is_prime_kh(ubyte8 num){
+ ubyte8 i, max;
+ if(num < 4) return 1;
+ if(num % 2 == 0) return 0;
+ max = (ubyte8)sqrt((float)num);
+ for(i=3;isize = init_size;
+ set->count = 0;
+
+ set->searchCnt = 0;
+ set->foundCnt = 0;
+ set->delCnt = 0;
+ set->searchSpcSeedCnt = 0;
+ set->getSpcSeedCnt = 0;
+ set->levelGet[0] = 0;
+ set->levelGet[1] = 0;
+ set->levelGet[2] = 0;
+
+ set->max = set->size * load_factor;
+ if(load_factor <= 0) load_factor = 0.25f;
+ else if(load_factor >= 1) load_factor = 0.75f;
+ set->load_factor = load_factor;
+ set->iter_ptr = 0;
+ set->array = calloc(set->size, sizeof(kmer_t));
+ set->flags = malloc((set->size + 15)/16 * 4);
+ memset(set->flags, 0x55, (set->size + 15) / 16 * 4);
+ return set;
+}
+
+/*
+ * Linear-probe for `seq` starting at seq % size and return the index of the
+ * slot that either already holds `seq` or is the first empty slot met.
+ * The probe wraps around at the end of the table and only stops on one of
+ * those two conditions.
+ */
+PROTECTED_FUNC static inline ubyte8 get_kmerset(KmerSet *set, ubyte8 seq){
+    ubyte8 slot = seq % set->size;
+
+    for(;;){
+        if(is_kmer_entity_null(set->flags, slot))
+            return slot;
+        if(get_kmer_seq(set->array[slot]) == seq)
+            return slot;
+
+        slot ++;
+        if(slot == set->size)
+            slot = 0;
+    }
+}
+
+/*
+ * Look `seq` up in the set. Returns 1 and stores the entry's address in *rs
+ * when it is present; returns 0 (leaving *rs untouched) as soon as an empty
+ * slot is reached on the probe sequence.
+ */
+PUBLIC_FUNC int search_kmerset(KmerSet *set, ubyte8 seq, kmer_t **rs){
+    ubyte8 slot = seq % set->size;
+
+    for(;;){
+        if(is_kmer_entity_null(set->flags, slot))
+            return 0;
+
+        if(get_kmer_seq(set->array[slot]) == seq){
+            *rs = set->array + slot;
+            return 1;
+        }
+
+        slot ++;
+        if(slot == set->size)
+            slot = 0;
+    }
+}
+
+/* Non-zero when the slot get_kmerset() lands on for `seq` is not flagged empty. */
+PUBLIC_FUNC static inline int exists_kmerset(KmerSet *set, ubyte8 seq){
+    ubyte8 slot = get_kmerset(set, seq);
+
+    if(is_kmer_entity_null(set->flags, slot))
+        return 0;
+    return 1;
+}
+
+/*
+ * Make room for `num` more entries. Nothing happens while count+num stays
+ * within set->max; otherwise the table is grown to a prime size for which
+ * size*load_factor covers count+num, and the existing entries are rehashed
+ * in place inside the realloc'ed array. Aborts when realloc fails.
+ * NOTE(review): the header of the rehash loop lost text between '<' and '>'
+ * during extraction; presumably it iterates i over the old size and skips
+ * slots that hold no live entry -- confirm against the original sources.
+ */
+PROTECTED_FUNC static inline void encap_kmerset(KmerSet *set, ubyte8 num){
+ ubyte4 *flags, *f;
+ ubyte8 i, n, size, hc;
+ kmer_t key, tmp;
+ if(set->count + num <= set->max) return;
+ n = set->size;
+ // double while small, grow linearly once large, always landing on a prime
+ do{
+ if(n < 0xFFFFFFFU)
+ n <<= 1;
+ else
+ n += 0xFFFFFFU;
+ n = find_next_prime_kh(n);
+ } while(n * set->load_factor < set->count + num);
+
+ set->array = realloc(set->array, n * sizeof(kmer_t));
+ if(set->array == NULL){
+ fprintf(stderr, "-- Out of memory --\n");
+ abort();
+ }
+ // fresh flag array for the new size, filled with the 0x55 pattern also used by the init functions
+ flags = malloc((n+15)/16 * 4);
+ memset(flags, 0x55, (n+15)/16 * 4);
+ size = set->size;
+ set->size = n;
+ set->max = n * set->load_factor;
+ // swap: set->flags now describes the new layout, `flags` the old one
+ f = set->flags;
+ set->flags = flags;
+ flags = f;
+ // NOTE(review): truncated loop header, see function comment
+ for(i=0;iarray[i];
+ set_kmer_entity_del(flags, i);
+ // place `key`; if its new slot still holds an unmoved old entry, evict that entry and place it next
+ while(1){
+ hc = get_kmer_seq(key) % set->size;
+ while(!is_kmer_entity_null(set->flags, hc)){ hc ++; if(hc == set->size) hc = 0; }
+ clear_kmer_entity_null(set->flags, hc);
+ if(hc < size && exists_kmer_entity(flags, hc)){
+ tmp = key;
+ key = set->array[hc];
+ set->array[hc] = tmp;
+ set_kmer_entity_del(flags, hc);
+ } else {
+ set->array[hc] = key;
+ break;
+ }
+ }
+ }
+ // the old flag array is no longer needed
+ free(flags);
+}
+
+/*
+ * Insert `seq` with its neighbour bases, or update it when already present.
+ * Returns 0 after creating a new entry and 1 after updating an existing one
+ * (whose `single` flag is cleared); *kmer_p receives the entry either way.
+ * Capacity for one more entry is ensured before probing.
+ */
+PUBLIC_FUNC int put_kmerset(KmerSet *set, ubyte8 seq, ubyte left, ubyte right, kmer_t **kmer_p){
+    ubyte8 slot;
+
+    encap_kmerset(set, 1);
+
+    for(slot = seq % set->size; ; ){
+        if(is_kmer_entity_null(set->flags, slot)){
+            /* free slot: the k-mer is new */
+            clear_kmer_entity_null(set->flags, slot);
+            set_new_kmer(set->array + slot, seq, left, right);
+            set->count ++;
+            *kmer_p = set->array + slot;
+            return 0;
+        }
+
+        if(get_kmer_seq(set->array[slot]) == seq){
+            /* seen before: add coverage */
+            update_kmer(set->array + slot, left, right);
+            set->array[slot].single = 0;
+            *kmer_p = set->array + slot;
+            return 1;
+        }
+
+        slot ++;
+        if(slot == set->size)
+            slot = 0;
+    }
+}
+
+/* Number of entries currently stored in the set. */
+PUBLIC_FUNC byte8 count_kmerset(KmerSet *set){
+    return set->count;
+}
+
+/* Rewind the set's built-in iterator to the first slot. */
+PUBLIC_FUNC static inline void reset_iter_kmerset(KmerSet *set){
+    set->iter_ptr = 0;
+}
+
+/*
+ * Advance the built-in iterator to the next occupied slot. Returns 1 with
+ * *rs pointing at that entry (the iterator is left just past it), or 0 once
+ * the end of the table is reached.
+ */
+PUBLIC_FUNC static inline ubyte8 iter_kmerset(KmerSet *set, kmer_t **rs){
+    for(; set->iter_ptr < set->size; set->iter_ptr ++){
+        if(is_kmer_entity_null(set->flags, set->iter_ptr))
+            continue;
+
+        *rs = set->array + set->iter_ptr;
+        set->iter_ptr ++;
+        return 1;
+    }
+    return 0;
+}
+
+/* Release the flag array, the entry array and the set descriptor. */
+PUBLIC_FUNC void free_kmerset(KmerSet *set){
+    free(set->flags);
+    free(set->array);
+    free(set);
+}
+
+PUBLIC_FUNC void free_Sets(KmerSet **sets,int num){
+ int i;
+ for(i=0;i0)
+ num++;
+ }
+ return num;
+}
+
+/* Count how many of the four right-neighbour bases have coverage above 0. */
+int count_branch2next(kmer_t *node)
+{
+    int base;
+    int branches = 0;
+
+    for(base=0;base<4;base++){
+        if(get_kmer_right_cov(*node,base)>0){
+            branches++;
+        }
+    }
+
+    return branches;
+}
+
+/*
+ * Remove the link from `node` to its predecessor over base `ch`.
+ * With `smaller` set the left coverage of `ch` is zeroed; otherwise the
+ * right coverage of the complement of `ch` is zeroed.
+ */
+void dislink2prevUncertain(kmer_t *node,char ch,boolean smaller)
+{
+    if(!smaller){
+        set_kmer_right_cov(*node,int_comp(ch),0);
+    }
+    else{
+        set_kmer_left_cov(*node,ch,0);
+    }
+}
+
+/*
+ * Remove the link from `node` to its successor over base `ch`.
+ * With `smaller` set the right coverage of `ch` is zeroed; otherwise the
+ * left coverage of the complement of `ch` is zeroed.
+ */
+void dislink2nextUncertain(kmer_t *node,char ch,boolean smaller)
+{
+    if(!smaller){
+        set_kmer_left_cov(*node,int_comp(ch),0);
+    }
+    else{
+        set_kmer_right_cov(*node,ch,0);
+    }
+}
+
+
+
+
+
+
+////////////////// functions for spaced seed Kmer hash
+
+static const spcKmer empty_spckmer = {0, NULL, 1};
+
+/*
+ * Attach one more (spaced bases, large k-mer) record to an existing spaced
+ * k-mer. The new record is linked in directly behind the head record
+ * mer->start, and spaced_base_num is incremented. Always returns 0.
+ * mer->start must be non-NULL (set_new_spckmer guarantees a head record).
+ * Aborts with the file's usual out-of-memory message when malloc fails;
+ * the original dereferenced the unchecked result.
+ */
+static inline int update_spckmer(spcKmer *mer, ubyte2 s_bases, kmer_t *node){
+    spcBase *head = mer->start;
+    spcBase *newSpcBase;
+
+    newSpcBase = (spcBase*)malloc(sizeof(spcBase));
+    if(newSpcBase == NULL){
+        fprintf(stderr, "-- Out of memory --\n");
+        abort();
+    }
+    newSpcBase->spaced_bases = s_bases;
+    newSpcBase->large_kmer = node;
+
+    // insert right after the head record
+    newSpcBase->next = head->next;
+    head->next = newSpcBase;
+
+    mer->spaced_base_num++;
+
+    return 0;
+}
+
+/*
+ * Initialise a slot for a spaced k-mer seen for the first time: start from
+ * empty_spckmer, store the spaced sequence, and create the head record that
+ * carries the spaced bases and a pointer to the originating large k-mer.
+ * Aborts with the file's usual out-of-memory message when malloc fails;
+ * the original dereferenced the unchecked result.
+ */
+static inline void set_new_spckmer(spcKmer *mer, Kmer spc_kmer, ubyte2 s_bases, kmer_t *node){
+    spcBase *newSpcBase;
+
+    *mer = empty_spckmer;
+    set_kmer_seq(*mer, spc_kmer);
+
+    newSpcBase = (spcBase*)malloc(sizeof(spcBase));
+    if(newSpcBase == NULL){
+        fprintf(stderr, "-- Out of memory --\n");
+        abort();
+    }
+    newSpcBase->spaced_bases = s_bases;
+    newSpcBase->large_kmer = node;
+    newSpcBase->next = NULL;
+
+    mer->start = newSpcBase;
+}
+
+/*
+ * Create an empty spaced-k-mer hash set.
+ *
+ * init_size:   requested slot count; values below 3 become 3, anything else
+ *              is rounded by find_next_prime_kh().
+ * load_factor: fill ratio that triggers growth; values <= 0 are replaced by
+ *              0.25 and values >= 1 by 0.75.
+ * Returns the new set; aborts with the file's usual message when memory
+ * cannot be obtained.
+ *
+ * Fix: set->max is now derived from the *clamped* load factor. Before, an
+ * argument >= 1 produced max >= size, so the table could fill completely
+ * before the first resize, and a lookup of an absent key in a full table
+ * never terminates. The allocations are also checked now.
+ */
+PUBLIC_FUNC spcKmerSet* init_spckmerset(ubyte8 init_size, float load_factor){
+    spcKmerSet *set;
+    ubyte8 flagBytes;
+
+    if(init_size < 3) init_size = 3;
+    else init_size = find_next_prime_kh(init_size);
+
+    // sanitise the load factor before anything is computed from it
+    if(load_factor <= 0) load_factor = 0.25f;
+    else if(load_factor >= 1) load_factor = 0.75f;
+
+    set = (spcKmerSet*)malloc(sizeof(spcKmerSet));
+    if(set == NULL){
+        fprintf(stderr, "-- Out of memory --\n");
+        abort();
+    }
+    set->size = init_size;
+    set->count = 0;
+    set->load_factor = load_factor;
+    set->max = set->size * load_factor;
+    //set->iter_ptr = 0;
+
+    // same flag array size and 0x55 fill as used when the table is regrown
+    flagBytes = (set->size + 15) / 16 * 4;
+    set->array = calloc(set->size, sizeof(spcKmer));
+    set->flags = malloc(flagBytes);
+    if(set->array == NULL || set->flags == NULL){
+        fprintf(stderr, "-- Out of memory --\n");
+        abort();
+    }
+    memset(set->flags, 0x55, flagBytes);
+    return set;
+}
+
+/*
+ * Look a spaced k-mer up. Returns 1 and stores the entry's address in *rs
+ * when it is present; returns 0 (leaving *rs untouched) as soon as an empty
+ * slot is reached on the probe sequence.
+ */
+PUBLIC_FUNC int search_spckmerset(spcKmerSet *set, ubyte8 seq, spcKmer **rs){
+    ubyte8 slot = seq % set->size;
+
+    for(;;){
+        if(is_kmer_entity_null(set->flags, slot))
+            return 0;
+
+        if(get_kmer_seq(set->array[slot]) == seq){
+            *rs = set->array + slot;
+            return 1;
+        }
+
+        slot ++;
+        if(slot == set->size)
+            slot = 0;
+    }
+}
+
+/*
+ * Make room for `num` more spaced k-mers. Nothing happens while count+num
+ * stays within set->max; otherwise the table is grown to a prime size for
+ * which size*load_factor covers count+num, and the existing entries are
+ * rehashed in place inside the realloc'ed array. Aborts when realloc fails.
+ * NOTE(review): the header of the rehash loop lost text between '<' and '>'
+ * during extraction; presumably it iterates i over the old size and skips
+ * slots that hold no live entry -- confirm against the original sources.
+ */
+PROTECTED_FUNC static inline void encap_spckmerset(spcKmerSet *set, ubyte8 num){
+ ubyte4 *flags, *f;
+ ubyte8 i, n, size, hc;
+ spcKmer key, tmp;
+ if(set->count + num <= set->max) return;
+
+ n = set->size;
+ // double while small, grow linearly once large, always landing on a prime
+ do{
+ if(n < 0xFFFFFFFU)
+ n <<= 1;
+ else
+ n += 0xFFFFFFU;
+ n = find_next_prime_kh(n);
+ } while(n * set->load_factor < set->count + num);
+
+ set->array = realloc(set->array, n * sizeof(spcKmer));
+ if(set->array == NULL){
+ fprintf(stderr, "-- Out of memory --\n");
+ abort();
+ }
+ // fresh flag array for the new size, filled with the 0x55 pattern also used by init_spckmerset
+ flags = malloc((n+15)/16 * 4);
+ memset(flags, 0x55, (n+15)/16 * 4);
+ size = set->size;
+ set->size = n;
+ set->max = n * set->load_factor;
+ // swap: set->flags now describes the new layout, `flags` the old one
+ f = set->flags;
+ set->flags = flags;
+ flags = f;
+ // NOTE(review): truncated loop header, see function comment
+ for(i=0;iarray[i];
+ set_kmer_entity_del(flags, i);
+ // place `key`; if its new slot still holds an unmoved old entry, evict that entry and place it next
+ while(1){
+ hc = get_kmer_seq(key) % set->size;
+ while(!is_kmer_entity_null(set->flags, hc)){ hc ++; if(hc == set->size) hc = 0; }
+ clear_kmer_entity_null(set->flags, hc);
+ if(hc < size && exists_kmer_entity(flags, hc)){
+ tmp = key;
+ key = set->array[hc];
+ set->array[hc] = tmp;
+ set_kmer_entity_del(flags, hc);
+ } else {
+ set->array[hc] = key;
+ break;
+ }
+ }
+ }
+ // the old flag array is no longer needed
+ free(flags);
+}
+
+/*
+ * Record that `node` contains the spaced k-mer `spc_kmer` with the given
+ * spaced bases. A new entry is created when the spaced k-mer is unknown
+ * (returns 0); otherwise the record is appended through update_spckmer()
+ * and its return value is passed on. Capacity for one more entry is ensured
+ * before probing.
+ */
+PUBLIC_FUNC int put_spckmerset(spcKmerSet *set, Kmer spc_kmer, ubyte2 spaced_bases, kmer_t *node){
+    ubyte8 slot;
+
+    encap_spckmerset(set, 1);
+
+    for(slot = spc_kmer % set->size; ; ){
+        if(is_kmer_entity_null(set->flags, slot)){
+            /* free slot: first occurrence of this spaced k-mer */
+            clear_kmer_entity_null(set->flags, slot);
+            set_new_spckmer(set->array + slot, spc_kmer, spaced_bases, node);
+            set->count ++;
+            return 0;
+        }
+
+        if(get_kmer_seq(set->array[slot]) == spc_kmer){
+            /* already known: chain another record */
+            return update_spckmer(set->array + slot, spaced_bases, node);
+        }
+
+        slot ++;
+        if(slot == set->size)
+            slot = 0;
+    }
+}
+
+/*
+ * Index every live k-mer of `set` in the spaced-seed hash `spaced_kset`.
+ *
+ * For each occupied slot whose entry is not flagged `deleted`, the k-mer is
+ * split with fixed bit masks into
+ *   spc_kmer  - the seed bits, used as the hash key, and
+ *   spc_bases - the remaining ("spaced") bits, stored with the record,
+ * and both are handed to put_spckmerset() together with a pointer to the
+ * original entry. A non-zero return from put_spckmerset() is reported on
+ * stderr.
+ *
+ * Changes: the loop condition, damaged to `isize` in this patch text, is
+ * restored to `i<set->size`; the diagnostic prints the flag with %d instead
+ * of %c (it is a small integer, not a character); the unused counter j is
+ * gone.
+ */
+PUBLIC_FUNC void buildSpcKmerSet(KmerSet *set, spcKmerSet *spaced_kset)
+{
+    boolean spcFlag;
+    Kmer buff_kmer, spc_kmer;
+    ubyte2 spc_bases;
+    ubyte8 i;
+
+    for(i=0;i<set->size;i++)
+    {
+        if(is_kmer_entity_null(set->flags, i))
+            continue;
+        if(set->array[i].deleted == 1)     //repeat kmer: not indexed
+            continue;
+
+        buff_kmer = get_kmer_seq(set->array[i]);
+
+        //seed part of the kmer
+        //0xFFFFFFF00 = 1111 11111111 11111111 11111111 00000000
+        //       0xC0 = 0000 00000000 00000000 00000000 11000000
+        //       0x3C = 0000 00000000 00000000 00000000 00111100
+        //        0x3 = 0000 00000000 00000000 00000000 00000011
+        spc_kmer = ((buff_kmer>>14)&0xFFFFFFF00) | ((buff_kmer>>12)&0xC0) | ((buff_kmer>>10)&0x3C) | ((buff_kmer>>6)&0x3);
+
+        //bases left out of the seed
+        //0x3000 = 110000 00000000
+        // 0xC00 = 001100 00000000
+        // 0x3C0 = 000011 11000000
+        //  0x3F = 000000 00111111
+        spc_bases = ((buff_kmer>>8)&0x3000) | ((buff_kmer>>6)&0xC00) | ((buff_kmer>>2)&0x3C0) | (buff_kmer&0x3F);
+
+        //put seed and spaced bases in the spaced_kmer hash
+        spcFlag = put_spckmerset(spaced_kset, spc_kmer, spc_bases, set->array+i);
+        if(spcFlag!=0)
+            fprintf(stderr, "flag error: %d\tkmer exists: %llu %hu\n", (int)spcFlag, spc_kmer, spc_bases);
+    }
+}
diff --git a/fusion/orderContig.c b/fusion/orderContig.c
new file mode 100755
index 0000000..b81fa18
--- /dev/null
+++ b/fusion/orderContig.c
@@ -0,0 +1,3485 @@
+#include "stdinc.h"
+#include "newhash.h"
+#include "extfunc.h"
+#include "extvab.h"
+#include "dfibHeap.h"
+#include "fibHeap.h"
+#include "darray.h"
+
+#define CNBLOCKSIZE 10000
+#define MAXC 10000
+// capacity of the downstreamCTG/upstreamCTG path buffers declared below
+#define MAXCinBetween 200
+
+// capacity of the per-subgraph node arrays declared below
+#define MaxNodeInSub 10000
+#define GapLowerBound -2000
+#define GapUpperBound 300000
+
+//static boolean static_f=0;
+
+
+static int gapCounter;
+// both counters are reset and then incremented by downSlide()
+static int orienCounter;
+static int throughCounter;
+
+// solidArray receives the contig path assembled by catUsDsContig()
+static DARRAY *solidArray;
+static DARRAY *tempArray;
+
+// number of contigs currently stored in solidArray
+static int solidCounter;
+
+static CTGinHEAP ctg4heapArray[MaxNodeInSub+1]; // index in this array are put to heaps, start from 1
+// contigs of the subgraph being examined and the distance recorded for each
+static unsigned int nodesInSub[MaxNodeInSub];
+static int nodeDistance[MaxNodeInSub];
+static int nodeCounter;
+
+// nodesInSubInOrder is filled by checkUnique() in the order nodes leave the heap
+static unsigned int nodesInSubInOrder[MaxNodeInSub];
+static int nodeDistanceInOrder[MaxNodeInSub];
+
+static DARRAY *scaf3,*scaf5;
+static DARRAY *gap3,*gap5;
+
+// contig paths collected by linearC2C(); the two counters are their fill levels
+static unsigned int downstreamCTG[MAXCinBetween];
+static unsigned int upstreamCTG[MAXCinBetween];
+static int dsCtgCounter;
+static int usCtgCounter;
+
+static CONNECT *checkConnect(unsigned int from_c,unsigned int to_c);
+static int maskPuzzle(int num_connect,unsigned int contigLen);
+static void freezing();
+static boolean checkOverlapInBetween(double tolerance);
+static int setConnectDelete(unsigned int from_c,unsigned int to_c,char flag,boolean cleanBinding);
+static int setConnectWP(unsigned int from_c,unsigned int to_c,char flag);
+
+static void general_linearization(boolean strict);
+static void debugging2();
+static void smallScaf();
+static void detectBreakScaf();
+static boolean checkSimple(DARRAY *ctgArray,int count);
+static void checkCircle();
+
+//pick the single downward connection of ctg that takes part in a scaffold binding
+//preference: the only one with nextInScaf, else the only one with prevInScaf,
+//else the only one with singleInScaf; NULL when no such unique connection exists
+static CONNECT *getBindCnt(unsigned int ctg)
+{
+	CONNECT *cur;
+	CONNECT *withNext=NULL;
+	CONNECT *withPrev=NULL;
+	CONNECT *withSingle=NULL;
+	int nNext=0,nPrev=0,nSingle=0;
+
+	//one pass over the list, remembering the last connection seen of each kind
+	for(cur=contig_array[ctg].downwardConnect;cur;cur=cur->next){
+		if(cur->nextInScaf){
+			withNext = cur;
+			nNext++;
+		}
+		if(cur->prevInScaf){
+			withPrev = cur;
+			nPrev++;
+		}
+		if(cur->singleInScaf){
+			withSingle = cur;
+			nSingle++;
+		}
+	}
+
+	if(nNext==1)
+		return withNext;
+	if(nNext!=0)	//several candidates with nextInScaf: ambiguous
+		return NULL;
+	if(nPrev==1)
+		return withPrev;
+	if(nPrev==0&&nSingle==1)
+		return withSingle;
+	return NULL;
+}
+
+/*
+ * Retire the connection sourceStart -> originCnt->contigID (and its twin)
+ * and re-create it as targetStart -> targetStop with the given gap, plus the
+ * twin connection balTargetStop -> balTargetStart.  Both new connections are
+ * tagged as inherited.
+ *
+ * NOTE(review): the guard and the first add1Connect() call below were
+ * restored from a line whose middle part was lost (it read
+ * "if(gapweight,1);"); confirm against the upstream source.
+ */
+static void createAnalogousCnt(unsigned int sourceStart,
+		CONNECT *originCnt, int gap,
+		unsigned int targetStart,unsigned int targetStop)
+{
+	CONNECT *temp_cnt;
+	unsigned int balTargetStart=getTwinCtg(targetStart);
+	unsigned int balTargetStop=getTwinCtg(targetStop);
+
+	unsigned int balSourceStart = getTwinCtg(sourceStart);
+	unsigned int balSourceStop = getTwinCtg(originCnt->contigID);
+
+	//delete the original connection and, when it exists, its twin
+	originCnt->deleted = 1;
+	temp_cnt = getCntBetween(balSourceStop,balSourceStart);
+	if(temp_cnt)
+		temp_cnt->deleted = 1;
+
+	//a gap below the lower bound is only counted, no replacement is created
+	if(gap<GapLowerBound){
+		gapCounter++;
+		return;
+	}
+
+	temp_cnt = add1Connect(targetStart,targetStop,gap,originCnt->weight,1);
+	if(temp_cnt)
+		temp_cnt->inherit = 1;
+	temp_cnt = add1Connect(balTargetStop,balTargetStart,gap,originCnt->weight,1);
+	if(temp_cnt)
+		temp_cnt->inherit = 1;
+}
+// increase #long_pe_support for a connect
+// Adds `weight` to the maxGap field (capped at 1000) of every bound connection
+// met while walking from fromCtg towards toCtg, then does the same on the twin
+// path from twin(toCtg) towards twin(fromCtg).  Nothing is changed when the two
+// contigs carry different from_vt/to_vt values or are not in ascending
+// indexInScaf order; a warning is printed instead.
+static void add1LongPEcov(unsigned int fromCtg,unsigned int toCtg,int weight)
+{
+ //check if they are on the same scaff
+ if(contig_array[fromCtg].from_vt!=contig_array[toCtg].from_vt ||
+ contig_array[fromCtg].to_vt!=contig_array[toCtg].to_vt){
+ printf("Warning from add1LongPEcov: contig %d and %d not on the same scaffold\n",
+ fromCtg,toCtg);
+ return;
+ }
+ if(contig_array[fromCtg].indexInScaf>=contig_array[toCtg].indexInScaf){
+ printf("Warning from add1LongPEcov: wrong about order between contig %d and %d\n",
+ fromCtg,toCtg);
+ return;
+ }
+ CONNECT *bindCnt;
+ unsigned int prevCtg = fromCtg;
+ bindCnt = getBindCnt(fromCtg);
+ // forward walk: follow nextInScaf until the connection that lands on toCtg
+ while(bindCnt){
+ if(bindCnt->maxGap + weight<=1000)
+ bindCnt->maxGap += weight;
+ else
+ bindCnt->maxGap = 1000;
+
+ // trace output, only produced when both contig ids are 0
+ if(fromCtg==0&&toCtg==0)
+ printf("link (%d %d ) covered by link (%d %d), wt %d\n",
+ prevCtg,bindCnt->contigID,fromCtg,toCtg,weight);
+ if(bindCnt->contigID==toCtg)
+ break;
+ prevCtg = bindCnt->contigID;
+ bindCnt = bindCnt->nextInScaf;
+ }
+ // twin walk: from twin(toCtg) until the connection that lands on twin(fromCtg)
+ unsigned int bal_fc = getTwinCtg(fromCtg);
+ unsigned int bal_tc = getTwinCtg(toCtg);
+ bindCnt = getBindCnt(bal_tc);
+ prevCtg = bal_tc;
+ while(bindCnt){
+ if(bindCnt->maxGap + weight<=1000)
+ bindCnt->maxGap += weight;
+ else
+ bindCnt->maxGap = 1000;
+ if(fromCtg==0&&toCtg==0)
+ printf("link (%d %d ) covered by link (%d %d), wt %d\n",
+ prevCtg,bindCnt->contigID,fromCtg,toCtg,weight);
+ if(bindCnt->contigID==bal_fc)
+ return;
+ prevCtg = bindCnt->contigID;
+ bindCnt = bindCnt->nextInScaf;
+ }
+ // only reached when the twin walk ran out before meeting twin(fromCtg)
+ printf("Warning from add1LongPEcov: not reach the end (%d %d) (B)\n",bal_tc,bal_fc);
+}
+
+// for long pair ends, move the connections along scaffolds established by shorter pair ends till reach the ends
+//
+// For every unmasked contig i that has a bound connection, each free (not
+// bound, not inherited, not deleted, not masked) connection i -> target is
+// either deleted (both contigs share from_vt, or following the existing
+// bindings from target or twin(target) leads back to i or twin(i)) or
+// replaced via createAnalogousCnt() by a connection between the last unmasked
+// contig downstream of i and the outermost unmasked contig upstream of target,
+// with the gap shortened by the lengths slid over.
+// orienCounter and throughCounter are reset and count the deleted connections.
+static void downSlide()
+{
+	fprintf(stderr,"[%s]entering this function.\n",__FUNCTION__);
+	int len=0,gap;
+	unsigned int i;
+	CONNECT *ite_cnt,*bindCnt,*temp_cnt;
+	unsigned int bottomCtg,topCtg,bal_i;
+	unsigned int targetCtg,bal_target;
+	boolean getThrough,orienConflict;
+	int slideLen,slideLen2;
+
+	orienCounter = throughCounter = 0;
+	for(i=1;i<=num_ctg;i++){
+		if(contig_array[i].mask||!contig_array[i].downwardConnect)
+			continue;
+		bindCnt = getBindCnt(i);
+		if(!bindCnt)
+			continue;
+		bal_i = getTwinCtg(i);
+		len = slideLen = 0;
+		bottomCtg = i;
+
+		//find the last unmasked contig in this binding
+		while(bindCnt->nextInScaf){
+			len += bindCnt->gapLen + contig_array[bindCnt->contigID].length;
+			if(contig_array[bindCnt->contigID].mask==0){
+				bottomCtg = bindCnt->contigID;
+				slideLen = len;
+			}
+			bindCnt = bindCnt->nextInScaf;
+		}
+		len += bindCnt->gapLen + contig_array[bindCnt->contigID].length;
+
+		if(contig_array[bindCnt->contigID].mask==0||bottomCtg==0){
+			bottomCtg = bindCnt->contigID;
+			slideLen = len;
+		}
+		//check each connetion from long pair ends
+		ite_cnt = contig_array[i].downwardConnect;
+		while(ite_cnt){
+			if(ite_cnt->deleted||ite_cnt->mask||ite_cnt->singleInScaf
+				||ite_cnt->nextInScaf||ite_cnt->prevInScaf||ite_cnt->inherit){
+				ite_cnt = ite_cnt->next;
+				continue;
+			}
+			targetCtg = ite_cnt->contigID;
+			//computed here because the warning below prints it; it used to be
+			//assigned only after that warning and was read uninitialized
+			bal_target = getTwinCtg(targetCtg);
+			if(contig_array[i].from_vt==contig_array[targetCtg].from_vt){ // on the same scaff
+				if(contig_array[i].indexInScaf>contig_array[targetCtg].indexInScaf)
+					orienCounter++;
+				else
+					throughCounter++;
+
+				setConnectDelete(i,ite_cnt->contigID,1,0);
+				ite_cnt = ite_cnt->next;
+				continue;
+
+			}
+			//check if this connection conflicts with previous scaffold orientationally
+			temp_cnt = getBindCnt(targetCtg);
+			orienConflict = 0;
+			if(temp_cnt){
+				while(temp_cnt->nextInScaf){
+					if(temp_cnt->contigID==i){
+						orienConflict = 1;
+						printf("Warning from downSlide: still on the same scaff: %d and %d\n"
+							,i,targetCtg);
+						printf("on scaff %d and %d\n",
+							contig_array[i].from_vt,contig_array[targetCtg].from_vt);
+						printf("on bal_scaff %d and %d\n",
+							contig_array[bal_target].to_vt,contig_array[bal_i].to_vt);
+						break;
+					}
+					temp_cnt = temp_cnt->nextInScaf;
+				}
+				if(temp_cnt->contigID==i)
+					orienConflict = 1;
+			}
+			if(orienConflict){
+				orienCounter++;
+				setConnectDelete(i,ite_cnt->contigID,1,0);
+				ite_cnt = ite_cnt->next;
+				continue;
+			}
+			//find the most top contig along previous scaffold starting with the target contig of this connection
+			slideLen2 = 0;
+			if(contig_array[targetCtg].mask==0){
+				topCtg = bal_target;
+			}else{
+				topCtg = 0;
+			}
+
+			temp_cnt = getBindCnt(bal_target);
+			getThrough = len = 0;
+			if(temp_cnt){
+				//find the last contig in this binding
+				while(temp_cnt->nextInScaf){
+					//check if this route reaches bal_i
+					if(temp_cnt->contigID==bal_i){
+						printf("Warning from downSlide: (B) still on the same scaff: %d and %d (%d and %d)\n",
+							i,targetCtg,bal_target,bal_i);
+						printf("on scaff %d and %d\n",
+							contig_array[i].from_vt,contig_array[targetCtg].from_vt);
+						printf("on bal_scaff %d and %d\n",
+							contig_array[bal_target].to_vt,contig_array[bal_i].to_vt);
+						getThrough = 1;
+						break;
+					}
+					len += temp_cnt->gapLen + contig_array[temp_cnt->contigID].length;
+					if(contig_array[temp_cnt->contigID].mask==0){
+						topCtg = temp_cnt->contigID;
+						slideLen2 = len;
+					}
+					temp_cnt = temp_cnt->nextInScaf;
+				}
+				len += temp_cnt->gapLen + contig_array[temp_cnt->contigID].length;
+				if(contig_array[temp_cnt->contigID].mask==0||topCtg==0){
+					topCtg = temp_cnt->contigID;
+					slideLen2 = len;
+				}
+				if(temp_cnt->contigID==bal_i)
+					getThrough = 1;
+				else
+					topCtg = getTwinCtg(topCtg);	//back to the forward strand
+			}else
+				topCtg = targetCtg;
+
+			if(getThrough){
+				throughCounter++;
+				setConnectDelete(i,ite_cnt->contigID,1,0);
+				ite_cnt = ite_cnt->next;
+				continue;
+			}
+			//add a connection between bottomCtg and topCtg
+			gap = ite_cnt->gapLen - slideLen - slideLen2;
+			if(bottomCtg!=topCtg&&!(i==bottomCtg&&targetCtg==topCtg)){
+				createAnalogousCnt(i,ite_cnt,gap,bottomCtg,topCtg);
+				if(contig_array[bottomCtg].mask||contig_array[topCtg].mask)
+					printf("downSlide to masked contig\n");
+			}
+			ite_cnt = ite_cnt->next;
+		} //for each connect
+	} // for each contig
+	//printf("downSliding is done...orienConflict %d, fall inside %d\n",
+	//	orienCounter,throughCounter);
+}
+
+//set cnt->nextInScaf to nextCnt
+//clearing the link (nextCnt==NULL) always succeeds; attaching a successor is
+//refused when either connection is masked or deleted
+//returns 1 when the field was written, 0 otherwise
+static boolean setNextInScaf(CONNECT *cnt, CONNECT *nextCnt)
+{
+	if(cnt==NULL){
+		printf("setNextInScaf: empty pointer\n");
+		return 0;
+	}
+	if(nextCnt!=NULL){
+		if(cnt->mask||cnt->deleted){
+			printf("setNextInScaf: cnt is masked or deleted\n");
+			return 0;
+		}
+		if(nextCnt->deleted||nextCnt->mask){
+			printf("setNextInScaf: nextCnt is masked or deleted\n");
+			return 0;
+		}
+	}
+	cnt->nextInScaf = nextCnt;
+	return 1;
+}
+
+//set cnt->prevInScaf to flag
+//clearing (flag==0) always succeeds; setting it is refused on a masked or
+//deleted connection
+//returns 1 when the field was written, 0 otherwise
+static boolean setPrevInScaf(CONNECT *cnt, boolean flag)
+{
+	if(cnt==NULL){
+		printf("setPrevInScaf: empty pointer\n");
+		return 0;
+	}
+	if(flag&&(cnt->mask||cnt->deleted)){
+		printf("setPrevInScaf: cnt is masked or deleted\n");
+		return 0;
+	}
+	cnt->prevInScaf = flag;
+	return 1;
+}
+
+/*
+connect A is upstream to B, replace A with C
+from_c
+ > branch_c - to_c
+from_c_new
+
+origin is the connection into branch_c that is currently bound to
+branch_c -> to_c.  The binding is moved onto the valid connection
+from_c_new -> branch_c, and the twin-strand binding is rewired to match.
+Does nothing when origin is NULL or has no nextInScaf.
+*/
+static void substitueUSinScaf(CONNECT *origin, unsigned int from_c_new)
+{
+ if(!origin||!origin->nextInScaf)
+ return;
+
+ unsigned int branch_c, to_c;
+ unsigned int bal_branch_c, bal_to_c;
+ unsigned int bal_from_c_new = getTwinCtg(from_c_new);
+ CONNECT *bal_origin,*bal_nextCNT,*prevCNT,*bal_prevCNT;
+
+
+ branch_c = origin->contigID;
+ to_c = origin->nextInScaf->contigID;
+ bal_branch_c = getTwinCtg(branch_c);
+ bal_to_c = getTwinCtg(to_c);
+
+ // prevCNT: the replacement upstream connection; bal_nextCNT: twin of branch_c -> to_c
+ prevCNT = checkConnect(from_c_new,branch_c);
+ bal_nextCNT = checkConnect(bal_to_c,bal_branch_c);
+ if(!bal_nextCNT){
+ printf("substitueUSinScaf: no connect between %d and %d\n",bal_to_c,bal_branch_c);
+ return;
+ }
+ // remember the twin of origin before the twin chain is rewired
+ bal_origin = bal_nextCNT->nextInScaf;
+ bal_prevCNT = checkConnect(bal_branch_c,bal_from_c_new);
+
+ // attach the new connections on both strands
+ // (the setters print a message and do nothing when handed a NULL connection)
+ setPrevInScaf(bal_nextCNT->nextInScaf,0);
+ setNextInScaf(prevCNT,origin->nextInScaf);
+ setNextInScaf(bal_nextCNT,bal_prevCNT);
+ setPrevInScaf(bal_prevCNT,1);
+
+ // detach the old connection and its twin
+ setNextInScaf(origin,NULL);
+ setPrevInScaf(bal_origin,0);
+}
+
+/*
+connect B is downstream to C, replace B with A
+ to_c
+from_c - branch_c <
+ to_c_new
+
+origin is the connection branch_c -> to_c that is currently bound behind
+from_c -> branch_c.  The binding is moved onto the valid connection
+branch_c -> to_c_new, and the twin-strand binding is rewired to match.
+Does nothing when origin is NULL or has no prevInScaf.
+*/
+static void substitueDSinScaf(CONNECT *origin, unsigned int branch_c, unsigned int to_c_new)
+{
+ if(!origin||!origin->prevInScaf)
+ return;
+
+ unsigned int to_c;
+ unsigned int bal_branch_c, bal_to_c,bal_to_c_new;
+ unsigned int from_c,bal_from_c;
+ CONNECT *bal_origin,*prevCNT,*bal_prevCNT;
+ CONNECT *nextCNT,*bal_nextCNT;
+
+
+ to_c = origin->contigID;
+ bal_branch_c = getTwinCtg(branch_c);
+ bal_to_c = getTwinCtg(to_c);
+ bal_origin = getCntBetween(bal_to_c,bal_branch_c);
+ if(!bal_origin){
+ printf("substitueDSinScaf: no connect between %d and %d\n",bal_to_c,bal_branch_c);
+ return;
+ }
+ // from_c is recovered through the twin chain
+ // NOTE(review): bal_origin->nextInScaf is dereferenced without a NULL check;
+ // this relies on the twin of a connection with prevInScaf always having a
+ // nextInScaf -- confirm that invariant holds
+ bal_from_c = bal_origin->nextInScaf->contigID;
+ from_c = getTwinCtg(bal_from_c);
+ bal_to_c_new = getTwinCtg(to_c_new);
+
+ // forward strand: from_c -> branch_c now continues with branch_c -> to_c_new
+ prevCNT = checkConnect(from_c,branch_c);
+ nextCNT = checkConnect(branch_c,to_c_new);
+ setNextInScaf(prevCNT,nextCNT);
+ setPrevInScaf(nextCNT,1);
+
+ // twin strand: twin(to_c_new) -> twin(branch_c) now continues towards twin(from_c)
+ bal_nextCNT = checkConnect(bal_to_c_new,bal_branch_c);
+ bal_prevCNT = checkConnect(bal_branch_c,bal_from_c);
+
+ setNextInScaf(bal_nextCNT,bal_prevCNT);
+ // detach the old connection and its twin
+ setPrevInScaf(origin,0);
+ setNextInScaf(bal_origin,NULL);
+}
+
+//count the usable outgoing connections of ctg
+//returns 1 straight away when preCNT is bound to a successor (nextInScaf set);
+//otherwise the number of downward connections that are neither deleted nor masked
+static int validConnect(unsigned int ctg, CONNECT *preCNT)
+{
+	CONNECT *arc;
+	int live=0;
+
+	if(preCNT&&preCNT->nextInScaf)
+		return 1;
+
+	for(arc=contig_array[ctg].downwardConnect;arc;arc=arc->next){
+		if(arc->deleted||arc->mask)
+			continue;
+		live++;
+	}
+	return live;
+}
+
+/*
+ * Return the connection that leads from ctg to its unique successor.
+ *
+ * ctg       - contig to step away from
+ * preCNT    - connection by which ctg was reached (may be NULL)
+ * exception - set to 1 when preCNT is bound to a successor but the direct
+ *             connection ctg -> successor is masked or deleted, else 0
+ *
+ * When preCNT carries a usable binding, preCNT->nextInScaf is returned.
+ * Otherwise the only valid downward connection of ctg is returned, provided
+ * the twin of ctg has at most one valid connection itself.  NULL means there
+ * is no unambiguous successor.
+ */
+static CONNECT *getNextContig(unsigned int ctg, CONNECT *preCNT, boolean *exception)
+{
+	CONNECT *cn_temp,*retCNT=NULL;
+	int count=0,valid_in;
+	unsigned int nextCtg,bal_ctg;
+
+	*exception = 0;
+	if(preCNT&&preCNT->nextInScaf){
+		if(preCNT->contigID!=ctg)
+			printf("pre cnt does not lead to %d\n",ctg);
+		nextCtg = preCNT->nextInScaf->contigID;
+		cn_temp = getCntBetween(ctg,nextCtg);
+		if(cn_temp&&(cn_temp->mask||cn_temp->deleted)){
+			printf("getNextContig: arc(%d %d) twin (%d %d) with mask %d deleted %d\n"
+				,ctg,nextCtg,getTwinCtg(nextCtg),getTwinCtg(ctg)
+				,cn_temp->mask,cn_temp->deleted);
+			if(!cn_temp->prevInScaf)
+				printf("not even has a prevInScaf\n");
+			cn_temp = getCntBetween(getTwinCtg(nextCtg),
+				getTwinCtg(ctg));
+			//the twin lookup may come back empty; a missing twin has no
+			//nextInScaf either, so report it the same way instead of crashing
+			if(!cn_temp||!cn_temp->nextInScaf)
+				printf("its twin cnt not has a nextInScaf\n");
+			fflush(stdout);
+			*exception = 1;
+			//fall through to the generic search below
+		}else
+			return preCNT->nextInScaf;
+	}
+
+	//more than one valid connection on the twin side means ctg is a branch point
+	bal_ctg = getTwinCtg(ctg);
+	valid_in = validConnect(bal_ctg,NULL);
+	if(valid_in>1)
+		return NULL;
+	if(!contig_array[ctg].downwardConnect)
+		return NULL;
+	//accept the successor only when exactly one valid connection leaves ctg
+	cn_temp = contig_array[ctg].downwardConnect;
+	while(cn_temp){
+		if(cn_temp->mask||cn_temp->deleted){
+			cn_temp = cn_temp->next;
+			continue;
+		}
+		count++;
+		if(count==1)
+			retCNT = cn_temp;
+		else if(count==2)
+			return NULL;
+		cn_temp = cn_temp->next;
+	}
+	return retCNT;
+}
+
+// look up the connection from_c -> to_c and hand it back only when it is
+// usable, i.e. neither masked nor deleted; NULL otherwise
+static CONNECT *checkConnect(unsigned int from_c,unsigned int to_c)
+{
+	CONNECT *arc=getCntBetween(from_c,to_c);
+
+	if(arc&&(arc->mask||arc->deleted))
+		arc = NULL;
+	return arc;
+}
+
+// Set the mask field of the connection from_c -> to_c and of its twin
+// twin(to_c) -> twin(from_c) to `mask`.
+// When masking (mask != 0) any scaffold binding running through either of the
+// two connections is also undone on both strands.
+// Returns 0 when either connection does not exist, 1 otherwise.
+static int setConnectMask(unsigned int from_c,unsigned int to_c,char mask)
+{
+ CONNECT *cn_temp,*cn_bal,*cn_ds,*cn_us;
+ unsigned int bal_fc = getTwinCtg(from_c);
+ unsigned int bal_tc = getTwinCtg(to_c);
+ unsigned int ctg3,bal_ctg3;
+
+ cn_temp = getCntBetween(from_c,to_c);
+ cn_bal = getCntBetween(bal_tc,bal_fc);
+ if(!cn_temp||!cn_bal){
+ return 0;
+ }
+ cn_temp->mask = mask;
+ cn_bal->mask = mask;
+ // unmasking leaves bindings untouched
+ if(!mask)
+ return 1;
+
+ // from_c -> to_c -> ctg3
+ // bal_fc <- bal_tc <- bal_ctg3
+ if(cn_temp->nextInScaf){ //undo the binding
+ setPrevInScaf(cn_temp->nextInScaf,0);
+ ctg3 = cn_temp->nextInScaf->contigID;
+ setNextInScaf(cn_temp,NULL);
+ bal_ctg3 = getTwinCtg(ctg3);
+ cn_ds = getCntBetween(bal_ctg3,bal_tc);
+ setNextInScaf(cn_ds,NULL);
+ setPrevInScaf(cn_bal,0);
+ }
+
+ // ctg3 -> from_c -> to_c
+ // bal_ctg3 <- bal_fc <- bal_tc
+ if(cn_bal->nextInScaf){
+ setPrevInScaf(cn_bal->nextInScaf,0);
+ bal_ctg3 = cn_bal->nextInScaf->contigID;
+ setNextInScaf(cn_bal,NULL);
+ ctg3 = getTwinCtg(bal_ctg3);
+ cn_us = getCntBetween(ctg3,from_c);
+ setNextInScaf(cn_us,NULL);
+ setPrevInScaf(cn_temp,0);
+ }
+
+ return 1;
+}
+
+
+//write `flag` into the used field of the connection from_c -> to_c and of its
+//twin twin(to_c) -> twin(from_c)
+//returns 0 without touching anything when either connection is missing, else 1
+static boolean setConnectUsed(unsigned int from_c,unsigned int to_c,char flag)
+{
+	unsigned int twinFrom = getTwinCtg(from_c);
+	unsigned int twinTo = getTwinCtg(to_c);
+	CONNECT *fwd = getCntBetween(from_c,to_c);
+	CONNECT *rev = getCntBetween(twinTo,twinFrom);
+
+	if(fwd==NULL||rev==NULL)
+		return 0;
+
+	fwd->used = flag;
+	rev->used = flag;
+	return 1;
+}
+
+//write `flag` into the weakPoint field of the connection from_c -> to_c and of
+//its twin twin(to_c) -> twin(from_c)
+//returns 0 without touching anything when either connection is missing, else 1
+static int setConnectWP(unsigned int from_c,unsigned int to_c,char flag)
+{
+	unsigned int twinFrom = getTwinCtg(from_c);
+	unsigned int twinTo = getTwinCtg(to_c);
+	CONNECT *fwd = getCntBetween(from_c,to_c);
+	CONNECT *rev = getCntBetween(twinTo,twinFrom);
+
+	if(fwd==NULL||rev==NULL)
+		return 0;
+
+	fwd->weakPoint = flag;
+	rev->weakPoint = flag;
+	return 1;
+}
+
+//write `flag` into the deleted field of the connection from_c -> to_c and of
+//its twin twin(to_c) -> twin(from_c)
+//when deleting (flag != 0) with cleanBinding set, the prevInScaf/nextInScaf
+//fields of both connections are cleared as well
+//returns 0 without touching anything when either connection is missing, else 1
+static int setConnectDelete(unsigned int from_c,unsigned int to_c,char flag,boolean cleanBinding)
+{
+	unsigned int twinFrom = getTwinCtg(from_c);
+	unsigned int twinTo = getTwinCtg(to_c);
+	CONNECT *fwd = getCntBetween(from_c,to_c);
+	CONNECT *rev = getCntBetween(twinTo,twinFrom);
+
+	if(fwd==NULL||rev==NULL)
+		return 0;
+
+	fwd->deleted = flag;
+	rev->deleted = flag;
+
+	if(flag&&cleanBinding){
+		fwd->prevInScaf = 0;
+		fwd->nextInScaf = NULL;
+		rev->prevInScaf = 0;
+		rev->nextInScaf = NULL;
+	}
+	return 1;
+}
+
+//set the mask field of ctg and of its twin to `flag`
+//beforehand every downward connection of both contigs that is not yet masked
+//and takes no part in a binding (no prevInScaf/nextInScaf/singleInScaf) is
+//passed to setConnectMask() with the same flag
+static void maskContig(unsigned int ctg,boolean flag)
+{
+	unsigned int twin = getTwinCtg(ctg);
+	CONNECT *arc;
+
+	//connections leaving ctg
+	for(arc=contig_array[ctg].downwardConnect;arc;arc=arc->next){
+		if(arc->mask||arc->prevInScaf||arc->nextInScaf||arc->singleInScaf)
+			continue;
+		setConnectMask(ctg,arc->contigID,flag);
+	}
+
+	//connections leaving the twin
+	for(arc=contig_array[twin].downwardConnect;arc;arc=arc->next){
+		if(arc->mask||arc->prevInScaf||arc->nextInScaf||arc->singleInScaf)
+			continue;
+		setConnectMask(twin,arc->contigID,flag);
+	}
+
+	contig_array[ctg].mask = flag;
+	contig_array[twin].mask = flag;
+}
+
+// Mask every unmasked contig that branches (more than one valid connection on
+// its own side or on its twin's side) and has at least num_connect valid
+// connections in total.
+// contigLen: when non-zero, the scan stops at the first contig longer than it.
+// Returns the number of contigs masked.
+static int maskPuzzle(int num_connect,unsigned int contigLen)
+{
+ int in_num,out_num,flag=0,puzzleCounter=0;
+ unsigned int i,bal_i;
+
+ for(i=1;i<=num_ctg;i++){
+ if(contigLen&&contig_array[i].length>contigLen)
+ break;
+ if(contig_array[i].mask)
+ continue;
+ bal_i = getTwinCtg(i);
+ in_num = validConnect(bal_i,NULL);
+ out_num = validConnect(i,NULL);
+ if((in_num>1||out_num>1)&&(in_num+out_num>=num_connect)){
+ flag++;
+ maskContig(i,1);
+ }
+ // recount after the possible masking; puzzleCounter only feeds the
+ // commented-out report below
+ in_num = validConnect(bal_i,NULL);
+ out_num = validConnect(i,NULL);
+ if(in_num>1||out_num>1){
+ puzzleCounter++;
+ //debugging2(i);
+ }
+
+ // presumably the twin is stored at i+1 and is skipped here -- TODO confirm
+ if(isSmallerThanTwin(i))
+ i++;
+ }
+ //printf("Masked %d contigs, %d puzzle left\n",flag,puzzleCounter);
+ return flag;
+}
+
+/*
+ * Apply the weight threshold cut_off to every connection.
+ *
+ * A connection previously removed as weak is restored once its weight
+ * reaches cut_off.  A live, unbound connection with 0 < weight < cut_off is
+ * marked deleted and weak (its singleInScaf flag is cleared).  The number of
+ * newly weakened, unmasked connections is reported on stderr, after which
+ * checkCircle() is run.
+ */
+static void deleteWeakCnt(int cut_off)
+{
+	unsigned int i;
+	CONNECT *cn_temp1;
+	int weaks=0;
+
+	for(i=1;i<=num_ctg;i++){
+		cn_temp1 = contig_array[i].downwardConnect;
+		while(cn_temp1){
+			if(cn_temp1->weak&&cn_temp1->deleted&&cn_temp1->weight>=cut_off){
+				//strong enough now: bring it back
+				cn_temp1->deleted = 0;
+				cn_temp1->weak = 0;
+			}
+			else if(!cn_temp1->deleted&&cn_temp1->weight>0&&cn_temp1->weight<cut_off
+				&&!cn_temp1->nextInScaf&&!cn_temp1->prevInScaf){
+				//below threshold and not part of a binding: drop it
+				cn_temp1->deleted = 1;
+				cn_temp1->weak = 1;
+				if(cn_temp1->singleInScaf)
+					cn_temp1->singleInScaf = 0;
+				if(!cn_temp1->mask)
+					weaks++;
+			}
+			cn_temp1 = cn_temp1->next;
+		}
+
+	}
+	fprintf(stderr,"[%s]%d connects doesn't meet weight threshold .\n",__FUNCTION__,weaks);
+	checkCircle();
+}
+
+//check if one contig is linearly connected to the other ->C1->C2...
+// starter  : contig both connections start from
+// cnt2c1   : connection starter -> c1 (the nearer contig)
+// c2       : the farther contig
+// min_dis, max_dis : accepted distance window between c1 and c2
+// Returns 1 when a linear path c1 ... c2 exists or could be closed by an
+// existing or newly created connection, 0 when not, -1 when c1 == c2.
+// Side effects: fills downstreamCTG from index 1 (slot 0 is left for the
+// caller) and upstreamCTG from index 0 and sets dsCtgCounter/usCtgCounter;
+// may unmask/undelete a connection or add a weight-0 connection pair.
+static int linearC2C(unsigned int starter,CONNECT *cnt2c1,unsigned int c2,int min_dis,int max_dis)
+{
+ int out_num,in_num;
+ CONNECT *prevCNT,*cnt,*cn_temp;
+ unsigned int c1,bal_c1,ctg,bal_c2;
+ int len=0;
+ unsigned int bal_start = getTwinCtg(starter);
+ boolean excep;
+
+ c1 = cnt2c1->contigID;
+
+ if(c1==c2){
+ printf("linearC2C: c1(%d) and c2(%d) are the same contig\n",c1,c2);
+ return -1;
+ }
+
+ // c1 must not have several valid connections on its twin side
+ bal_c1 = getTwinCtg(c1);
+ in_num = validConnect(bal_c1,NULL);
+ if(in_num>1)
+ return 0;
+
+ dsCtgCounter = 1;
+ usCtgCounter = 0;
+ downstreamCTG[dsCtgCounter++] = c1;
+ bal_c2 = getTwinCtg(c2);
+ upstreamCTG[usCtgCounter++] = bal_c2;
+ // check if c1 is linearly connected to c2 by pe connections
+ cnt = prevCNT = cnt2c1;
+ while((cnt=getNextContig(c1,prevCNT,&excep))!=NULL){
+ c1 = cnt->contigID;
+ len += cnt->gapLen+contig_array[c1].length;
+ if(c1==c2)
+ return 1;
+
+ // give up when too far or when the walk comes back to the starter
+ if(len>max_dis||c1==starter||c1==bal_start)
+ return 0;
+ downstreamCTG[dsCtgCounter++] = c1;
+ if(dsCtgCounter>=MAXCinBetween){
+ printf("%d downstream contigs, start at %d, max_dis %d, current dis %d\n"
+ ,dsCtgCounter,starter,max_dis,len);
+ return 0;
+ }
+ prevCNT = cnt;
+ }
+ // the downstream walk must have stopped at a contig with no valid connection left
+ out_num = validConnect(c1,NULL);
+ if(out_num)
+ return 0;
+
+
+ //find the most upstream contig to c2
+ cnt = prevCNT = NULL;
+ ctg = bal_c2;
+ while((cnt=getNextContig(ctg,prevCNT,&excep))!=NULL){
+ ctg = cnt->contigID;
+ len += cnt->gapLen+contig_array[ctg].length;
+ if(len>max_dis||ctg==starter||ctg==bal_start)
+ return 0;
+
+ prevCNT = cnt;
+ upstreamCTG[usCtgCounter++] = ctg;
+ if(usCtgCounter>=MAXCinBetween){
+ printf("%d upstream contigs, start at %d, max_dis %d, current dis %d\n"
+ ,usCtgCounter,starter,max_dis,len);
+ return 0;
+ }
+ }
+ if(dsCtgCounter+usCtgCounter>MAXCinBetween){
+ printf("%d downstream and %d upstream contigs\n",dsCtgCounter,usCtgCounter);
+ return 0;
+ }
+ out_num = validConnect(ctg,NULL);
+ if(out_num){
+ return 0;
+ }
+
+ // c1 is now the downstream end, c2 the upstream end (forward strand);
+ // shrink the window by the distance already covered
+ c2 = getTwinCtg(ctg);
+ min_dis -= len;
+ max_dis -= len;
+ if(c1==c2||c1==ctg||max_dis<0)
+ return 0;
+
+ // reuse an existing connection between the two ends if there is one
+ cn_temp = getCntBetween(c1,c2);
+ if(cn_temp){
+ setConnectMask(c1,c2,0);
+ setConnectDelete(c1,c2,0,0);
+ return 1;
+ }
+ // otherwise create c1 -> c2 and its twin with the window midpoint (>= 0) as gap
+ len = (min_dis+max_dis)/2 >= 0 ? (min_dis+max_dis)/2 : 0;
+ cn_temp = allocateCN(c2,len);
+ if(cntLookupTable)
+ putCnt2LookupTable(c1,cn_temp);
+ cn_temp->weight = 0; // special connect from the original graph
+ cn_temp->next = contig_array[c1].downwardConnect;
+ contig_array[c1].downwardConnect = cn_temp;
+
+ bal_c1 = getTwinCtg(c1);
+ bal_c2 = getTwinCtg(c2);
+
+ cn_temp = allocateCN(bal_c1,len);
+ if(cntLookupTable)
+ putCnt2LookupTable(bal_c2,cn_temp);
+ cn_temp->weight = 0; // special connect from the original graph
+ cn_temp->next = contig_array[bal_c2].downwardConnect;
+ contig_array[bal_c2].downwardConnect = cn_temp;
+ return 1;
+}
+//catenate upstream contig array and downstream contig array to solidArray
+//the downstream contigs are copied first, in order; the upstream contigs
+//follow in reverse order, each replaced by its twin
+//on return solidCounter (and dsCtgCounter) hold the combined length
+//NOTE(review): the two loop headers were restored from a line whose middle
+//part was lost (it read "for(i=0;i=0;i--){"); confirm against upstream
+static void catUsDsContig()
+{
+	int i;
+
+	for(i=0;i<dsCtgCounter;i++)
+		*(unsigned int *)darrayPut(solidArray,i) = downstreamCTG[i];
+	for(i=usCtgCounter-1;i>=0;i--){
+		*(unsigned int *)darrayPut(solidArray,dsCtgCounter++) = getTwinCtg(upstreamCTG[i]);
+	}
+
+	solidCounter = dsCtgCounter;
+}
+
+//binding the connections between contigs in solidArray
+//consecutive contigs of solidArray are chained through nextInScaf/prevInScaf,
+//first along the path itself and then along its reverse complement
+//a path of exactly two contigs gets singleInScaf set on its one connection
+//the function stops at the first pair without a valid connection
+//NOTE(review): the head of the forward loop was restored from a line whose
+//middle part was lost (it read "for(i=1;i\",*(unsigned int *)..."); the
+//restored message text and "-->" separator should be confirmed against upstream
+static void consolidate()
+{
+	int i,j;
+	CONNECT *prevCNT=NULL;
+	CONNECT *cnt;
+	unsigned int to_ctg;
+	unsigned int from_ctg = *(unsigned int *)darrayGet(solidArray,0);
+
+	for(i=1;i<solidCounter;i++){
+		to_ctg = *(unsigned int *)darrayGet(solidArray,i);
+		cnt = checkConnect(from_ctg,to_ctg);
+		if(!cnt){
+			printf("consolidate A: no connect from %d to %d\n",from_ctg,to_ctg);
+			//dump the whole path to help locate the break
+			for(j=0;j<solidCounter;j++)
+				printf("%d-->",*(unsigned int *)darrayGet(solidArray,j));
+			printf("\n");
+			return;
+		}
+		cnt->singleInScaf = solidCounter==2 ? 1:0;
+		if(prevCNT){
+			setNextInScaf(prevCNT,cnt);
+			setPrevInScaf(cnt,1);
+		}
+		prevCNT = cnt;
+		from_ctg = to_ctg;
+	}
+
+	//the reverse complementary path
+	from_ctg = getTwinCtg(*(unsigned int*)darrayGet(solidArray,solidCounter-1));
+	prevCNT = NULL;
+	for(i=solidCounter-2;i>=0;i--){
+		to_ctg = getTwinCtg(*(unsigned int *)darrayGet(solidArray,i));
+		cnt = checkConnect(from_ctg,to_ctg);
+		if(!cnt){
+			printf("consolidate B: no connect from %d to %d\n",from_ctg,to_ctg);
+			return;
+		}
+		cnt->singleInScaf = solidCounter==2 ? 1:0;
+		if(prevCNT){
+			setNextInScaf(prevCNT,cnt);
+			setPrevInScaf(cnt,1);
+		}
+		prevCNT = cnt;
+		from_ctg = to_ctg;
+	}
+
+}
+
+//debug helper: print the state of the connection ctg1 -> ctg2 to stdout
+//(flags, weight and how it is bound), or "-X-" when no such connection exists
+static void debugging1(unsigned int ctg1,unsigned int ctg2)
+{
+	CONNECT *arc = getCntBetween(ctg1,ctg2);
+
+	if(!arc){
+		printf("%d -X- %d\n",ctg1,ctg2);
+		return;
+	}
+
+	printf("(%d,%d) mask %d deleted %d w %d,singleInScaf %d\n",
+		ctg1,ctg2,arc->mask,arc->deleted,arc->weight,arc->singleInScaf);
+	if(arc->nextInScaf)
+		printf("%d->%d->%d\n",ctg1,ctg2,arc->nextInScaf->contigID);
+	if(arc->prevInScaf)
+		printf("*->%d->%d\n",ctg1,ctg2);
+	else if(!arc->nextInScaf)
+		printf("NULL->%d->%d->NULL\n",ctg1,ctg2);
+}
+//remove transitive connections which cross linear paths (these paths may be broken)
+//if a->b->c and a->c, mask a->c
+//Only unmasked contigs with exactly two valid outgoing connections are
+//examined.  The scan over all contigs is repeated until a full pass removes
+//nothing.
+static void removeTransitive()
+{
+ unsigned int i,bal_ctg;
+ int flag=1,out_num,in_num,count,min,max,linear;
+ CONNECT *cn_temp,*cn1=NULL,*cn2=NULL;
+
+ while(flag){
+ flag = 0;
+ for(i=1;i<=num_ctg;i++){
+ if(contig_array[i].mask)
+ continue;
+ out_num = validConnect(i,NULL);
+ if(out_num!=2)
+ continue;
+ // fetch the two valid connections into cn1 and cn2
+ cn_temp = contig_array[i].downwardConnect;
+ count = 0;
+ while(cn_temp){
+ if(cn_temp->deleted||cn_temp->mask){
+ cn_temp = cn_temp->next;
+ continue;
+ }
+ count++;
+ if(count==1)
+ cn1 = cn_temp;
+ else if(count==2){
+ cn2 = cn_temp;
+ }else // count > 2
+ break;
+
+ cn_temp = cn_temp->next;
+ }
+ if(count>2){
+ printf("%d valid connections from ctg %d\n",count,i);
+ continue;
+ }
+ if(cn1->gapLen>cn2->gapLen){
+ cn_temp = cn1;
+ cn1 = cn2;
+ cn2 = cn_temp;
+ } //make sure cn1 is closer to contig i than cn2
+ if(cn1->prevInScaf&&cn2->prevInScaf)
+ continue;
+ bal_ctg = getTwinCtg(cn2->contigID);
+ in_num = validConnect(bal_ctg,NULL);
+ if(in_num>2)
+ continue;
+ // expected distance window between the near and the far contig
+ min = cn2->gapLen - cn1->gapLen - contig_array[cn1->contigID].length - ins_size_var/2;
+ max = cn2->gapLen - cn1->gapLen - contig_array[cn1->contigID].length + ins_size_var/2;
+
+ if(max<0)
+ continue;
+ //temprarily delete cn2
+ setConnectDelete(i,cn2->contigID,1,0);
+ linear = linearC2C(i,cn1,cn2->contigID,min,max);
+ if(linear!=1){
+ // no linear path: restore cn2
+ setConnectDelete(i,cn2->contigID,0,0);
+ continue;
+ }else{
+ // slot 0 was left free by linearC2C for the starting contig
+ downstreamCTG[0] = i;
+ catUsDsContig();
+ // NOTE(review): on this path cn2 stays deleted, unlike the two
+ // other bail-outs which restore it -- confirm this is intended
+ if(!checkSimple(solidArray,solidCounter))
+ continue;
+ cn1 = getCntBetween(*(unsigned int *)darrayGet(solidArray,solidCounter-2),
+ *(unsigned int *)darrayGet(solidArray,solidCounter-1));
+ if(cn1&&cn1->nextInScaf&&cn2->nextInScaf){
+ setConnectDelete(i,cn2->contigID,0,0);
+ continue;
+ }
+ consolidate();
+ // hand any binding that ran through cn2 over to the linear path
+ if(cn2->prevInScaf)
+ substitueDSinScaf(cn2,*(unsigned int *)darrayGet(solidArray,0),
+ *(unsigned int *)darrayGet(solidArray,1));
+ if(cn2->nextInScaf)
+ substitueUSinScaf(cn2,*(unsigned int *)darrayGet(solidArray,solidCounter-2));
+ flag++;
+ }
+ } //for each contig
+ //printf("a remove transitive lag, %d connections removed\n",flag);
+ }
+
+}
+
+//get repeat contigs back into the scaffold according to connected unique contigs on both sides
+/*
+ A ------ D
+ > [i] <
+ B E
+*/
+//debug helper: list every downward connection of ctg on stderr together with
+//its binding markers and its mask/deleted/inherit/singleInScaf flags
+static void debugging2(unsigned int ctg)
+{
+	CONNECT *arc;
+
+	for(arc=contig_array[ctg].downwardConnect;arc;arc=arc->next){
+		if(arc->nextInScaf)
+			fprintf(stderr,"with nextInScaf,");
+		if(arc->prevInScaf)
+			fprintf(stderr,"with prevInScaf,");
+		fprintf(stderr,"%u >> %d, mask %d deleted %d, inherit %d, singleInScaf %d\n",
+			ctg,arc->contigID,arc->mask,arc->deleted,arc->inherit,arc->singleInScaf);
+	}
+}
+// Debug hook called from simplifyCnt(); currently a no-op.
+// The disabled calls below show how to dump selected connections/contigs.
+static void debugging()
+{
+/*
+ debugging1(1777,1468);
+ debugging2(8065);
+ debugging2(8066);
+*/
+}
+
+// One simplification round: remove transitive connections, then run
+// general_linearization() with its `strict` argument set to 1.
+// The debugging() calls are hooks that currently do nothing.
+static void simplifyCnt()
+{
+ removeTransitive();
+ debugging();
+ general_linearization(1);
+ debugging();
+}
+
+//return the position of node in nodesInSub (searching the first nodeCounter
+//entries), or -1 when it is not there
+//NOTE(review): everything between the loop header of this function and the
+//"pos>0" test of putNodeIntoSubgraph() was restored from a line whose middle
+//part was lost (it read "for(index=0;index0){"); confirm against upstream
+static int getIndexInArray(unsigned int node)
+{
+	int index;
+	for(index=0;index<nodeCounter;index++)
+		if(nodesInSub[index]==node)
+			return index;
+	return -1;
+}
+
+//record node with its distance at slot `index` of the subgraph arrays and
+//push it onto the heap
+//returns 0 when the node is already recorded, -1 when index is beyond the
+//array capacity, 1 when the node was added
+//NOTE(review): the return type is boolean yet -1 is returned and callers test
+//"flag<0"; this only works if boolean is a signed type -- confirm
+//NOTE(review): "pos>0" does not treat a hit at position 0 as existing --
+//kept as found, confirm whether ">=" was meant
+static boolean putNodeIntoSubgraph(FibHeap *heap,int distance,unsigned int node,int index)
+{
+	int pos = getIndexInArray(node);
+	if(pos>0){
+		//printf("exists\n");
+		return 0;
+	}
+	if(index>=MaxNodeInSub)
+		return -1;
+	insertNodeIntoHeap(heap,distance,node);
+	nodesInSub[index] = node;
+	nodeDistance[index] = distance;
+	return 1;
+}
+
+/*
+ * Follow the unambiguous chain of successors that starts behind `node` and
+ * add every contig on it to the subgraph.
+ *
+ * heap     - heap the contigs are pushed onto
+ * distance - distance of `node` from the subgraph origin
+ * node     - contig the chain starts from
+ * index    - in/out: next free slot in the subgraph arrays
+ * prevC    - connection by which `node` was reached
+ *
+ * Returns 1 when the chain ended normally (*index is updated), 0 when
+ * putNodeIntoSubgraph() reported a negative status.
+ */
+static boolean putChainIntoSubgraph(FibHeap *heap,int distance,unsigned int node,int *index,CONNECT *prevC)
+{
+	unsigned int ctg = node;
+	CONNECT *nextCnt;
+	boolean excep,flag;
+	int counter = *index;
+
+	while(1){
+		nextCnt=getNextContig(ctg,prevC,&excep);
+		if(excep||!nextCnt){
+			*index = counter;
+			return 1;
+		}
+		ctg = nextCnt->contigID;
+		//advance by the gap plus the length of the contig just reached
+		//(the contig id itself used to be added here by mistake)
+		distance += nextCnt->gapLen + contig_array[ctg].length;
+		flag = putNodeIntoSubgraph(heap,distance,ctg,counter);
+		if(flag<0)
+			return 0;
+		if(flag>0)
+			counter++;
+		prevC = nextCnt;
+	}
+}
+//check if nodes in subgraph have a potential heter form
+// Returns 0 as soon as the accumulated over3_sum exceeds `tolerance` as a
+// fraction of the summed contig length, and 2 otherwise (including when the
+// summed length is below 1).
+static boolean check_het_overlap(double tolerance){
+
+ int i,gap,overlap_point;
+ unsigned int node;
+ int len_sum,over3_len,over3_sum;
+ boolean flag=0;
+ len_sum=0;
+ over3_len=0;
+ over3_sum=0;
+ // total length of the contigs held in ctg4heapArray[1..nodeCounter]
+ for(i=1;i<=nodeCounter;i++){
+ node = ctg4heapArray[i].ctgID;
+ len_sum += contig_array[node].length;
+ }
+ if(len_sum<1)
+ return 2;
+ // NOTE(review): the next line is damaged -- the text between '<' and '>'
+ // (loop bound, presumably the computation of `gap`, and the start of the
+ // test) is missing and must be restored from the upstream source
+ for(i=1;i0){
+ flag=0;
+ }
+ else{
+ // flag is still set from the previous iteration here
+ if(flag){
+ over3_len=ctg4heapArray[i+1].dis - overlap_point
+ - contig_array[ctg4heapArray[i+1].ctgID].length;
+ over3_sum+=over3_len;
+ if((double)over3_sum/len_sum>tolerance)
+ return 0;
+ }
+ flag=1;
+ overlap_point=ctg4heapArray[i].dis;
+ }
+ }
+
+ return 2;
+}
+
+// check if a contig is unique by trying to line its downstream/upstream nodes together
+// node      : contig whose valid downward connections are examined
+// tolerance : passed on to checkOverlapInBetween()
+// Returns 0 when the subgraph could not be built, 1 when it holds at most two
+// nodes or checkOverlapInBetween() accepts it, otherwise the result of
+// check_het_overlap(0.02) (0 or 2).
+// Side effects: rewrites nodeCounter and nodesInSubInOrder; the subgraph
+// arrays are filled through putNodeIntoSubgraph().
+static boolean checkUnique(unsigned int node,double tolerance)
+{
+ CONNECT *ite_cnt;
+ unsigned int currNode;
+ int distance;
+ int popCounter = 0;
+ boolean flag;
+
+ currNode = node;
+ FibHeap *heap = newFibHeap();
+
+ // the contig itself is node 0 at distance 0
+ putNodeIntoSubgraph(heap, 0, currNode, 0);
+ nodeCounter = 1;
+ ite_cnt = contig_array[currNode].downwardConnect;
+ while(ite_cnt){
+ if(ite_cnt->deleted||ite_cnt->mask){
+ ite_cnt = ite_cnt->next;
+ continue;
+ }
+ // add the direct neighbour ...
+ currNode = ite_cnt->contigID;
+ distance = ite_cnt->gapLen + contig_array[currNode].length;
+ flag = putNodeIntoSubgraph(heap, distance, currNode, nodeCounter);
+ // NOTE(review): flag is a boolean; the "<0" test only fires if boolean
+ // is a signed type -- confirm
+ if(flag<0){
+ destroyHeap(heap);
+ return 0;
+ }
+ if(flag>0)
+ nodeCounter++;
+
+ // ... and the chain of successors behind it
+ flag = putChainIntoSubgraph(heap,distance,currNode,&nodeCounter,ite_cnt);
+ if(!flag){
+ destroyHeap(heap);
+ return 0;
+ }
+
+ ite_cnt = ite_cnt->next;
+ }
+ if(nodeCounter<=2){ // no more than 2 valid connections
+ destroyHeap(heap);
+ return 1;
+ }
+
+ // drain the heap; a returned 0 ends the loop
+ while((currNode=removeNextNodeFromHeap(heap))!=0)
+ nodesInSubInOrder[popCounter++] = currNode;
+
+ destroyHeap(heap);
+
+ flag = checkOverlapInBetween(tolerance);
+ if(flag==1){
+ return 1;
+ }else{
+ flag = check_het_overlap(0.02);//check the heter form
+ }
+ return flag;
+}
+
+//find longest path and break the other
+//every downward connection of ctg is followed along its chain of successors;
+//the route that reaches the target contig (ctg4heapArray[nodeCounter]) wins,
+//otherwise the route with the largest accumulated length does
+//all other connections of ctg are masked
+static void process_ds_contig(unsigned int ctg){
+	unsigned int target=ctg4heapArray[nodeCounter].ctgID;
+	//int boarder = ctg4heapArray[nodeCounter].dis;
+	boolean excep;
+	CONNECT *route=contig_array[ctg].downwardConnect;
+	CONNECT *max_route=route;
+
+	int max_dis=0;
+
+	boolean end_flag=0;
+	while(route){
+
+		int dis=0;
+		int steps=0;
+		CONNECT *tmp_cnt=route;
+		//walk the chain behind this route; the walk has to advance through
+		//tmp_cnt -- it used to re-read `route` on every step, which counted
+		//the first contig repeatedly and could spin forever
+		//the step cap guards against circular chains
+		while(tmp_cnt&&steps++<MaxNodeInSub){
+			dis+=tmp_cnt->gapLen+contig_array[tmp_cnt->contigID].length;
+			if(tmp_cnt->contigID==target){
+				end_flag=1;
+				break;
+			}
+			tmp_cnt=getNextContig(tmp_cnt->contigID,tmp_cnt,&excep);
+		}
+		if(dis>max_dis){
+			max_dis=dis;
+			max_route=route;
+		}
+		if(end_flag){
+			max_route=route;
+			break;
+		}
+		route=route->next;
+	}
+	//delete connect except max_route
+	route=contig_array[ctg].downwardConnect;
+	while(route){
+		if(route!=max_route){
+			setConnectMask(ctg,route->contigID,1);
+		}
+		route=route->next;
+	}
+
+}
+//upstream counterpart of process_ds_contig
+//every downward connection of ctg is followed along its chain of successors
+//with distances counted negatively; the route that reaches the target contig
+//(ctg4heapArray[1]) wins, otherwise the route with the smallest (most
+//negative) accumulated distance does
+//all other connections of ctg are masked
+//NOTE(review): the selection logic after the inner loop was restored from a
+//line whose middle part was lost (it read "if(disnext;"), by symmetry with
+//process_ds_contig; confirm against upstream
+static void process_us_contig(unsigned int ctg){
+	unsigned int target=ctg4heapArray[1].ctgID;
+	//int boarder = ctg4heapArray[1].dis;
+	boolean excep;
+	CONNECT *route=contig_array[ctg].downwardConnect;
+	CONNECT *min_route=route;
+
+	int min_dis=0;
+
+	boolean end_flag=0;
+	while(route){
+
+		int dis=0;
+		int steps=0;
+		CONNECT *tmp_cnt=route;
+		//walk the chain behind this route; the walk has to advance through
+		//tmp_cnt -- it used to re-read `route` on every step, which counted
+		//the first contig repeatedly and could spin forever
+		//the step cap guards against circular chains
+		while(tmp_cnt&&steps++<MaxNodeInSub){
+			dis-=tmp_cnt->gapLen+contig_array[tmp_cnt->contigID].length;
+			if(tmp_cnt->contigID==target){
+				end_flag=1;
+				break;
+			}
+			tmp_cnt=getNextContig(tmp_cnt->contigID,tmp_cnt,&excep);
+		}
+		if(dis<min_dis){
+			min_dis=dis;
+			min_route=route;
+		}
+		if(end_flag){
+			min_route=route;
+			break;
+		}
+		route=route->next;
+	}
+	//delete connect except min_route
+	route=contig_array[ctg].downwardConnect;
+	while(route){
+		if(route!=min_route){
+			setConnectMask(ctg,route->contigID,1);
+		}
+		route=route->next;
+	}
+
+}
+
+//mask contigs with downstream and/or upstream can not be lined
+// Every unmasked contig that branches on either side ("puzzle") is examined:
+//  - coverage above twice cvgAvg: masked straight away;
+//  - otherwise checkUnique() is run on each branching side; a result of 0 on
+//    either side masks the contig, a result of 2 hands that side to
+//    process_us_contig()/process_ds_contig(), which keep a single route.
+// Totals are printed to stdout at the end.
+static void maskRepeat()
+{
+ int in_num,out_num,flagA,flagB;
+ int counter = 0;
+ int puzzleCounter = 0;
+ unsigned int i,bal_i;
+ int het_counter = 0;
+ for(i=1;i<=num_ctg;i++){
+ if(contig_array[i].mask)
+ continue;
+ bal_i = getTwinCtg(i);
+ in_num = validConnect(bal_i,NULL);
+ out_num = validConnect(i,NULL);
+ if(in_num>1||out_num>1)
+ puzzleCounter++;
+ else{
+ // not a branch point; presumably the twin sits at i+1 and is
+ // skipped as well -- TODO confirm
+ if(isSmallerThanTwin(i))
+ i++;
+ continue;
+
+ }
+
+ if(contig_array[i].cvg>2*cvgAvg){
+ counter++;
+ maskContig(i,1);
+ //printf("thick mask contig %d and %d\n",i,bal_i);
+ if(isSmallerThanTwin(i))
+ i++;
+ continue;
+ }
+
+ // flagA: verdict for the twin side, flagB: for the contig's own side;
+ // a side that does not branch counts as fine (1)
+ if(in_num>1)
+ flagA = checkUnique(bal_i,OverlapPercent);
+ else
+ flagA = 1;
+ if(out_num>1)
+ flagB = checkUnique(i,OverlapPercent);
+ else
+ flagB = 1;
+
+ if(flagA==0||flagB==0){
+ counter++;
+ maskContig(i,1);
+ }else{
+ if(flagA==2){//us find longest path
+ process_us_contig(bal_i);
+ }
+ if(flagB==2){//ds find longest path
+ process_ds_contig(i);
+ }
+ }
+ // counted even when the contig ended up masked because the other side was 0
+ if(flagA==2||flagB==2)
+ het_counter++;
+
+ if(isSmallerThanTwin(i))
+ i++;
+ }
+ printf("[%s]%d contigs masked from %d puzzles\n",__FUNCTION__,counter,puzzleCounter);
+ printf("[%s]%d processed as heterozygous .\n",__FUNCTION__,het_counter);
+}
+
+
+//Run one ordering pass over the contig graph, in this fixed sequence:
+//optional downSlide(), optional deleteWeakCnt(weakPE), simplifyCnt(),
+//maskRepeat(), simplifyCnt(), optional general_linearization(0),
+//maskPuzzle(2,0) and finally freezing().
+//  deWeak:    when set, weak connections are deleted (cut-off weakPE)
+//  downS:     when set, downSlide() runs first
+//  nonlinear: when set, general_linearization(0) is applied
+//  infile:    not used by this function
+static void ordering(boolean deWeak,boolean downS, boolean nonlinear, char *infile)
+{
+	if(downS)
+		downSlide();
+	//weak connections are removed after the optional slide in either case
+	if(deWeak)
+		deleteWeakCnt(weakPE);
+
+	simplifyCnt();
+	maskRepeat();
+	simplifyCnt();
+
+	if(nonlinear)
+		general_linearization(0);
+
+	maskPuzzle(2,0);
+	freezing();
+}
+
+//check if contigs next to each other have reasonable overlap
+//  tolerance: largest accepted ratio of accumulated overlap (lenOlp) to lenSum
+//  returns 0 as soon as lenOlp/lenSum exceeds tolerance, 1 otherwise
+//NOTE(review): the loop header and the statements that fill lenSum and gap are
+//truncated in this copy of the patch (next 'for' line); restore them from the
+//original source before building.
+boolean checkOverlapInBetween(double tolerance)
+{
+ int i,gap;
+ int index;
+ unsigned int node;
+ int lenSum,lenOlp;
+ lenSum = lenOlp = 0;
+ for(i=0;i0)
+ lenOlp += -gap;
+ //if(-gap>ins_size_var)
+ //the ratio is tested inside the loop, so the check can fail early
+ if((double)lenOlp/lenSum>tolerance)
+ return 0;
+ }
+ return 1;
+}
+
+
+/********* the following codes are for freezing current scaffolds ****************/
+//set connections between contigs in an array to used or not
+//meanwhile set mask to the opposite value
+//  start: contig preceding array[0] on the path
+//  array: contigs of the path, visited in order
+//  max_steps: presumably the number of entries in array - TODO confirm, the
+//             loop headers are truncated in this copy of the patch
+//  flag: value written to 'used'; 'mask' receives 1-flag
+//  returns 1 when the path cannot be switched (first pass) or when a connection
+//  already carrying 'flag' was met in the second pass, 0 otherwise
+//NOTE(review): the three 'for(j=0;j...' headers below lost text; restore them
+//from the original source before building.
+static boolean setUsed(unsigned int start,unsigned int *array,int max_steps,boolean flag)
+{
+ unsigned int prevCtg = start;
+ unsigned int twinA,twinB;
+ int j;
+ CONNECT *cnt;
+ boolean usedFlag=0;
+ // save 'used' to 'checking'
+ prevCtg = start;
+ for(j=0;jused==flag||cnt->nextInScaf||cnt->prevInScaf||cnt->singleInScaf){
+ return 1;
+ }
+ cnt->checking = cnt->used;
+ //same bookkeeping on the reverse-strand connection, when it exists
+ twinA = getTwinCtg(prevCtg);
+ twinB = getTwinCtg(array[j]);
+ cnt = getCntBetween(twinB,twinA);
+ if(cnt)
+ cnt->checking = cnt->used;
+ prevCtg = array[j];
+ }
+ // set used to flag
+ prevCtg = start;
+ for(j=0;jused==flag){
+ usedFlag = 1;
+ break;
+ }
+ cnt->used = flag;
+ twinA = getTwinCtg(prevCtg);
+ twinB = getTwinCtg(array[j]);
+ cnt = getCntBetween(twinB,twinA);
+ if(cnt)
+ cnt->used = flag;
+ prevCtg = array[j];
+ }
+ // set mask to 'NOT flag' or set used to original value
+ prevCtg = start;
+ for(j=0;jmask = 1-flag;
+ else
+ cnt->used = cnt->checking;
+ twinA = getTwinCtg(prevCtg);
+ twinB = getTwinCtg(array[j]);
+ cnt = getCntBetween(twinB,twinA);
+ //NOTE(review): unlike the two passes above, the reverse-strand connection is
+ //dereferenced here without an 'if(cnt)' guard - confirm it can never be NULL
+ cnt->used = 1-flag;
+ if(!usedFlag)
+ cnt->mask = 1-flag;
+ else
+ cnt->used = cnt->checking;
+ prevCtg = array[j];
+ }
+ return usedFlag;
+}
+// break down scaffolds poorly supported by longer PE
+static void recoverMask()
+{
+ unsigned int i,ctg,bal_ctg,start,finish;
+ int num3,num5,j,t;
+ CONNECT *bindCnt,*cnt;
+ int min,max,max_steps=5,num_route,length;
+ int tempCounter,recoverCounter=0;
+ boolean multiUSE,change;
+
+ for(i=1;i<=num_ctg;i++)
+ contig_array[i].flag = 0;
+
+ so_far = (unsigned int *)ckalloc(max_n_routes*sizeof(unsigned int));
+ found_routes = (unsigned int **)ckalloc(max_n_routes*sizeof(unsigned int *));
+ for(j=0;jused)
+ break;
+ setConnectUsed(ctg,bindCnt->contigID,1);
+ ctg = bindCnt->contigID;
+ *(unsigned int *)darrayPut(scaf5,num5++) = ctg;
+ bal_ctg = getTwinCtg(ctg);
+ contig_array[ctg].flag = 1;
+ contig_array[bal_ctg].flag = 1;
+ bindCnt = bindCnt->nextInScaf;
+ }
+
+ ctg = getTwinCtg(i);
+ bindCnt = getBindCnt(ctg);
+ while(bindCnt){
+ if(bindCnt->used)
+ break;
+ setConnectUsed(ctg,bindCnt->contigID,1);
+ ctg = bindCnt->contigID;
+ bal_ctg = getTwinCtg(ctg);
+ contig_array[ctg].flag = 1;
+ contig_array[bal_ctg].flag = 1;
+ *(unsigned int *)darrayPut(scaf3,num3++) = bal_ctg;
+ bindCnt = bindCnt->nextInScaf;
+ }
+ if(num5+num3<2)
+ continue;
+ tempCounter = solidCounter = 0;
+ for(j=num3-1;j>=0;j--)
+ *(unsigned int *)darrayPut(tempArray,tempCounter++) =
+ *(unsigned int *)darrayGet(scaf3,j);
+ for(j=0;jgapLen + contig_array[finish].length;
+ min = length - 1.5*ins_size_var;
+ max = length + 1.5*ins_size_var;
+ traceAlongMaskedCnt(finish,start,max_steps,min,max,0,0,&num_route);
+ if(finish==start){
+ for(j=0;j%d",*(unsigned int *)darrayGet(tempArray,j));
+ printf(": start at %d\n",i);
+ }
+
+ if(num_route==1){
+ for(j=0;jused = 0;
+ cnt->checking = 0;
+ cnt = cnt->next;
+ }
+ }
+
+ for(j=0;j B -> C -> D un-bind link B->C to link A->B and B->C
+// A' <- B' <- C' <- D'
+//Detach the connection CB->CC from its scaffold neighbours, on both strands.
+//Nothing happens when CB->CC does not exist; when only the reverse-strand
+//connection twin(CC)->twin(CB) is missing, just the singleInScaf mark of
+//CB->CC is cleared.
+static void unBindLink(unsigned int CB,unsigned int CC)
+{
+	CONNECT *fwd,*rev,*neighbor;
+	unsigned int other;
+
+	fwd = getCntBetween(CB,CC);
+	if(!fwd)
+		return;
+	fwd->singleInScaf = 0;
+
+	rev = getCntBetween(getTwinCtg(CC),getTwinCtg(CB));
+	if(!rev)
+		return;
+	rev->singleInScaf = 0;
+
+	//cut the binding towards the contig that follows CC
+	if(fwd->nextInScaf){
+		other = fwd->nextInScaf->contigID;
+		fwd->nextInScaf->prevInScaf = 0;
+		fwd->nextInScaf = NULL;
+		//reverse strand: twin(other)->twin(CC) must no longer lead into rev
+		neighbor = getCntBetween(getTwinCtg(other),getTwinCtg(CC));
+		if(neighbor)
+			neighbor->nextInScaf = NULL;
+		rev->prevInScaf = 0;
+	}
+	//cut the binding towards the contig that precedes CB
+	if(rev->nextInScaf){
+		other = rev->nextInScaf->contigID;
+		rev->nextInScaf->prevInScaf = 0;
+		rev->nextInScaf = NULL;
+		//forward strand: twin(other)->CB must no longer lead into fwd
+		neighbor = getCntBetween(getTwinCtg(other),CB);
+		if(neighbor)
+			neighbor->nextInScaf = NULL;
+		fwd->prevInScaf = 0;
+	}
+}
+
+//Freeze the current scaffolds: starting from every unmasked, unvisited contig
+//with a valid downstream connection, follow getNextContig() in both directions,
+//mark the traversed connections as used, and record for each member contig
+//(and its twin) the scaffold ends (from_vt/to_vt) and its position
+//(indexInScaf). Contigs that end up in no scaffold get themselves as
+//from_vt/to_vt.
+//NOTE(review): three lines in the middle of this function ('for(j=0;j=0;t--)',
+//'for(j=0;j%d(...' and 'for(t=0;t0){') lost text in this copy of the patch;
+//restore them from the original source before building.
+static void freezing()
+{
+ int num5,num3;
+ unsigned int ctg,bal_ctg;
+ unsigned int i;
+ int j,t;
+ CONNECT *cnt,*prevCNT,*nextCnt;
+ boolean excep;
+
+ //reset per-contig bookkeeping and the transient marks on every connection
+ for(i=1;i<=num_ctg;i++){
+ contig_array[i].flag = 0;
+ contig_array[i].from_vt = 0;
+ contig_array[i].to_vt = 0;
+ cnt = contig_array[i].downwardConnect;
+ while(cnt){
+ cnt->used = 0;
+ cnt->checking = 0;
+ cnt->singleInScaf = 0;
+ cnt = cnt->next;
+ }
+ }
+
+ for(i=1;i<=num_ctg;i++){
+ if(contig_array[i].flag||contig_array[i].mask)
+ continue;
+
+ if(!contig_array[i].downwardConnect||!validConnect(i,NULL)){
+ continue;
+ }
+
+ //walk downstream from i; visited contigs are appended to scaf5
+ num5 = num3 = 0;
+ ctg = i;
+ *(unsigned int *)darrayPut(scaf5,num5++) = i;
+ contig_array[i].flag = 1;
+ contig_array[getTwinCtg(i)].flag = 1;
+ prevCNT = NULL;
+ cnt = getNextContig(ctg,prevCNT,&excep);
+ while(cnt){
+ //reaching an already visited contig ends the walk and unbinds that link
+ if(contig_array[cnt->contigID].flag){
+ unBindLink(ctg,cnt->contigID);
+ break;
+ }
+ //the successor is looked up before the current link is marked as used
+ nextCnt=getNextContig(cnt->contigID,cnt,&excep);
+ setConnectUsed(ctg,cnt->contigID,1);
+ ctg = cnt->contigID;
+ *(unsigned int *)darrayPut(scaf5,num5++) = ctg;
+ bal_ctg = getTwinCtg(ctg);
+ contig_array[ctg].flag = 1;
+ contig_array[bal_ctg].flag = 1;
+ prevCNT = cnt;
+ cnt = nextCnt;
+ }
+
+ //walk from the twin of i; twins of the visited contigs go to scaf3
+ ctg = getTwinCtg(i);
+ //when the first walk moved at least one step, seed this walk with the
+ //reverse-strand connection of that first step
+ if(num5>=2)
+ prevCNT = checkConnect(getTwinCtg(*(unsigned int *)darrayGet(scaf5,1)),ctg);
+ else
+ prevCNT = NULL;
+ cnt = getNextContig(ctg,prevCNT,&excep);
+ while(cnt){
+ if(contig_array[cnt->contigID].flag){
+ unBindLink(ctg,cnt->contigID);
+ break;
+ }
+ nextCnt=getNextContig(cnt->contigID,cnt,&excep);
+ setConnectUsed(ctg,cnt->contigID,1);
+ ctg = cnt->contigID;
+ bal_ctg = getTwinCtg(ctg);
+ contig_array[ctg].flag = 1;
+ contig_array[bal_ctg].flag = 1;
+ *(unsigned int *)darrayPut(scaf3,num3++) = bal_ctg;
+ prevCNT = cnt;
+ cnt = nextCnt;
+ }
+ //a chain of fewer than two contigs is not a scaffold
+ if(num5+num3<2)
+ continue;
+ //solidArray starts with scaf3 in reverse order
+ solidCounter = 0;
+ for(j=num3-1;j>=0;j--)
+ *(unsigned int *)darrayPut(solidArray,solidCounter++) =
+ *(unsigned int *)darrayGet(scaf3,j);
+ for(j=0;j=0;t--)
+ if(!contig_array[*(unsigned int *)darrayGet(solidArray,t)].mask){
+ lastCtg = *(unsigned int *)darrayGet(solidArray,t);
+ break;
+ }
+ //a scaffold without a first or last contig is only reported
+ if(firstCtg==0||lastCtg==0){
+ printf("scaffold start at %d, stop at %d, freezing began with %d\n",firstCtg,lastCtg,i);
+ for(j=0;j%d(%d %d)",*(unsigned int *)darrayGet(solidArray,j)
+ ,contig_array[*(unsigned int *)darrayGet(solidArray,j)].mask
+ ,contig_array[*(unsigned int *)darrayGet(solidArray,j)].flag);
+ printf("\n");
+ }else{
+ firstTwin = getTwinCtg(firstCtg);
+ lastTwin = getTwinCtg(lastCtg);
+ }
+ for(t=0;t0){
+ contig_array[ctg].mask = 1;
+ contig_array[getTwinCtg(ctg)].mask = 1;
+ printf("Repeat: contig %d (%d) appears more than once\n",ctg,getTwinCtg(ctg));
+ }else{
+ //the twin gets the mirrored ends and the mirrored 1-based position
+ contig_array[ctg].from_vt = firstCtg;
+ contig_array[ctg].to_vt = lastCtg;
+ contig_array[ctg].indexInScaf = t+1;
+ contig_array[getTwinCtg(ctg)].from_vt = lastTwin;
+ contig_array[getTwinCtg(ctg)].to_vt = firstTwin;
+ contig_array[getTwinCtg(ctg)].indexInScaf = solidCounter-t;
+ }
+ }
+ consolidate();
+ }
+
+ //printf("Freezing is done....\n");
+ fflush(stdout);
+
+ //clear the visit marks; a contig with from_vt still 0 becomes its own scaffold
+ for(i=1;i<=num_ctg;i++){
+ if(contig_array[i].flag)
+ contig_array[i].flag = 0;
+
+ if(contig_array[i].from_vt==0){
+ contig_array[i].from_vt = i;
+ contig_array[i].to_vt = i;
+ }
+ cnt = contig_array[i].downwardConnect;
+ while(cnt){
+ cnt->used = 0;
+ cnt->checking = 0;
+ cnt = cnt->next;
+ }
+ }
+
+}
+
+/************** codes below this line are for pulling the scaffolds out ************/
+void output1gap(FILE *fo,int max_steps)
+{
+ int i,len,seg;
+ len = seg = 0;
+
+ for(i=0;ibySmall&&bindCnt->weakPoint){
+ weakCounter++;
+ fprintf(fp,"\tWP");
+ ret = 1;
+ }
+
+ while(cnt){
+ if(cnt->weight&&!cnt->inherit){
+ if(!flag){
+ flag = 1;
+ fprintf(fp,"\t#DOWN ");
+ }
+ linkCtg = cnt->contigID;
+ if(isLargerThanTwin(linkCtg))
+ linkCtg = getTwinCtg(linkCtg);
+
+ fprintf(fp,"%d:%d:%d ",index_array[linkCtg],cnt->weight,cnt->gapLen);
+ }
+ cnt = cnt->next;
+ }
+ flag = 0;
+ cnt = contig_array[bal_ctg].downwardConnect;
+ while(cnt){
+ if(cnt->weight&&!cnt->inherit){
+ if(!flag){
+ flag = 1;
+ fprintf(fp,"\t#UP ");
+ }
+ linkCtg = cnt->contigID;
+ if(isLargerThanTwin(linkCtg))
+ linkCtg = getTwinCtg(linkCtg);
+
+ fprintf(fp,"%d:%d:%d ",index_array[linkCtg],cnt->weight,cnt->gapLen);
+ }
+ cnt = cnt->next;
+ }
+ fprintf(fp,"\n");
+ return ret;
+}
+
+void scaffolding(unsigned int len_cut,char *outfile)
+{
+ unsigned int prev_ctg,ctg,bal_ctg,*length_array,count=0,num_lctg=0;
+ unsigned int i,max_steps=5;
+ int num5,num3,j,len,flag,num_route,gap_c=0;
+ short gap=0;
+ long long sum=0,N50,N90;
+ FILE *fp,*fo=NULL;
+ char name[256];
+ CONNECT *cnt,*prevCNT,*nextCnt;
+ boolean excep,weak;
+ weakCounter = 0;
+
+ so_far = (unsigned int *)ckalloc(max_n_routes*sizeof(unsigned int));
+ found_routes = (unsigned int **)ckalloc(max_n_routes*sizeof(unsigned int*));
+ for(j=0;j0)
+ length_array[index_array[i]] = i;
+ }
+ for(i=1;i<=num_ctg;i++)
+ index_array[i] = length_array[i]; //contig i with original index: index_array[i]
+
+ orig2new = 0;
+
+ sprintf(name,"%s.scaf",outfile);
+ fp = ckopen(name,"w");
+ sprintf(name,"%s.scaf_gap",outfile);
+ fo = ckopen(name,"w");
+
+ scaf3 = (DARRAY *)createDarray(1000,sizeof(unsigned int));
+ scaf5 = (DARRAY *)createDarray(1000,sizeof(unsigned int));
+ gap3 = (DARRAY *)createDarray(1000,sizeof(int));
+ gap5 = (DARRAY *)createDarray(1000,sizeof(int));
+
+ for(i=1;i<=num_ctg;i++)
+ contig_array[i].flag = 0;
+ for(i=1;i<=num_ctg;i++){
+ if(contig_array[i].length+(unsigned int)overlaplen>=len_cut)
+ num_lctg++;
+ else
+ continue;
+ if(contig_array[i].flag||contig_array[i].mask||!contig_array[i].downwardConnect||!validConnect(i,NULL))
+ continue;
+
+ num5 = num3 = 0;
+ ctg = i;
+ //printf("%d",i);
+ *(unsigned int *)darrayPut(scaf5,num5++) = i;
+ contig_array[i].flag = 1;
+ bal_ctg = getTwinCtg(ctg);
+ contig_array[bal_ctg].flag = 1;
+ len = contig_array[i].length;
+ prevCNT = NULL;
+ cnt = getNextContig(ctg,prevCNT,&excep);
+ while(cnt){
+ nextCnt = getNextContig(cnt->contigID,cnt,&excep);
+ if(excep&&prevCNT)
+ printf("scaffolding: exception --- prev cnt from %u\n",prevCNT->contigID);
+ if(nextCnt&&nextCnt->used)
+ break;
+ setConnectUsed(ctg,cnt->contigID,1);
+ *(int *)darrayPut(gap5,num5-1) = cnt->gapLen;
+ ctg = cnt->contigID;
+ *(unsigned int *)darrayPut(scaf5,num5++) = ctg;
+ len += cnt->gapLen+contig_array[ctg].length;
+ bal_ctg = getTwinCtg(ctg);
+ contig_array[ctg].flag = 1;
+ contig_array[bal_ctg].flag = 1;
+ prevCNT = cnt;
+ cnt = nextCnt;
+ //printf("->%d",ctg);
+ }
+ //printf("\n");
+
+ ctg = getTwinCtg(i);
+ if(num5>=2)
+ prevCNT = checkConnect(getTwinCtg(*(unsigned int *)darrayGet(scaf5,1)),ctg);
+ else
+ prevCNT = NULL;
+ //printf("%d",i);
+ //fflush(stdout);
+ cnt = getNextContig(ctg,prevCNT,&excep);
+ while(cnt){
+ nextCnt=getNextContig(cnt->contigID,cnt,&excep);
+ if(excep&&prevCNT)
+ printf("scaffolding: exception -- prev cnt from %u\n",prevCNT->contigID);
+ if(nextCnt&&nextCnt->used)
+ break;
+ setConnectUsed(ctg,cnt->contigID,1);
+ ctg = cnt->contigID;
+ len += cnt->gapLen+contig_array[ctg].length;
+ bal_ctg = getTwinCtg(ctg);
+ contig_array[ctg].flag = 1;
+ contig_array[bal_ctg].flag = 1;
+ //printf("<-%d",bal_ctg);
+ //fflush(stdout);
+ *(int *)darrayPut(gap3,num3) = cnt->gapLen;
+ *(unsigned int *)darrayPut(scaf3,num3++) = bal_ctg;
+ prevCNT = cnt;
+ cnt = nextCnt;
+ }
+ //printf("\n");
+ len += overlaplen;
+ sum += len;
+ length_array[count++] = len;
+ if(num5+num3<1){
+ //printf("no scaffold created for contig %d\n",i);
+ continue;
+ }
+ fprintf(fp,">scaffold%d %d %d\n",count,num5+num3,len);
+ fprintf(fo,">scaffold%d %d %d\n",count,num5+num3,len);
+ len = prev_ctg = 0;
+ for(j=num3-1;j>=0;j--){
+ if(!isLargerThanTwin(*(unsigned int *)darrayGet(scaf3,j))){
+ fprintf(fp,"%-10d %-10d + %d "
+ ,index_array[*(unsigned int *)darrayGet(scaf3,j)],len,
+ contig_array[*(unsigned int *)darrayGet(scaf3,j)].length+overlaplen);
+ weak = printCnts(fp,*(unsigned int *)darrayGet(scaf3,j));
+ /*
+ if(weak)
+ fprintf(stderr,"scaffold%d\n",count);
+ */
+ }else{
+ fprintf(fp,"%-10d %-10d - %d "
+ ,index_array[getTwinCtg(*(unsigned int *)darrayGet(scaf3,j))],len
+ ,contig_array[*(unsigned int *)darrayGet(scaf3,j)].length+overlaplen);
+ weak = printCnts(fp,*(unsigned int *)darrayGet(scaf3,j));
+ /*
+ if(weak)
+ fprintf(stderr,"scaffold%d\n",count);
+ */
+ }
+ if(prev_ctg){
+ num_route = num_trace = 0;
+ traceAlongArc(*(unsigned int *)darrayGet(scaf3,j),prev_ctg,max_steps
+ ,gap-ins_size_var,gap+ins_size_var,0,0,&num_route);
+ if(num_route==1){
+ output1gap(fo,max_steps);
+ gap_c++;
+ }
+ }
+ fprintf(fo,"%-10d %-10d\n",*(unsigned int *)darrayGet(scaf3,j),len);
+ len += contig_array[*(unsigned int *)darrayGet(scaf3,j)].length + *(int *)darrayGet(gap3,j);
+ prev_ctg = *(unsigned int *)darrayGet(scaf3,j);
+ gap = *(int *)darrayGet(gap3,j)>0 ? *(int *)darrayGet(gap3,j):0;
+ }
+ for(j=0;j0 ? *(int *)darrayGet(gap5,j):0;
+ }
+ }
+
+ }
+
+ freeDarray(scaf3);
+ freeDarray(scaf5);
+ freeDarray(gap3);
+ freeDarray(gap5);
+
+ fclose(fp);
+ fclose(fo);
+ //printf("\n%d scaffolds from %d contigs sum up %lldbp, with average length %lld, %d gaps filled\n"
+ // ,count,num_lctg/2,sum,sum/count,gap_c);
+ printf("[%s]scaffold(s) created : %d , total length : %lld.\n",__FUNCTION__,count ,sum);
+ //output singleton
+ for(i=1;i<=num_ctg;i++){
+ if(contig_array[i].length+(unsigned int)overlaplen=0;j--){
+ sum += length_array[j];
+ if(!flag&&sum>=N50){
+ printf("[%s]N50 : %d bp, ",__FUNCTION__,length_array[j]);
+ flag++;
+ }
+ if(sum>=N90){
+ printf(" N90 : %d bp\n",length_array[j]);
+ break;
+ }
+ }
+ //printf("Found %d weak points in scaffolds\n",weakCounter);
+ fflush(stdout);
+ free((void *)length_array);
+ for(j=0;jweight<1){
+ cnts = cnts->next;
+ continue;
+ }
+ fprintf(fp,"%-10d %-10d\t%d\t%d\t%d\n"
+ ,i,cnts->contigID,cnts->gapLen,cnts->weight,insertS);
+ cnts->weight = 0;
+
+ bal_toCtg = getTwinCtg(cnts->contigID);
+ temp_cnt = getCntBetween(bal_toCtg,bal_ctg);
+ if(temp_cnt)
+ temp_cnt->weight = 0;
+
+ cnts = cnts->next;
+ }
+ }
+}
+
+//use PE info in ascending order
+void PE2Links(char *infile)
+{
+ fprintf(stderr,"[%s]entering this function.\n",__FUNCTION__);
+ char name[256],*line;
+ FILE *fp,*linkF;
+ int i;
+ int flag=0;
+ unsigned int j;
+
+
+ sprintf(name,"%s.links",infile);
+ /*linkF = fopen(name,"r");
+ if(linkF){
+ printf("file %s exists, skip creating the links...\n",name);
+ fclose(linkF);
+ return;
+ }*/
+
+ linkF = ckopen(name,"w");
+
+ if(!pes)
+ loadPEgrads(infile);
+
+ sprintf(name,"%s.readOnContig",infile);
+ fp = ckopen(name,"r");
+
+ lineLen = 1024;
+ line = (char *)ckalloc(lineLen*sizeof(char));
+
+ fgets(line,lineLen,fp);
+ line[0] = '\0';
+
+ //printf("\n");
+ for(i=0;i=ctg_short&&contig_array[toCtg].length>=ctg_short){
+ if(1){
+ bal_ctg = getTwinCtg(ctg);
+ bal_toCtg = getTwinCtg(toCtg);
+ add1Connect(ctg,toCtg,gap,wt,0);
+ add1Connect(bal_toCtg,bal_ctg,gap,wt,0);
+ counter++;
+ if(contig_array[ctg].mask||contig_array[toCtg].mask)
+ maskCounter++;
+
+ if(insertS>1000&&
+ contig_array[ctg].from_vt==contig_array[toCtg].from_vt&& // on the same scaff
+ contig_array[ctg].indexInScafinsertS)
+ break;
+ /*
+ if(contig_array[ctg].length1000&&
+ contig_array[ctg].from_vt==contig_array[toCtg].from_vt&& // on the same scaff
+ contig_array[ctg].indexInScaf1000&&isPrevSmall){
+ smallScaf();
+ isPrevSmall = 0;
+ }*/
+ flag2 = inputLinks(fp,pes[i].insertS,line);
+ //printf("Insert size %d: %d links input\n",pes[i].insertS,flag2);
+ if(flag2){
+ lib_n++;
+ cutoff_sum += pes[i].pair_num_cut;
+ weakPE=cutoff_sum;
+ }
+ flag += flag2;
+ if(!flag){
+ //printf("\n");
+ continue;
+ }
+ if(i==gradsCounter-1|| pes[i+1].rank!=pes[i].rank){
+ flag = nonLinear = downS = markSmall = 0;
+
+ if(pes[i].insertS>1000&&pes[i].rank>1)
+ downS = 1;
+ if(pes[i].insertS<=1000)
+ smallPE = 1;
+
+ if(pes[i].insertS>=1000){
+ ins_size_var = 50;
+ //OverlapPercent = 0.05;
+ }else if(pes[i].insertS>=300){
+ ins_size_var = 30;
+ //OverlapPercent = 0.05;
+ }else{
+ ins_size_var = 20;
+ //OverlapPercent = 0.05;
+ }
+ //if(pes[i].insertS>1000)
+ //weakPE = 5;
+ //static_f = 1;
+ //if(lib_n>0){
+ //weakPE = weakPEmaxNodes)
+ return 0;
+ if(contig_array[getTwinCtg(node)].inSubGraph)
+ return 0;
+ ctg4heapArray[index].ctgID = node;
+ ctg4heapArray[index].dis = dis;
+ contig_array[node].inSubGraph = 1;
+
+ ctg4heapArray[index].ds_shut4dheap = 0;
+ ctg4heapArray[index].us_shut4dheap = 0;
+ ctg4heapArray[index].ds_shut4uheap = 0;
+ ctg4heapArray[index].us_shut4uheap = 0;
+
+ return 1;
+}
+
+//Write 'flag' into inSubGraph for every contig held in
+//ctg4heapArray[1..nodeCounter]. nodeCounter is first capped at MaxNodeInSub
+//(the cap is stored back into nodeCounter); entries whose ctgID is not
+//positive are left alone.
+static void setInGraph(boolean flag)
+{
+	int slot;
+	int ctg;
+
+	if(nodeCounter>MaxNodeInSub)
+		nodeCounter = MaxNodeInSub;
+
+	for(slot=1;slot<=nodeCounter;slot++){
+		ctg = ctg4heapArray[slot].ctgID;
+		if(ctg<=0)
+			continue;
+		contig_array[ctg].inSubGraph = flag;
+	}
+}
+
+//Register 'tempNode' at distance 'dis' in ctg4heapArray and queue it:
+//  dis>=0 -> inserted in dheap with key dis, *DmaxDis raised when dis is larger
+//  dis<0  -> inserted in uheap with key -dis, *UmaxDis raised when
+//            -dis + length of tempNode is larger
+//nodeCounter is incremented before putNodeInArray() is tried and is not
+//rolled back when that call fails.
+//Returns 0 when putNodeInArray() refuses the node, 1 for a downstream node and
+//-1 for an upstream node.
+//NOTE(review): -1 is returned through 'boolean' and callers test 'ret<0';
+//this only works if boolean is a signed type - confirm the typedef.
+static boolean dispatch1node(int dis,unsigned int tempNode,int maxNodes,
+		FibHeap *dheap,FibHeap *uheap,int *DmaxDis,int *UmaxDis)
+{
+	int reach;
+
+	//both directions start by taking the next slot in ctg4heapArray
+	nodeCounter++;
+	if(!putNodeInArray(tempNode,maxNodes,dis))
+		return 0;
+
+	if(dis<0){	// put it to Uheap
+		insertNodeIntoHeap(uheap,-dis,nodeCounter);
+		reach = contig_array[tempNode].length;
+		if(-dis+reach>*UmaxDis)
+			*UmaxDis = -dis+contig_array[tempNode].length;
+		return -1;
+	}
+
+	// put it to Dheap
+	insertNodeIntoHeap(dheap,dis,nodeCounter);
+	if(dis>*DmaxDis)
+		*DmaxDis = dis;
+	return 1;
+}
+
+static boolean canDheapWait(unsigned int currNode,int dis, int DmaxDis)
+{
+ if(disctgID;
+ dis0 = ctgInHeap->dis;
+
+ isEmpty = IsHeapEmpty(dheap);
+
+ twin = getTwinCtg(currNode);
+ us_cnt = ctgInHeap->us_shut4dheap? NULL:contig_array[twin].downwardConnect;
+ while(us_cnt){
+ if(us_cnt->deleted||us_cnt->mask||
+ contig_array[getTwinCtg(us_cnt->contigID)].inSubGraph){
+ us_cnt = us_cnt->next;
+ continue;
+ }
+
+ tempNode = getTwinCtg(us_cnt->contigID);
+ if(contig_array[tempNode].inSubGraph){
+ us_cnt = us_cnt->next;
+ continue;
+ }
+ dis = dis0 - us_cnt->gapLen - (int)contig_array[twin].length;
+
+ ret = dispatch1node(dis,tempNode,maxNodes,dheap,uheap,DmaxDis,UmaxDis);
+ if(ret==0)
+ return 0;
+ else if(ret<0)
+ *Uwait = 0;
+
+ us_cnt = us_cnt->next;
+ }
+
+ if(nodeCounter>1&&isEmpty){
+ *Dwait = canDheapWait(currNode,dis0,*DmaxDis);
+ if(*Dwait){
+ isEmpty = IsHeapEmpty(dheap);
+ insertNodeIntoHeap(dheap,dis0,indexInArray);
+ ctg4heapArray[indexInArray].us_shut4dheap = 1;
+ if(isEmpty)
+ return 1;
+ else
+ continue;
+ }
+ }
+ ds_cnt = ctgInHeap->ds_shut4dheap? NULL:contig_array[currNode].downwardConnect;
+ while(ds_cnt){
+ if(ds_cnt->deleted||ds_cnt->mask||contig_array[ds_cnt->contigID].inSubGraph){
+ ds_cnt = ds_cnt->next;
+ continue;
+ }
+ tempNode = ds_cnt->contigID;
+ dis = dis0 + ds_cnt->gapLen + (int)contig_array[tempNode].length;
+ ret = dispatch1node(dis,tempNode,maxNodes,dheap,uheap,DmaxDis,UmaxDis);
+ if(ret==0)
+ return 0;
+ else if(ret<0)
+ *Uwait = 0;
+ } // for each downstream connections
+ } // for each node comes off the heap
+
+ *Dwait = 1;
+ return 1;
+}
+
+static boolean canUheapWait(unsigned int currNode,int dis, int UmaxDis)
+{
+ int temp_len = contig_array[currNode].length;
+ if(-dis+temp_lenctgID;
+ dis0 = ctgInHeap->dis;
+
+ isEmpty = IsHeapEmpty(uheap);
+ ds_cnt = ctgInHeap->ds_shut4uheap? NULL:contig_array[currNode].downwardConnect;
+ while(ds_cnt){
+ if(ds_cnt->deleted||ds_cnt->mask||contig_array[ds_cnt->contigID].inSubGraph){
+ ds_cnt = ds_cnt->next;
+ continue;
+ }
+ tempNode = ds_cnt->contigID;
+ dis = dis0 + ds_cnt->gapLen + contig_array[tempNode].length;
+ ret = dispatch1node(dis,tempNode,maxNodes,dheap,uheap,DmaxDis,UmaxDis);
+ if(ret==0)
+ return 0;
+ else if(ret>0)
+ *Dwait = 0;
+
+ } // for each downstream connections
+
+ if(nodeCounter>1&&isEmpty){
+ *Uwait = canUheapWait(currNode,dis0,*UmaxDis);
+ if(*Uwait){
+ isEmpty = IsHeapEmpty(uheap);
+ insertNodeIntoHeap(uheap,dis0,indexInArray);
+ ctg4heapArray[indexInArray].ds_shut4uheap = 1;
+ if(isEmpty)
+ return 1;
+ else
+ continue;
+ }
+ }
+
+ twin = getTwinCtg(currNode);
+ us_cnt = ctgInHeap->us_shut4uheap? NULL:contig_array[twin].downwardConnect;
+ while(us_cnt){
+ if(us_cnt->deleted||us_cnt->mask||
+ contig_array[getTwinCtg(us_cnt->contigID)].inSubGraph){
+ us_cnt = us_cnt->next;
+ continue;
+ }
+
+ tempNode = getTwinCtg(us_cnt->contigID);
+ if(contig_array[tempNode].inSubGraph){
+ us_cnt = us_cnt->next;
+ continue;
+ }
+ dis = dis0 - us_cnt->gapLen - contig_array[twin].length;
+
+ ret = dispatch1node(dis,tempNode,maxNodes,dheap,uheap,DmaxDis,UmaxDis);
+ if(ret==0)
+ return 0;
+ else if(ret>0)
+ *Dwait = 1;
+
+ us_cnt = us_cnt->next;
+ }
+
+ } // for each node comes off the heap
+
+ *Uwait = 1;
+ return 1;
+}
+
+//Grow a sub-graph around node1 by alternating workOnDheap() and workOnUheap()
+//until both heaps report that they can wait.
+//  node1:    seed contig, stored in ctg4heapArray[1] with distance 0
+//  maxNodes: limit handed down to putNodeInArray() and the heap workers
+//Returns 1 when the sub-graph was picked (inSubGraph marks are left set for
+//the caller), 0 when either worker failed (marks are cleared with
+//setInGraph(0)). Both heaps are destroyed before returning.
+static boolean pickUpGeneralSubgraph(unsigned int node1,int maxNodes)
+{
+	FibHeap *Uheap = newFibHeap();	// heap for upstream contigs to node1
+	FibHeap *Dheap = newFibHeap();	// heap for downstream contigs
+	int UmaxDis,DmaxDis;		// max distance upstream/downstream to node1
+	boolean Uwait,Dwait;		// wait signals of the two heaps
+	boolean picked;
+
+	//node1 is put to the array once, and to both Dheap and Uheap
+	nodeCounter = 1;
+	putNodeInArray(node1,maxNodes,0);
+
+	insertNodeIntoHeap(Dheap,0,nodeCounter);
+	ctg4heapArray[nodeCounter].us_shut4dheap = 1;
+	Dwait = 0;
+	DmaxDis = 0;
+
+	insertNodeIntoHeap(Uheap,0,nodeCounter);
+	ctg4heapArray[nodeCounter].ds_shut4uheap = 1;
+	Uwait = 1;
+	UmaxDis = contig_array[node1].length;
+
+	for(;;){
+		//the upstream worker only runs when the downstream worker succeeded
+		if(!workOnDheap(Dheap,Uheap,&Dwait,&Uwait,&DmaxDis,&UmaxDis,maxNodes)
+			||!workOnUheap(Dheap,Uheap,&Dwait,&Uwait,&DmaxDis,&UmaxDis,maxNodes)){
+			setInGraph(0);
+			picked = 0;
+			break;
+		}
+		if(Uwait&&Dwait){
+			picked = 1;
+			break;
+		}
+	}
+
+	destroyHeap(Dheap);
+	destroyHeap(Uheap);
+	return picked;
+}
+
+//qsort() comparator ordering CTGinHEAP entries by ascending 'dis'.
+//Returns 1, 0 or -1 when the first entry is farther, equal or nearer.
+static int cmp_ctg(const void *a,const void *b)
+{
+	const CTGinHEAP *lhs = (const CTGinHEAP *)a;
+	const CTGinHEAP *rhs = (const CTGinHEAP *)b;
+
+	if(lhs->dis>rhs->dis)
+		return 1;
+	return lhs->dis==rhs->dis ? 0 : -1;
+}
+
+//Decide whether the sub-graph held in ctg4heapArray[1..nodeCounter] can be
+//arranged into a line. Connections that are deleted or masked are ignored.
+//Returns 0 as soon as one of the four checks below fails, 1 otherwise.
+static boolean checkEligible()
+{
+	unsigned int firstNode = ctg4heapArray[1].ctgID;
+	unsigned int lastNode;
+	unsigned int twin;
+	int i;
+	boolean flag = 0;
+	CONNECT *ite_cnt;
+
+	//check if the first node has incoming link from twin of any node in subgraph
+	// or it has multi outgoing links bound to incoming links
+	twin = getTwinCtg(firstNode);
+	ite_cnt = contig_array[twin].downwardConnect;
+	while(ite_cnt){
+		if(ite_cnt->deleted||ite_cnt->mask){
+			ite_cnt = ite_cnt->next;
+			continue;
+		}
+		if(contig_array[ite_cnt->contigID].inSubGraph)
+			return 0;
+		//at most one bound (prevInScaf) connection is tolerated
+		if(ite_cnt->prevInScaf){
+			if(flag)
+				return 0;
+			flag = 1;
+		}
+		ite_cnt = ite_cnt->next;
+	}
+
+	//check if the last node has outgoing link to twin of any node in subgraph
+	// or it has multi outgoing links bound to incoming links
+	lastNode = ctg4heapArray[nodeCounter].ctgID;
+	ite_cnt = contig_array[lastNode].downwardConnect;
+	flag = 0;
+	while(ite_cnt){
+		if(ite_cnt->deleted||ite_cnt->mask){
+			ite_cnt = ite_cnt->next;
+			continue;
+		}
+		twin = getTwinCtg(ite_cnt->contigID);
+		if(contig_array[twin].inSubGraph)
+			return 0;
+		if(ite_cnt->prevInScaf){
+			if(flag)
+				return 0;
+			flag = 1;
+		}
+		ite_cnt = ite_cnt->next;
+	}
+
+	//check if any node has outgoing link to node outside the subgraph
+	//(the last node is exempt; its outgoing links were examined above)
+	//NOTE(review): this loop header and the list-head fetch were truncated in
+	//the patch text and are restored by symmetry with the incoming-link loop
+	//below - confirm against the original source.
+	for(i=1;i<nodeCounter;i++){
+		ite_cnt = contig_array[ctg4heapArray[i].ctgID].downwardConnect;
+		while(ite_cnt){
+			if(ite_cnt->deleted||ite_cnt->mask){
+				ite_cnt = ite_cnt->next;
+				continue;
+			}
+			if(!contig_array[ite_cnt->contigID].inSubGraph)
+				return 0;
+			ite_cnt = ite_cnt->next;
+		}
+	}
+
+	//check if any node has incoming link from node outside the subgraph
+	//(the first node is exempt; its incoming links were examined above)
+	for(i=2;i<=nodeCounter;i++){
+		twin = getTwinCtg(ctg4heapArray[i].ctgID);
+		ite_cnt = contig_array[twin].downwardConnect;
+		while(ite_cnt){
+			if(ite_cnt->deleted||ite_cnt->mask){
+				ite_cnt = ite_cnt->next;
+				continue;
+			}
+			if(!contig_array[getTwinCtg(ite_cnt->contigID)].inSubGraph)
+				return 0;
+			ite_cnt = ite_cnt->next;
+		}
+	}
+
+	return 1;
+}
+
+//put nodes in sub-graph in a line
+//The nodes are taken in the order they hold in ctg4heapArray[1..nodeCounter].
+//Three phases: (1) mark every live connection between sub-graph members as
+//deleted and unbind it, (2) revive or allocate a connection between each pair
+//of neighbouring nodes and chain them with nextInScaf/prevInScaf, (3) re-attach
+//the bound connections that enter the first node and leave the last node.
+//NOTE(review): the 'for(i=1;i...' header of phase 2 and the statements that set
+//node1/node2/bal_nd1/bal_nd2/gap/temp_cnt are truncated in this copy of the
+//patch; restore them from the original source before building.
+static void arrangeNodes_general()
+{
+ int i,gap;
+ CONNECT *ite_cnt,*temp_cnt,*bal_cnt,*prev_cnt,*next_cnt;
+ unsigned int node1,node2;
+ unsigned int bal_nd1,bal_nd2;
+ //delete original connections
+ for(i=1;i<=nodeCounter;i++){
+ node1 = ctg4heapArray[i].ctgID;
+ ite_cnt = contig_array[node1].downwardConnect;
+ while(ite_cnt){
+ //only live connections that stay inside the sub-graph are removed
+ if(ite_cnt->mask||ite_cnt->deleted||!contig_array[ite_cnt->contigID].inSubGraph){
+ ite_cnt = ite_cnt->next;
+ continue;
+ }
+ ite_cnt->deleted = 1;
+ setNextInScaf(ite_cnt,NULL);
+ setPrevInScaf(ite_cnt,0);
+ ite_cnt = ite_cnt->next;
+ }
+
+ //same treatment for the connections leaving the twin of node1
+ bal_nd1 = getTwinCtg(node1);
+ ite_cnt = contig_array[bal_nd1].downwardConnect;
+ while(ite_cnt){
+ if(ite_cnt->mask||ite_cnt->deleted||!contig_array[getTwinCtg(ite_cnt->contigID)].inSubGraph){
+ ite_cnt = ite_cnt->next;
+ continue;
+ }
+ ite_cnt->deleted = 1;
+ setNextInScaf(ite_cnt,NULL);
+ setPrevInScaf(ite_cnt,0);
+ ite_cnt = ite_cnt->next;
+ }
+ }
+ //create new connections
+ prev_cnt = next_cnt = NULL;
+ for(i=1;ideleted = 0;
+ temp_cnt->mask = 0;
+ //temp_cnt->gapLen = gap;
+ //NOTE(review): bal_cnt is dereferenced without a NULL check - confirm the
+ //reverse-strand connection always exists when the forward one does
+ bal_cnt = getCntBetween(bal_nd2,bal_nd1);
+ bal_cnt->deleted = 0;
+ bal_cnt->mask = 0;
+ //bal_cnt->gapLen = gap;
+ }
+ else{
+ //no connection to revive: allocate one per strand and push each on the
+ //head of its contig's downwardConnect list
+ temp_cnt = allocateCN(node2,gap);
+ if(cntLookupTable)
+ putCnt2LookupTable(node1,temp_cnt);
+ temp_cnt->next = contig_array[node1].downwardConnect;
+ contig_array[node1].downwardConnect = temp_cnt;
+ bal_cnt = allocateCN(bal_nd1,gap);
+ if(cntLookupTable)
+ putCnt2LookupTable(bal_nd2,bal_cnt);
+ bal_cnt->next = contig_array[bal_nd2].downwardConnect;
+ contig_array[bal_nd2].downwardConnect = bal_cnt;
+ }
+ //chain this step behind the previous one (forward strand) and in front of
+ //it (reverse strand)
+ if(prev_cnt){
+ setNextInScaf(prev_cnt,temp_cnt);
+ setPrevInScaf(temp_cnt,1);
+ }
+ if(next_cnt){
+ setNextInScaf(bal_cnt,next_cnt);
+ setPrevInScaf(next_cnt,1);
+ }
+ prev_cnt = temp_cnt;
+ next_cnt = bal_cnt;
+ }
+
+ //re-binding connection at both ends
+ //head: look for a bound (prevInScaf) connection leaving the twin of the first node
+ bal_nd2 = getTwinCtg(ctg4heapArray[1].ctgID);
+ ite_cnt = contig_array[bal_nd2].downwardConnect;
+ while(ite_cnt){
+ if(ite_cnt->deleted||ite_cnt->mask){
+ ite_cnt = ite_cnt->next;
+ continue;
+ }
+ if(ite_cnt->prevInScaf)
+ break;
+ ite_cnt = ite_cnt->next;
+ }
+ if(ite_cnt){
+ bal_nd1 = ite_cnt->contigID;
+ node1 = getTwinCtg(bal_nd1);
+ node2 = ctg4heapArray[1].ctgID;
+ temp_cnt = checkConnect(node1,node2);
+ bal_cnt = ite_cnt;
+ next_cnt = checkConnect(ctg4heapArray[1].ctgID,ctg4heapArray[2].ctgID);
+ prev_cnt = checkConnect(getTwinCtg(ctg4heapArray[2].ctgID), getTwinCtg(ctg4heapArray[1].ctgID));
+ if(temp_cnt){
+ setNextInScaf(temp_cnt,next_cnt);
+ setPrevInScaf(temp_cnt->nextInScaf,0);
+ setPrevInScaf(next_cnt,1);
+ setNextInScaf(prev_cnt,bal_cnt);
+ }
+ }
+
+ //tail: look for a bound (prevInScaf) connection leaving the last node
+ node1 = ctg4heapArray[nodeCounter].ctgID;
+ ite_cnt = contig_array[node1].downwardConnect;
+ while(ite_cnt){
+ if(ite_cnt->deleted||ite_cnt->mask){
+ ite_cnt = ite_cnt->next;
+ continue;
+ }
+ if(ite_cnt->prevInScaf)
+ break;
+ ite_cnt = ite_cnt->next;
+ }
+ if(ite_cnt){
+ node2 = ite_cnt->contigID;
+ bal_nd1 = getTwinCtg(node1);
+ bal_nd2 = getTwinCtg(node2);
+ temp_cnt = ite_cnt;
+ bal_cnt = checkConnect(bal_nd2,bal_nd1);
+ next_cnt = checkConnect(getTwinCtg(ctg4heapArray[nodeCounter].ctgID),
+ getTwinCtg(ctg4heapArray[nodeCounter-1].ctgID));
+ prev_cnt = checkConnect(ctg4heapArray[nodeCounter-1].ctgID,ctg4heapArray[nodeCounter].ctgID);
+ setNextInScaf(prev_cnt,temp_cnt);
+ setNextInScaf(bal_cnt,next_cnt);
+ setPrevInScaf(next_cnt,1);
+ }
+}
+//check if contigs next to each other have reasonable overlap
+//  tolerance: largest accepted ratio of accumulated overlap to total length
+//  returns 1 when the nodes of the sub-graph have no length at all or when
+//  the ratio lenOlp/lenSum does not exceed tolerance, 0 otherwise
+//The ratio is also written to stderr on every call that gets past the
+//zero-length test.
+//NOTE(review): the header of the second loop and the statement computing
+//'gap' are truncated in this copy of the patch; restore them from the
+//original source before building.
+boolean checkOverlapInBetween_general(double tolerance)
+{
+ int i,gap;
+ unsigned int node;
+ int lenSum,lenOlp;
+ lenSum = lenOlp = 0;
+ //total length of the contigs in ctg4heapArray[1..nodeCounter]
+ for(i=1;i<=nodeCounter;i++){
+ node = ctg4heapArray[i].ctgID;
+ lenSum += contig_array[node].length;
+ }
+ //avoid dividing by zero below
+ if(lenSum<1)
+ return 1;
+ for(i=1;i0)
+ lenOlp += -gap;
+ //if(-gap>ins_size_var)
+
+ }
+ double olp_pect=(double)lenOlp/lenSum;
+ fprintf(stderr,"[%s]existing with olp_pect %.3f.\n",__FUNCTION__,olp_pect);
+ if(olp_pect>tolerance){
+ return 0;
+ }
+ return 1;
+}
+
+//check if there's any connect indicates the opposite order between nodes in sub-graph
+static boolean checkConflictCnt_general(double tolerance)
+{
+ int i,j;
+ int supportCounter=0;
+ int objectCounter=0;
+ CONNECT *cnt;
+ for(i=1;iweight;
+ cnt = checkConnect(ctg4heapArray[j].ctgID,ctg4heapArray[i].ctgID);
+ if(cnt)
+ objectCounter += cnt->weight;
+ //return 1;
+ }
+ }
+ if(supportCounter<1)
+ return 1;
+ if((double)objectCounter/supportCounter0;i--){
+ if(contig_array[i].mask)
+ continue;
+ out_num = validConnect(i,NULL);
+
+ if(out_num<2)
+ continue;
+
+ //flag = pickSubGraph(i,strict);
+ flag = pickUpGeneralSubgraph(i,MaxNodeInSub);
+ if(!flag)
+ continue;
+ subCounter++;
+ qsort(&ctg4heapArray[1],nodeCounter,sizeof(CTGinHEAP),cmp_ctg);
+ flag = checkEligible();
+ if(!flag){
+ eligibleCounter++;
+ setInGraph(0);
+ continue;
+ }
+ if(strict){
+ overlapTolerance = OverlapPercent;
+ conflTolerance = ConflPercent;
+ }else{
+ overlapTolerance = 2*OverlapPercent;
+ conflTolerance = 2*ConflPercent;
+ }
+ flag = checkOverlapInBetween_general(overlapTolerance);
+ if(!flag){
+ overlapCounter++;
+ setInGraph(0);
+ continue;
+ }
+ flag = checkConflictCnt_general(conflTolerance);
+ if(flag){
+ conflCounter++;
+ setInGraph(0);
+ continue;
+ }
+ arrangeNodes_general();
+ setInGraph(0);
+ }
+ fprintf(stdout,"[%s]Picked %d subgraphs,%d have conflicting connections,%d have significant overlapping, %d eligible\n",
+ __FUNCTION__,subCounter,conflCounter,overlapCounter,eligibleCounter);
+
+}
+
+/**** the following code detects and breaks down scaffolds at weak points **********/
+// Mark connections in scaffolds made by small pe: for every unmasked, unvisited
+// contig that has a bound connection (getBindCnt), follow nextInScaf from the
+// contig and then from its twin, setting bySmall on each connection met and on
+// its reverse-strand counterpart when that one exists.
+static void smallScaf()
+{
+	unsigned int i,ctg,bal_ctg,prevCtg;
+	int counter=0;
+	int side;
+	CONNECT *bindCnt,*cnt;
+
+	for(i=1;i<=num_ctg;i++)
+		contig_array[i].flag = 0;
+
+	for(i=1;i<=num_ctg;i++){
+		if(contig_array[i].flag||contig_array[i].mask||!contig_array[i].downwardConnect)
+			continue;
+		bindCnt = getBindCnt(i);
+		if(!bindCnt)
+			continue;
+		counter++;
+
+		contig_array[i].flag = 1;
+		contig_array[getTwinCtg(i)].flag = 1;
+
+		//side 0 walks from contig i, side 1 from its twin
+		for(side=0;side<2;side++){
+			if(side==0){
+				prevCtg = getTwinCtg(i);
+			}else{
+				bindCnt = getBindCnt(getTwinCtg(i));
+				prevCtg = i;
+			}
+			for(;bindCnt;bindCnt=bindCnt->nextInScaf){
+				ctg = bindCnt->contigID;
+				bal_ctg = getTwinCtg(ctg);
+				bindCnt->bySmall = 1;
+				//reverse-strand counterpart of the connection just marked
+				cnt = getCntBetween(bal_ctg,prevCtg);
+				if(cnt)
+					cnt->bySmall = 1;
+
+				contig_array[ctg].flag = 1;
+				contig_array[bal_ctg].flag = 1;
+				prevCtg = bal_ctg;
+			}
+		}
+	}
+	//printf("Report from smallScaf: %d scaffolds by smallPE\n",counter);
+}
+
+static boolean putItem2Sarray(unsigned int scaf,int wt,DARRAY *SCAF,DARRAY *WT,int counter)
+{
+ int i;
+ unsigned int *scafP,*wtP;
+ for(i=0;ideleted||ite_cnt->mask||ite_cnt->singleInScaf
+ ||ite_cnt->nextInScaf||ite_cnt->prevInScaf||ite_cnt->inherit){
+ ite_cnt = ite_cnt->next;
+ continue;
+ }
+ targetCtg = ite_cnt->contigID;
+ if(contig_array[ctg].from_vt==contig_array[targetCtg].from_vt){ // on the same scaff
+ ite_cnt = ite_cnt->next;
+ continue;
+ }
+ inc = putItem2Sarray(contig_array[targetCtg].from_vt,ite_cnt->weight,SCAF,WT,counter);
+ if(inc)
+ counter++;
+ ite_cnt = ite_cnt->next;
+ }
+ }
+ return counter;
+
+}
+
+// Collect the scaffold chain that begins at contig `start` into scafStack.
+// The stack is emptied first, then `start` and every contigID reached through
+// getBindCnt(start)->nextInScaf->... are pushed in order.
+// Returns the sum of contig_array[].length over the pushed contigs.
+// NOTE(review): stackBackup() presumably snapshots the stack so stackRecover()
+// can restore it later - confirm against the STACK implementation.
+static int getScaffold(unsigned int start, STACK *scafStack)
+{
+	CONNECT *link;
+	int total;
+
+	emptyStack(scafStack);
+	*(unsigned int*)stackPush(scafStack) = start;
+	total = contig_array[start].length;
+
+	for(link=getBindCnt(start);link;link=link->nextInScaf){
+		unsigned int member = link->contigID;
+
+		*(unsigned int*)stackPush(scafStack) = member;
+		total += contig_array[member].length;
+	}
+
+	stackBackup(scafStack);
+	return total;
+}
+
+static boolean isLinkReliable(DARRAY *WT,int count)
+{
+ int i;
+ for(i=0;i=weakPE)
+ return 1;
+
+ return 0;
+}
+
+static int getWtFromSarray(DARRAY *SCAF,DARRAY *WT,int count,unsigned int scaf)
+{
+ int i;
+ for(i=0;i
+ scaf1 --- --- -- -- ---
+ scaf2 -- --- --- --
+ ---->
+*/
+static boolean checkScafConsist(STACK *scafStack1,STACK *scafStack2)
+{
+ DARRAY *downwardTo1 = (DARRAY *)createDarray(1000,sizeof(unsigned int));// scaf links to those scaffolds
+ DARRAY *downwardTo2 = (DARRAY *)createDarray(1000,sizeof(unsigned int));
+ DARRAY *downwardWt1 = (DARRAY *)createDarray(1000,sizeof(unsigned int));// scaf links to scaffolds with those wt
+ DARRAY *downwardWt2 = (DARRAY *)createDarray(1000,sizeof(unsigned int));
+
+ int linkCount1 = getDSLink2Scaf(scafStack1,downwardTo1,downwardWt1);
+ int linkCount2 = getDSLink2Scaf(scafStack2,downwardTo2,downwardWt2);
+ if(!linkCount1||!linkCount2){
+ freeDarray(downwardTo1);
+ freeDarray(downwardTo2);
+ freeDarray(downwardWt1);
+ freeDarray(downwardWt2);
+ return 1;
+ }
+ boolean flag1 = isLinkReliable(downwardWt1,linkCount1);
+ boolean flag2 = isLinkReliable(downwardWt2,linkCount2);
+ if(!flag1||!flag2){
+ freeDarray(downwardTo1);
+ freeDarray(downwardTo2);
+ freeDarray(downwardWt1);
+ freeDarray(downwardWt2);
+ return 1;
+ }
+
+ unsigned int scaf;
+ int i,wt1,wt2,ret=1;
+
+ for(i=0;i=0){
+ thisCtg = *(unsigned int *)darrayGet(ctgArray,index);
+ cnt = getCntBetween(thisCtg,nextCtg);
+ if(cnt->maxGap>2)
+ break;
+ else
+ *start = index;
+ nextCtg = thisCtg;
+ index--;
+ }
+ unsigned int prevCtg = *(unsigned int *)darrayGet(ctgArray,weakest+1);
+ *finish = weakest+1;
+ index = weakest+2;
+ while(indexmaxGap>2)
+ break;
+ else
+ *finish = index;
+ prevCtg = thisCtg;
+ index++;
+ }
+
+}
+
+// Re-label the end vertex of every contig held in scafStack.
+// After stackRecover(), each popped contig gets to_vt = end, and its twin gets
+// from_vt = getTwinCtg(end). The stack is drained by this call.
+static void changeScafEnd(STACK *scafStack,unsigned int end)
+{
+	const unsigned int twinOfEnd = getTwinCtg(end);
+	unsigned int *slot;
+
+	stackRecover(scafStack);
+	for(slot=(unsigned int*)stackPop(scafStack);
+		slot!=NULL;
+		slot=(unsigned int*)stackPop(scafStack)){
+		const unsigned int member = *slot;
+
+		contig_array[member].to_vt = end;
+		contig_array[getTwinCtg(member)].from_vt = twinOfEnd;
+	}
+}
+
+// Re-label the start vertex of every contig held in scafStack.
+// After stackRecover(), each popped contig gets from_vt = start, and its twin
+// gets to_vt = getTwinCtg(start). The stack is drained by this call.
+static void changeScafBegin(STACK *scafStack,unsigned int start)
+{
+	const unsigned int twinOfStart = getTwinCtg(start);
+	unsigned int *slot;
+
+	stackRecover(scafStack);
+	for(slot=(unsigned int*)stackPop(scafStack);
+		slot!=NULL;
+		slot=(unsigned int*)stackPop(scafStack)){
+		const unsigned int member = *slot;
+
+		contig_array[member].from_vt = start;
+		contig_array[getTwinCtg(member)].to_vt = twinOfStart;
+	}
+}
+// break down scaffolds poorly supported by longer PE
+static void detectBreakScaf()
+{
+ fprintf(stderr,"[%s]entering this function.\n",__FUNCTION__);
+ unsigned int i,avgPE,scafLen,len,ctg,bal_ctg,prevCtg,thisCtg;
+ long long peCounter,linkCounter;
+ int num3,num5,weakPoint,tempCounter,j,t,counter=0;
+ CONNECT *bindCnt,*cnt,*weakCnt;
+
+ STACK *scafStack1 = (STACK *)createStack(1000,sizeof(unsigned int));
+ STACK *scafStack2 = (STACK *)createStack(1000,sizeof(unsigned int));
+
+ for(i=1;i<=num_ctg;i++)
+ contig_array[i].flag = 0;
+ for(i=1;i<=num_ctg;i++){
+ if(contig_array[i].flag||contig_array[i].mask||!contig_array[i].downwardConnect)
+ continue;
+ bindCnt = getBindCnt(i);
+ if(!bindCnt)
+ continue;
+ //first scan get the average coverage by longer pe
+ num5 = num3 = peCounter = linkCounter = 0;
+ scafLen = contig_array[i].length;
+ ctg = i;
+ *(unsigned int *)darrayPut(scaf5,num5++) = i;
+ contig_array[i].flag = 1;
+ contig_array[getTwinCtg(i)].flag = 1;
+ while(bindCnt){
+ if(!bindCnt->bySmall)
+ break;
+ linkCounter++;
+ peCounter += bindCnt->maxGap;
+ ctg = bindCnt->contigID;
+ scafLen += contig_array[ctg].length;
+ *(unsigned int *)darrayPut(scaf5,num5++) = ctg;
+ bal_ctg = getTwinCtg(ctg);
+ contig_array[ctg].flag = 1;
+ contig_array[bal_ctg].flag = 1;
+ bindCnt = bindCnt->nextInScaf;
+ }
+
+ ctg = getTwinCtg(i);
+ bindCnt = getBindCnt(ctg);
+ while(bindCnt){
+ if(!bindCnt->bySmall)
+ break;
+ linkCounter++;
+ peCounter += bindCnt->maxGap;
+ ctg = bindCnt->contigID;
+ scafLen += contig_array[ctg].length;
+ bal_ctg = getTwinCtg(ctg);
+ contig_array[ctg].flag = 1;
+ contig_array[bal_ctg].flag = 1;
+ *(unsigned int *)darrayPut(scaf3,num3++) = bal_ctg;
+ bindCnt = bindCnt->nextInScaf;
+ }
+ if(linkCounter<1||scafLen<5000)
+ continue;
+
+ avgPE = peCounter/linkCounter;
+
+ if(avgPE<10)
+ continue;
+
+ tempCounter = 0;
+ for(j=num3-1;j>=0;j--)
+ *(unsigned int *)darrayPut(tempArray,tempCounter++) =
+ *(unsigned int *)darrayGet(scaf3,j);
+
+ for(j=0;jscafLen-2000)
+ break;
+ len += contig_array[thisCtg].length;
+ if(contig_array[prevCtg].from_vt!=contig_array[thisCtg].from_vt||
+ contig_array[prevCtg].indexInScaf>contig_array[thisCtg].indexInScaf){
+ prevCtg = thisCtg;
+ continue;
+ }
+ cnt = getCntBetween(prevCtg,thisCtg);
+ if(!weakCnt||weakCnt->maxGap>cnt->maxGap){
+ weakCnt = cnt;
+ weakPoint = t;
+ }
+ prevCtg = thisCtg;
+ }
+ if(!weakCnt||(weakCnt->maxGap>2&&weakCnt->maxGap>avgPE/5))
+ continue;
+ prevCtg = *(unsigned int *)darrayGet(tempArray,weakPoint-1);
+ thisCtg = *(unsigned int *)darrayGet(tempArray,weakPoint);
+ if(contig_array[prevCtg].from_vt!=contig_array[thisCtg].from_vt||
+ contig_array[prevCtg].indexInScaf>contig_array[thisCtg].indexInScaf){
+ printf("contig %d and %d not on the same scaff\n",prevCtg,thisCtg);
+ continue;
+ }
+ setConnectWP(prevCtg,thisCtg,1);
+ /*
+ fprintf(stderr,"scaffold len %d, avg long pe cov %d (%ld/%ld)\n",
+ scafLen,avgPE,peCounter,linkCounter);
+ fprintf(stderr,"Weak connect (%d) between %d(%dth of %d) and %d\n"
+ ,weakCnt->maxGap,prevCtg,weakPoint-1,tempCounter,thisCtg);
+ */
+ // set start and end to break down the scaffold
+ int index1,index2;
+ setBreakPoints(tempArray,tempCounter,weakPoint-1,&index1,&index2);
+ //fprintf(stderr,"break %d ->...-> %d\n",index1,index2);
+ unsigned int start = *(unsigned int*)darrayGet(tempArray,index1);
+ unsigned int finish = *(unsigned int*)darrayGet(tempArray,index2);
+ int len1 = getScaffold(getTwinCtg(start), scafStack1);
+ int len2 = getScaffold(finish, scafStack2);
+ if(len1<2000||len2<2000)
+ continue;
+ switch2twin(scafStack1);
+ int flag1 = checkScafConsist(scafStack1,scafStack2);
+
+ switch2twin(scafStack1);
+ switch2twin(scafStack2);
+ int flag2 = checkScafConsist(scafStack2,scafStack1);
+ if(!flag1||!flag2){
+ changeScafBegin(scafStack1,getTwinCtg(start));
+ changeScafEnd(scafStack2,getTwinCtg(finish));
+ //unbind links
+ unsigned int nextCtg = *(unsigned int *)darrayGet(tempArray,index1+1);
+ thisCtg = *(unsigned int *)darrayGet(tempArray,index1);
+ cnt=getCntBetween(getTwinCtg(nextCtg),getTwinCtg(thisCtg));
+ if(cnt->nextInScaf){
+ prevCtg = getTwinCtg(cnt->nextInScaf->contigID);
+ cnt->nextInScaf->prevInScaf = 0;
+ cnt = getCntBetween(prevCtg,thisCtg);
+ cnt->nextInScaf = NULL;
+ }
+ prevCtg = *(unsigned int *)darrayGet(tempArray,index2-1);
+ thisCtg = *(unsigned int *)darrayGet(tempArray,index2);
+ cnt = getCntBetween(prevCtg,thisCtg);
+ if(cnt->nextInScaf){
+ nextCtg = cnt->nextInScaf->contigID;
+ cnt->nextInScaf->prevInScaf= 0;
+ cnt = getCntBetween(getTwinCtg(nextCtg),getTwinCtg(thisCtg));
+ cnt->nextInScaf = NULL;
+ }
+ prevCtg = *(unsigned int *)darrayGet(tempArray,index1);
+ for(t=index1+1;t<=index2;t++){
+ thisCtg = *(unsigned int *)darrayGet(tempArray,t);
+ cnt = getCntBetween(prevCtg,thisCtg);
+ cnt->mask = 1;
+ cnt->nextInScaf=NULL;
+ cnt->prevInScaf = 0;
+ cnt = getCntBetween(getTwinCtg(thisCtg),getTwinCtg(prevCtg));
+ cnt->mask = 1;
+ cnt->nextInScaf=NULL;
+ cnt->prevInScaf = 0;
+ /*
+ fprintf(stderr,"(%d %d)/(%d %d) ",
+ prevCtg,thisCtg,getTwinCtg(thisCtg),getTwinCtg(prevCtg));
+ */
+ prevCtg = thisCtg;
+ }
+ //fprintf(stderr,": BREAKING\n");
+ counter++;
+ }
+ }
+
+ freeStack(scafStack1);
+ freeStack(scafStack2);
+ fprintf(stderr,"[%s]existing this function.\n",__FUNCTION__);
+ //printf("Report from checkScaf: %d scaffold segments broken\n",counter);
+}
+
+static boolean checkSimple(DARRAY *ctgArray,int count)
+{
+ int i;
+ unsigned int ctg;
+ for(i=0;iweak||cn_temp1->deleted){
+ cn_temp1 = cn_temp1->next;
+ continue;
+ }
+ ctg = cn_temp1->contigID;
+ if(checkConnect(ctg,i)){
+ counter++;
+ maskContig(i,1);
+ maskContig(ctg,1);
+ }
+ cn_temp1 = cn_temp1->next;
+ }
+
+ }
+ //printf("%d circles removed \n",counter);
+}
diff --git a/fusion/output_scaffold.c b/fusion/output_scaffold.c
new file mode 100755
index 0000000..2076101
--- /dev/null
+++ b/fusion/output_scaffold.c
@@ -0,0 +1,65 @@
+#include "stdinc.h"
+#include "newhash.h"
+#include "extfunc.h"
+#include "extvab.h"
+
+// Write the contig graph as a Graphviz digraph to "<outfile>.contig.gvz".
+// One edge line is emitted per contig i, from num_ctg down to 1:
+// V<from_vt> -> V<to_vt>, labelled "i(length)".
+// outfile: path prefix of the output file. If the resulting file name does not
+// fit in the local buffer, a message is printed to stderr and nothing is written.
+void output_contig_graph(char *outfile)
+{
+	char name[256];
+	FILE *fp;
+	unsigned int i;
+	int n;
+
+	// bounded formatting: an over-long prefix must not overrun name[]
+	n = snprintf(name,sizeof(name),"%s.contig.gvz",outfile);
+	if(n<0||(size_t)n>=sizeof(name)){
+		fprintf(stderr,"[%s]output file name too long for prefix %s\n",__FUNCTION__,outfile);
+		return;
+	}
+	fp = ckopen(name,"w");
+	fprintf(fp,"digraph G{\n");
+	fprintf(fp,"\tsize=\"512,512\";\n");
+
+	for(i=num_ctg;i>0;i--){
+		// i is unsigned, so it is printed with %u
+		fprintf(fp,"\tV%d -> V%d[label =\"%u(%d)\"];\n",contig_array[i].from_vt,contig_array[i].to_vt,i,contig_array[i].length);
+	}
+	fprintf(fp,"}\n");
+	fclose(fp);
+}
+// Write the scaffold graph as a Graphviz digraph to "<outfile>.scaffold.gvz".
+// For every contig i (num_ctg down to 1) each non-deleted downward connection
+// becomes an edge C<i>_<len> -> C<target>_<len>, labelled
+// "gapLen(flag_weight)", where flag is 1 when the connection has a prevInScaf
+// or nextInScaf link and 0 otherwise. Masked connections are still written,
+// drawn in red.
+// outfile: path prefix of the output file. If the resulting file name does not
+// fit in the local buffer, a message is printed to stderr and nothing is written.
+void output_scaf(char *outfile)
+{
+	char name[256];
+	FILE *fp;
+	unsigned int i;
+	CONNECT *connect;
+	boolean flag;
+	int n;
+
+	// bounded formatting: an over-long prefix must not overrun name[]
+	n = snprintf(name,sizeof(name),"%s.scaffold.gvz",outfile);
+	if(n<0||(size_t)n>=sizeof(name)){
+		fprintf(stderr,"[%s]output file name too long for prefix %s\n",__FUNCTION__,outfile);
+		return;
+	}
+	fp = ckopen(name,"w");
+	fprintf(fp,"digraph G{\n");
+	fprintf(fp,"\tsize=\"512,512\";\n");
+
+	for(i=num_ctg;i>0;i--){
+		for(connect=contig_array[i].downwardConnect;connect;connect=connect->next){
+			if(connect->deleted)
+				continue;
+			flag = (connect->prevInScaf||connect->nextInScaf) ? 1 : 0;
+			// the two edge styles differ only in the trailing colour attribute
+			fprintf(fp,"\tC%u_%d -> C%d_%d [label = \"%d(%d_%d)\"%s];\n"
+				,i,contig_array[i].length,connect->contigID,contig_array[connect->contigID].length,
+				connect->gapLen,flag,connect->weight,
+				connect->mask ? ", color = red" : "");
+		}
+	}
+	fprintf(fp,"}\n");
+	fclose(fp);
+}
+
+
diff --git a/fusion/potential.c b/fusion/potential.c
new file mode 100644
index 0000000..e6f48cd
--- /dev/null
+++ b/fusion/potential.c
@@ -0,0 +1,232 @@
+#include "stdinc.h"
+#include "newhash.h"
+#include "extfunc.h"
+#include "extvab.h"
+#include "dfibHeap.h"
+#include "fibHeap.h"
+#include "darray.h"
+
+
+//static CTGinHEAP *ctg4heapArray;
+extern int inputLinks(FILE *fp, int insertS,char *line);
+//unsigned int traverse(unsigned int node,int *far_count,unsigned int *farpath,
+ //int *curr_count,unsigned int *currpath,int *used_count,unsigned int *used,int *max_dist,int *node_dist);
+//static int *sub_arr;
+//static int sub_counter=0;
+// qsort()-style comparator that orders ints from largest to smallest.
+// a, b: pointers to the two ints being compared.
+// Returns 1 when *b > *a, -1 when *b < *a and 0 when they are equal.
+// The result is built from comparisons rather than the difference *b - *a,
+// because that subtraction overflows for operands of opposite sign and large
+// magnitude.
+int rev_comp (const void * a, const void * b)
+{
+	const int lhs = *(const int*)a;
+	const int rhs = *(const int*)b;
+
+	return (rhs>lhs) - (rhs<lhs);
+}
+void potential()
+{
+
+ char name[256],*line;
+ FILE *fp;
+ int i;
+ int flag2;
+
+ loadUpdatedEdges(graphfile);
+
+ if(!pes)
+ loadPEgrads(graphfile);
+
+ sprintf(name,"%s.links",graphfile);
+ fp = ckopen(name,"r");
+
+ createCntMemManager();
+ createCntLookupTable();
+
+ lineLen = 1024;
+ line = (char *)ckalloc(lineLen*sizeof(char));
+
+ fgets(line,lineLen,fp);
+ line[0] = '\0';
+ fprintf(stderr,"[%s]before inputLinks loop.\n",__FUNCTION__);
+ for(i=0;i=0){
+ int curr_bound=curr_boarder;
+ int curr_node=curr_path[curr_boarder--];
+ int base_dist=dist[curr_bound];
+ CONNECT *curr_cnt=contig_array[curr_node].downwardConnect;
+ while(curr_cnt){//push all adjacent connect
+ if(curr_cnt->weight<3||contig_array[curr_cnt->contigID].inSubGraph
+ ||contig_array[getTwinCtg(curr_cnt->contigID)].inSubGraph){
+ curr_cnt=curr_cnt->next;
+ continue;
+ }
+ curr_path[++curr_boarder]=curr_cnt->contigID;
+ contig_array[curr_cnt->contigID].inSubGraph=1;
+ contig_array[getTwinCtg(curr_cnt->contigID)].inSubGraph=1;
+ fprintf(stderr,"[%d] traversed %d %d .\n",__LINE__,curr_cnt->contigID,getTwinCtg(curr_cnt->contigID));
+ ++used;
+ dist[curr_boarder]=base_dist+
+ curr_cnt->gapLen+contig_array[curr_cnt->contigID].length;
+
+ if(dist[curr_boarder]>max_dist)
+ max_dist=dist[curr_boarder];
+ //fprintf(stderr,"curr_boarder %d node_dist %d max_dist %d \n",curr_boarder,
+ // dist[curr_boarder],max_dist);
+ curr_cnt=curr_cnt->next;
+ }
+
+ }
+ len+=max_dist;
+
+ //}
+ if(contig_array[getTwinCtg(i)].downwardConnect){
+ curr_boarder=0;
+ curr_path[curr_boarder]=i;
+ dist[curr_boarder]=0;
+
+ while(curr_boarder>=0){
+ int curr_bound=curr_boarder;
+ int curr_node=curr_path[curr_boarder--];
+ int base_dist=dist[curr_bound];
+ CONNECT *curr_cnt=contig_array[curr_node].downwardConnect;
+ while(curr_cnt){//push all adjacent connect
+ if(curr_cnt->weight<3||contig_array[curr_cnt->contigID].inSubGraph
+ ||contig_array[getTwinCtg(curr_cnt->contigID)].inSubGraph){
+ curr_cnt=curr_cnt->next;
+ continue;
+ }
+ curr_path[++curr_boarder]=curr_cnt->contigID;
+ contig_array[curr_cnt->contigID].inSubGraph=1;
+ contig_array[getTwinCtg(curr_cnt->contigID)].inSubGraph=1;
+ fprintf(stderr,"[%d] traversed %d %d .\n",__LINE__,curr_cnt->contigID,getTwinCtg(curr_cnt->contigID));
+ ++used;
+ dist[curr_boarder]=base_dist+
+ curr_cnt->gapLen+contig_array[curr_cnt->contigID].length;
+
+ if(dist[curr_boarder]>max_dist)
+ max_dist=dist[curr_boarder];
+ //fprintf(stderr,"curr_boarder %d node_dist %d max_dist %d \n",curr_boarder,
+ // dist[curr_boarder],max_dist);
+ curr_cnt=curr_cnt->next;
+ }
+
+ }
+ len+=max_dist;
+ }
+ /*int ii;
+ for(ii=0;ii=half)
+ break;
+ }
+ printf("N50 %d , half %lld.\n",predict[i],half);
+ printf("used contig %d",used);
+}
+
+/*
+unsigned int traverse(unsigned int node,int *far_count,unsigned int *farpath,
+ int *curr_count,unsigned int *currpath,int *used_count,unsigned int *used,int *max_dist,int *node_dist){
+ unsigned int bal = getTwinCtg(node);
+
+ currpath[(*curr_count)++]=node;
+ used[(*used_count)++]=node;
+ used[(*used_count)++]=bal;
+ contig_array[node].inSubGraph=1;
+ contig_array[bal].inSubGraph=1;
+
+ fprintf(stderr,"farcount %d curr_count %d node_dist %d max_dist %d.\n",*far_count,*curr_count,*node_dist,*max_dist);
+ CONNECT *tmp_cnt=contig_array[node].downwardConnect;
+ while(tmp_cnt){
+ unsigned int ctg,bal_ctg;
+ ctg=tmp_cnt->contigID;
+ bal_ctg=getTwinCtg(ctg);
+ if(contig_array[ctg].inSubGraph||contig_array[bal_ctg].inSubGraph
+ ||contig_array[ctg].flag||contig_array[bal_ctg].flag){
+ tmp_cnt=tmp_cnt->next;
+ continue;
+ }
+ *node_dist+=(tmp_cnt->gapLen+contig_array[ctg].length);
+ if(*node_dist>*max_dist){
+ int i;
+ for(i=0;i<*curr_count;++i){
+ farpath[i]=currpath[i];
+ }
+ *far_count=*curr_count;
+ *max_dist=*node_dist+tmp_cnt->gapLen;
+ }
+ traverse(tmp_cnt->contigID,far_count,farpath,curr_count,currpath,used_count,used,max_dist,node_dist);
+ *node_dist-=(tmp_cnt->gapLen+contig_array[ctg].length);
+ tmp_cnt=tmp_cnt->next;
+ }
+ --(*curr_count);
+
+ return 0;
+}
+*/
diff --git a/fusion/prepare.c b/fusion/prepare.c
new file mode 100644
index 0000000..6a04b3c
--- /dev/null
+++ b/fusion/prepare.c
@@ -0,0 +1,216 @@
+#include "stdinc.h"
+#include "newhash.h"
+#include "extfunc.h"
+#include "extvab.h"
+#include "ctype.h"
+boolean upper_rev(char *in,int in_len);
+void print_seq(FILE *out_file,char *sequence , int sequence_len);
+// Base-complement lookup table indexed by ASCII code. The table has 110
+// entries (indices 0..109); the only non-zero ones are
+//   'A'(65) -> 'T', 'C'(67) -> 'G', 'G'(71) -> 'C', 'N'(78) -> 'N', 'T'(84) -> 'A'.
+// Every other index, including the lower-case letters below 110, maps to 0.
+// The trailing //N comment on each row is the index of that row's first entry.
+char rev[]={0,0,0,0,0,0,0,0,0,0,//0
+ 0,0,0,0,0,0,0,0,0,0,//10
+ 0,0,0,0,0,0,0,0,0,0,//20
+ 0,0,0,0,0,0,0,0,0,0,//30
+ 0,0,0,0,0,0,0,0,0,0,//40
+ 0,0,0,0,0,0,0,0,0,0,//50
+ 0,0,0,0,0,'T',0,'G',0,0,//60
+ 0,'C',0,0,0,0,0,0,'N',0,//70
+ 0,0,0,0,'A',0,0,0,0,0,//80
+ 0,0,0,0,0,0,0,0,0,0,//90
+ 0,0,0,0,0,0,0,0,0,0,};//100
+// One input contig as buffered by data_prepare() before it is sorted by length
+// and written out.
+typedef struct io_ctg{
+ char *seq; // heap copy of the contig sequence text
+ int len; // length of seq
+ int bal; // 2 when upper_rev() returned 0 for the sequence, 1 otherwise
+ char *name; // heap copy of the name read from the contig's '>' header line
+}IO_CTG;
+
+// qsort() comparator for IO_CTG records: ascending by len.
+// Returns the difference of the two lengths (negative, zero or positive).
+static int cmp_ctg(const void *a,const void *b){
+	const int lenA = ((const IO_CTG *)a)->len;
+	const int lenB = ((const IO_CTG *)b)->len;
+
+	return lenA-lenB;
+}
+
+int data_prepare(){
+ char file_name[256];
+
+ FILE *basic;
+ sprintf(file_name,"%s.preGraphBasic",graphfile);
+ basic=ckopen(file_name,"w");
+ fprintf(basic,"VERTEX 605681 K %d",overlaplen);
+ fprintf(basic,"\nEDGEs 1861091\n\nMaxReadLen 100 MinReadLen 0 MaxNameLen 256\n");
+ fclose(basic);
+
+ //char **ctg_seq=(char **)ckalloc(100000000*sizeof(char *));
+ //int *ctg_bal=(int *)ckalloc(100000000*sizeof(int));
+ //int *ctg_len=(int *)ckalloc(100000000*sizeof(int));
+
+ FILE *ctg_fp;
+ ctg_fp=ckopen(ctg_file,"r");
+ FILE *update,*index,*new_ctg;
+ sprintf(file_name, "%s.contig", graphfile);
+ new_ctg=ckopen(file_name,"w");
+ FILE *conver;
+ sprintf(file_name,"%s.conver", graphfile);
+ conver=ckopen(file_name,"w");
+
+
+ char *line;
+ line= (char *)ckalloc(100000000*sizeof(char ));
+ char orig_name[1024];
+ char *seq;
+ IO_CTG *pre_ctg=(IO_CTG *)ckalloc(1000000000*sizeof(IO_CTG));
+
+ seq=(char *)malloc(1000000000*sizeof(char));
+ int cul_id=1;
+ int total=0;
+ fgets(line,100000000*sizeof(char ),ctg_fp);
+ sscanf(line,">%s",orig_name);
+ int len=0;
+ //fprintf(stderr,"reach here %d\n",__LINE__);
+ while(fgets(line,100000000*sizeof(char ),ctg_fp)!=NULL){
+ if(line[0]=='>'){
+ if(len%s",orig_name);
+ seq[0]='\0';
+ len=0;
+ continue;
+ }
+
+ boolean flag=upper_rev(seq,len);
+ //fprintf(new_ctg,">%d length %d\n",cul_id,len);
+ //fprintf(conver,"%s\t%d\t%d\n",orig_name,cul_id,len);
+ //print_seq(new_ctg,seq,len);
+ //fprintf(new_ctg,"%s\n",seq);
+ char *one_seq=(char *)ckalloc((len+100)*sizeof(char));
+ strcpy(one_seq,seq);
+
+ if(flag==0){
+ pre_ctg[++total].seq=one_seq;
+ pre_ctg[total].bal=2;
+ pre_ctg[total].len=len;
+ pre_ctg[total].name=(char *)malloc((strlen(orig_name)+1)*sizeof(char));
+ strcpy(pre_ctg[total].name,orig_name);
+ //pre_ctg[++cul_id].bal=0;
+ cul_id+=2;
+ }else{
+ pre_ctg[++total].seq=one_seq;
+ pre_ctg[total].len=len;
+ pre_ctg[total].bal=1;
+ pre_ctg[total].name=(char *)malloc((strlen(orig_name)+1)*sizeof(char));
+ strcpy(pre_ctg[total].name,orig_name);
+ ++cul_id;
+ }
+
+ sscanf(line,">%s",orig_name);
+ seq[0]='\0';
+ len=0;
+ }else{
+ //strcat(seq,line);//effective?
+ int single_len=strlen(line);
+ line[single_len-1]='\0';
+ strcpy(&seq[len],line);
+ len+=single_len-1;
+ }
+
+ }
+	// Flush the last contig of the input file: the read loop only stores a
+	// contig when it meets the next '>' header, so the final one is still
+	// pending in seq/orig_name here.
+	if(len>overlaplen){
+		boolean flag=upper_rev(seq,len);
+		char *one_seq=(char *)ckalloc((len+100)*sizeof(char));
+		strcpy(one_seq,seq);
+
+		pre_ctg[++total].seq=one_seq;
+		pre_ctg[total].len=len;
+		// +1 for the terminating NUL that strcpy() writes
+		pre_ctg[total].name=(char *)malloc((strlen(orig_name)+1)*sizeof(char));
+		strcpy(pre_ctg[total].name,orig_name);
+
+		if(flag==0){
+			// upper_rev() returned 0: the contig takes two ids
+			pre_ctg[total].bal=2;
+			cul_id+=2;
+		}else{
+			pre_ctg[total].bal=1;
+			++cul_id;
+		}
+	}
+ fprintf(stderr,"All contigs loaded.\n");
+ sprintf(file_name, "%s.updated.edge", graphfile);
+ update=ckopen(file_name,"w");
+ sprintf(file_name, "%s.ContigIndex", graphfile);
+ index=ckopen(file_name,"w");
+ fprintf(update,"EDGEs %d\n",cul_id);
+ fprintf(index,"Edge_num %d %d\nindex\tlength\treverseComplement\n",cul_id,total);
+ qsort(&pre_ctg[1],total,sizeof(IO_CTG),cmp_ctg);
+
+	// Emit the sorted contigs: sequence to new_ctg, name/id mapping to conver,
+	// edge header(s) to update and one index row per contig.
+	int i;
+	cul_id=0;
+	for(i=1;i<=total;++i){
+		// take the length before anything is printed, so every header line
+		// carries this contig's own length rather than the previous one's
+		len=pre_ctg[i].len;
+		fprintf(new_ctg,">%d length %d\n",++cul_id,len);
+		print_seq(new_ctg,pre_ctg[i].seq,len);
+		fprintf(conver,"%s\t%d\t%d\n",pre_ctg[i].name,cul_id,len);
+
+		if(pre_ctg[i].bal==2){
+			fprintf(update,">length %d,1,8\n",len);
+			fprintf(update,">length %d,-1,8\n",len);
+			// the post-increment skips the id of the second edge written above
+			fprintf(index,"%d\t%d\t1\n",cul_id++,len);
+		}else{
+			if(overlaplen<=31){
+				fprintf(update,">length %d,fffffffffff,fffffffffff,0,8\n",len);
+			}else{
+				fprintf(update,">length %d,0,8\n",len);
+			}
+			fprintf(index,"%d\t%d\t0\n",cul_id,len);
+		}
+	}
+
+	// Close every stream that is still open so that buffered output reaches
+	// disk before anything re-reads these files.
+	fclose(ctg_fp);
+	fclose(new_ctg);
+	fclose(conver);
+	fclose(update);
+	fclose(index);
+
+	// Run "touch <graphfile>.Arc"; the command is only executed when it fits
+	// in file_name[] untruncated.
+	int cmd_len=snprintf(file_name,sizeof(file_name),"touch %s.Arc",graphfile);
+	if(cmd_len>0&&(size_t)cmd_len<sizeof(file_name))
+		system(file_name);
+	else
+		fprintf(stderr,"[%s]prefix too long, %s.Arc not touched\n",__FUNCTION__,graphfile);
+	return 0;
+}
+
+//return value: 0 if `in` is not equal to its reverse complement,
+//              1 if `in` is equal to its reverse complement
+boolean upper_rev(char *in,int in_len){
+ int i,it_num;
+
+ boolean ret_flag=1;
+ it_num=in_len/2;
+
+ for(i=0;ithreadID;
+ //printf("%dth thread with threadID %d, hash_table %p\n",id,prm.threadID,prm.hash_table);
+ while(1){
+ if(*(prm->selfSignal)==1){
+ unsigned int seq_index=0;
+ unsigned int pos = 0;
+ for(i=0;iselfSignal) = 0;
+ }else if(*(prm->selfSignal)==2){
+ for(i=0;iselfSignal) = 0;
+ }
+ else if(*(prm->selfSignal)==3){
+ *(prm->selfSignal) = 0;
+ break;
+ }
+ usleep(1);
+ }
+}
+
+// Insert kmerBuffer[t] into kset through put_kmerset() and fill in the node.
+// When put_kmerset() returns 0 the node records:
+//   twin    = 0 if smallerBuffer[t] is set, 1 otherwise
+//   l_links = ctgIdArray[seq_index]
+//   r_links = pos
+// When it returns non-zero the node is marked deleted instead.
+// NOTE(review): a non-zero return presumably means the kmer was already in the
+// set - confirm against put_kmerset().
+static void singleKmer(int t,KmerSet *kset,
+			unsigned int seq_index,unsigned int pos)
+{
+	kmer_t *node;
+	boolean existed = put_kmerset(kset, kmerBuffer[t], 4, 4,&node);
+
+	if(existed){
+		node->deleted = 1;
+		return;
+	}
+
+	node->twin = smallerBuffer[t] ? 0 : 1;
+	node->l_links = ctgIdArray[seq_index];
+	node->r_links = pos;
+}
+
+static void creatThrds(pthread_t *threads,PARAMETER *paras)
+{
+ unsigned char i;
+ int temp;
+
+ for(i=0;i='0'&&name[0]<='9')
+ return atoi(&(name[0]));
+ else
+ return 0;
+}
+
+boolean prlContig2nodes(char *grapfile,int len_cut)
+{
+ long long i,num_seq;
+ char name[256],*next_name;
+ FILE *fp;
+ pthread_t threads[thrd_num];
+ time_t start_t,stop_t;
+ unsigned char thrdSignal[thrd_num+1];
+ PARAMETER paras[thrd_num];
+ int maxCtgLen,minCtgLen,nameLen;
+ unsigned int lenSum,contigId;
+
+ WORDFILTER = (((Kmer) 1) << (2*overlaplen)) - 1;
+ time(&start_t);
+ sprintf(name,"%s.contig",grapfile);
+ fp = ckopen(name, "r");
+ maxCtgLen = nameLen = 10;
+ minCtgLen = 1000;
+ num_seq = readseqpar(&maxCtgLen,&minCtgLen,&nameLen,fp);
+ //printf("\nthere're %lld contigs in file: %s, max seq len %d, min seq len %d, max name len %d\n",
+ //num_seq,grapfile,maxCtgLen,minCtgLen,nameLen);
+ maxReadLen = maxCtgLen;
+ fclose(fp);
+ time(&stop_t);
+ //printf("time spent on parse contigs file %ds\n",(int)(stop_t-start_t));
+
+ next_name = (char *)ckalloc((maxNameLen+1)*sizeof(char));
+
+ // extract all the EDONs
+ seq_buffer_size=buffer_size*2;
+ max_read_c=seq_buffer_size/20;
+
+ kmerBuffer = (Kmer *)ckalloc(buffer_size*sizeof(Kmer));
+ hashBanBuffer = (Kmer *)ckalloc(buffer_size*sizeof(Kmer));
+ smallerBuffer = (boolean *)ckalloc(buffer_size*sizeof(boolean));
+
+ seqBuffer = (char *)ckalloc(seq_buffer_size*sizeof(char));
+ lenBuffer = (int *)ckalloc(max_read_c*sizeof(int));
+ indexArray = (unsigned int *)ckalloc((max_read_c+1)*sizeof(unsigned int));
+ seqBreakers = (unsigned int *)ckalloc((max_read_c+1)*sizeof(unsigned int));
+ ctgIdArray = (int *)ckalloc(max_read_c*sizeof(int));
+
+ fp = ckopen(name, "r");
+ //node_mem_manager = createMem_manager(EDONBLOCKSIZE,sizeof(EDON));
+ rcSeq = (char **)ckalloc((thrd_num+1)*sizeof(char*));
+ if(1){
+ kmerCounter = (long long *)ckalloc((thrd_num+1)*sizeof(long long));
+ KmerSets = (KmerSet **)ckalloc(thrd_num*sizeof(KmerSet *));
+ for(i=0;i0 ? contigId:i;
+ lenSum += lenBuffer[read_c];
+ kmer_c += lenBuffer[read_c] - overlaplen + 1;
+ read_c++;
+ seqBreakers[read_c] = lenSum;
+ indexArray[read_c] = kmer_c;
+ //printf("seq %d start at %d\n",read_c,seqBreakers[read_c]);
+ if(read_c==max_read_c||(lenSum+maxCtgLen)>seq_buffer_size||(kmer_c+maxCtgLen-overlaplen+1)>buffer_size){
+ kmerCounter[0] += kmer_c;
+ sendWorkSignal(2,thrdSignal);
+ sendWorkSignal(1,thrdSignal);
+
+ kmer_c = read_c = lenSum = 0;
+ }
+
+ }
+ if(read_c){
+ kmerCounter[0] += kmer_c;
+ sendWorkSignal(2,thrdSignal);
+ sendWorkSignal(1,thrdSignal);
+ }
+
+ sendWorkSignal(3,thrdSignal);
+
+ thread_wait(threads);
+ time(&stop_t);
+ //printf("time spent on hash reads: %ds\n",(int)(stop_t-start_t));
+ if(1){
+ unsigned long long alloCounter = 0;
+ unsigned long long allKmerCounter = 0;
+ for(i=0;iA G C T
+ {2, 7, 3, 1}, // C->C T A G
+ {1, 3, 7, 2}, // T->T C G A
+ {3, 1, 2, 7} // G->G A T C
+};
+
+// Masks that each select one 2-bit field of a 16-bit word: entry k covers bits
+// 2k and 2k+1, from bits 0-1 (entry 0) up to bits 12-13 (entry 6). The binary
+// pattern of each mask is spelled out beside it.
+static ubyte2 doubleBitMasker[7] = {
+ 0x3, //000000 00000011
+ 0xC, //000000 00001100
+ 0x30, //000000 00110000
+ 0xC0, //000000 11000000
+ 0x300, //000011 00000000
+ 0xC00, //001100 00000000
+ 0x3000 //110000 00000000
+};
+
+static boolean staticFlag=1;
+
+static long long readsInGap=0;
+
+static int buffer_size=10000000;
+static long long readCounter;
+static long long mapCounter;
+long long single_count;
+long long single_map;
+static int ALIGNLEN=0;
+//buffer-related variables for chopping reads into kmers
+static int read_c;
+static char **rcSeq;
+static char **seqBuffer;
+static int *lenBuffer; // length of each buffered read
+static unsigned int *ctgIdArray; // contig each read was placed on; 0 when the read is too short or unmatched
+static int *posArray; // position assigned to each read on its contig
+static char *orienArray; // '+' or '-' orientation of each placement
+static char *footprint; // flag indicating whether the read should leave markers on contigs
+
+// kmer related variables
+static int kmer_c;
+static Kmer *kmerBuffer,*hashBanBuffer;
+static kmer_t **nodeBuffer;
+static boolean *smallerBuffer;
+static unsigned int *indexArray;
+
+static int *deletion;
+
+static void parse1read(int t,int threadID);
+static void threadRoutine(void *thrdID);
+static void searchKmer(int t,KmerSet *kset);
+static void chopKmer4read(int t,int threadID);
+static void thread_wait(pthread_t *threads);
+
+static void creatThrds(pthread_t *threads,PARAMETER *paras)
+{
+ unsigned char i;
+ int temp;
+
+ for(i=0;ithreadID;
+ //printf("%dth thread with task %d, hash_table %p\n",id,prm.task,prm.hash_table);
+ while(1){
+ if(*(prm->selfSignal)==1){
+ for(i=0;iselfSignal) = 0;
+ }else if(*(prm->selfSignal)==2){
+ for(i=0;iselfSignal) = 0;
+ }else if(*(prm->selfSignal)==3){
+ // parse reads
+ for(t=0;tselfSignal) = 0;
+ }else if(*(prm->selfSignal)==5){
+ *(prm->selfSignal) = 0;
+ break;
+ }
+
+ usleep(1);
+ }
+}
+/*
+static void chopReads()
+{
+ int i;
+ for(i=0;isearchCnt;
+
+ if(found)
+ {
+ ++kset->foundCnt;
+ if(!node->deleted)
+ nodeBuffer[t] = node;
+ else
+ {
+ ++kset->delCnt;
+ nodeBuffer[t] = NULL;
+ }
+ }
+ else
+ {
+ ++kset->searchSpcSeedCnt;
+
+ boolean spcFlag;
+ Kmer buff_kmer, spc_kmer;
+ ubyte2 spc_bases;
+ spcKmer *rs;
+ spcBase *tmpBase;
+
+ buff_kmer=kmerBuffer[t];
+ spc_kmer = ((buff_kmer>>14)&0xFFFFFFF00) | ((buff_kmer>>12)&0xC0) | ((buff_kmer>>10)&0x3C) | ((buff_kmer>>6)&0x3);
+ spc_bases = ((buff_kmer>>8)&0x3000) | ((buff_kmer>>6)&0xC00) | ((buff_kmer>>2)&0x3C0) | (buff_kmer&0x3F);
+
+ spcFlag = search_spckmerset(spcSet, spc_kmer, &rs);
+
+ if(spcFlag)
+ {
+ ++kset->getSpcSeedCnt;
+
+ int i=0,j=0,getFlag=-1;
+ int mismatch=0;
+ ubyte2 tmp,mostLastBase; //loci flags
+ ubyte2 bestSpcBases; //best spaced bases
+ int min_mis=31;
+ ubyte2 tmpSpcBase;
+
+ tmpBase=rs->start;
+
+ //fprintf(stderr,"search %llu\tspc_kmer %u\tspc_bases %u\n", kmerBuffer[t], spc_kmer, spc_bases);
+
+ while(tmpBase != NULL)
+ {
+ tmpSpcBase = tmpBase->spaced_bases;
+ tmp = ((spc_bases ^ tmpSpcBase) & 0x5555) | (((spc_bases ^ tmpSpcBase) & 0xAAAA)>>1);
+ mismatch=binLight[tmp];
+
+ if(mismatch < min_mis) //get the minimal mismatch spaced_bases
+ {
+ min_mis = mismatch;
+ mostLastBase = tmp;
+ bestSpcBases = tmpSpcBase;
+ node = tmpBase->large_kmer;
+ getFlag=0;
+ }
+ else if(mismatch == min_mis) //if same amount of mismatch, choose the most right mismatch pos
+ {
+ if(tmplarge_kmer;
+ getFlag=1;
+ }
+ else if(tmp == mostLastBase) //if same mismatch pos, choose the most probable one[see probableMatrix]
+ {
+/*
+static ubyte probableMatrix[4][4] = {
+//A C T G 7 3 2 1
+7, 2, 1, 3, // A->A G C T
+2, 7, 3, 1, // C->C T A G
+1, 3, 7, 2, // T->T C G A
+3, 1, 2, 7 // G->G A T C
+};
+*/
+ getFlag=2;
+ ubyte2 readBases = spc_bases, loopBases = tmpSpcBase, bestBases = bestSpcBases, mismatchFlag = tmp;
+ for(j=0;j<7;j++)
+ {
+ if((mismatchFlag & 0x3) > 0)
+ {
+ if(probableMatrix[(readBases & 0x3)][(loopBases & 0x3)] > probableMatrix[(readBases & 0x3)][(bestBases & 0x3)])
+ //check each 2 bits(1 base) if mismatch
+ {
+ mostLastBase = tmp;
+ bestSpcBases = tmpSpcBase;
+ node = tmpBase->large_kmer;
+ break;
+ }
+ else if((probableMatrix[(readBases & 0x3)][(loopBases & 0x3)] < probableMatrix[(readBases & 0x3)][(bestBases & 0x3)]))
+ break;
+ }
+ mismatchFlag>>=2;
+ readBases>>=2;
+ loopBases>>=2;
+ bestBases>>=2;
+ }
+ }
+ }
+
+ tmpBase = tmpBase->next;
+ }
+
+ if(getFlag<0)
+ {
+ fprintf(stderr,"getFlag error at %llu",kmerBuffer[t]);
+ exit(-1);
+ }
+ ++kset->levelGet[getFlag];
+ nodeBuffer[t] = node;
+ }
+ else
+ nodeBuffer[t] = NULL;
+ }
+}
+
+static void parse1read(int t,int threadID)
+{
+ unsigned int j,i,s;
+ unsigned int contigID;
+ int counter2=0,counter;
+ unsigned int ctgLen,pos;
+ kmer_t *node;
+ boolean isSmaller;
+ int flag,maxOcc=0;
+ kmer_t *maxNode=NULL;
+ int alldgnLen = lenBuffer[t] > ALIGNLEN ? ALIGNLEN:lenBuffer[t];
+ int multi = alldgnLen-overlaplen+1 < 5 ? 5:alldgnLen-overlaplen+1;
+ unsigned int start,finish;
+
+ footprint[t] = 0;
+
+ start = indexArray[t];
+ finish = indexArray[t+1];
+ if(finish==start){ //too short
+ ctgIdArray[t] = 0;
+ deletion[threadID]++;
+ return;
+ }
+ for(j=start;jl_links==node->l_links){
+ flag++;
+ nodeBuffer[s] = NULL;
+ }
+ }
+ if(flag>=2)
+ counter2++; //a loose alignment
+ if(flag>=multi)
+ counter++;
+ else
+ continue;
+ if(flag>maxOcc){
+ pos = j;
+ maxOcc = flag;
+ maxNode = node;
+ }
+ }
+ if(!counter){ //no match
+ ctgIdArray[t] = 0;
+ return;
+ }
+ if(counter2>1)
+ footprint[t] = 1; //aligned to multi contigs
+
+ j = pos;
+ i = pos - start + 1;
+ node = nodeBuffer[j];
+ isSmaller = smallerBuffer[j];
+ contigID = node->l_links;
+ ctgLen = contig_array[contigID].length;
+ pos = node->r_links;
+ if(node->twin==isSmaller){
+ orienArray[t] = '-';
+ ctgIdArray[t] = getTwinCtg(contigID);
+ posArray[t] = ctgLen - pos -overlaplen -i + 1;
+ }else{
+ orienArray[t] = '+';
+ ctgIdArray[t] = contigID;
+ posArray[t] = pos - i + 1;
+ }
+
+}
+
+static void sendWorkSignal(unsigned char SIG,unsigned char *thrdSignals)
+{
+ int t;
+
+ for(t=0;tl_links;
+ ctgLen = contig_array[contigID].length;
+ pos = node->r_links;
+ if(node->twin==isSmaller){
+ ctgIdArray[t] = getTwinCtg(contigID);
+ posArray[t] = ctgLen - pos -overlaplen -i + 1;
+ }else{
+ ctgIdArray[t] = contigID;
+ posArray[t] = pos - i + 1;
+ }
+ }
+
+}
+
+static void output1read(int t, FILE *outfp)
+{
+ int len = lenBuffer[t];
+ int index;
+ readsInGap++;
+/*
+ if(ctgIdArray[t]==735||ctgIdArray[t]==getTwinCtg(735)){
+ printf("%d\t%d\t%d\t",t+1,ctgIdArray[t],posArray[t]);
+ int j;
+ for(j=0;j R2 <-- R1
+ output1read(read1,outfp);
+ }else{
+ read2 = t;
+ read1 = t - 1;
+ ctgIdArray[read2] = ctgIdArray[read1];
+ posArray[read2] = posArray[read1] + insSize - lenBuffer[read2]; // --> R1 <-- R2
+ output1read(read2,outfp);
+ }
+}
+
+static void recordLongRead(FILE *outfp)
+{
+ int t;
+
+ for(t=0;t0){
+ getReadIngap(t,insSize,outfp2,0); //read 2 in gap
+ rd2gap = 1;
+ }
+ else if(ctgIdArray[t]>0&&ctgIdArray[t-1]<1){
+ getReadIngap(t-1,insSize,outfp2,1); //read 1 in gap
+ rd1gap = 1;
+ }
+ }
+ if(ctgId<1)
+ continue;
+ mapCounter++;
+ single_map++;
+ fprintf(outfp,"%lld\t%u\t%d\t%c\n",readCounter,
+ ctgIdArray[t],posArray[t],orienArray[t]);
+ if(t%2==0)
+ continue;
+ if(outfp2&&footprint[t-1]&&!rd1gap)
+ output1read(t-1,outfp2);
+ if(outfp2&&footprint[t]&&!rd2gap)
+ output1read(t,outfp2);
+
+ }
+}
+
+//load contig index and length
+/*
+ * Read "<infile>.ContigIndex" and fill the global contig_array / num_ctg.
+ *
+ * The first line carries the declared contig count (parsed 8 characters in);
+ * the second line is read and discarded. Every following record is
+ * "index length bal_ed": it fills one slot of contig_array, plus a second
+ * slot with the same length when bal_ed is non-zero.
+ *
+ * Exits the program when the file name does not fit, the header cannot be
+ * parsed, or the file holds more contigs than the header declared.
+ */
+void basicContigInfo(char *infile)
+{
+	char name[256],lldne[1024];
+	FILE *fp;
+	int length,bal_ed,num_all=0,num_long=0,index;
+
+	// bounded formatting: a long prefix must not run past name[]
+	if(snprintf(name,sizeof(name),"%s.ContigIndex",infile)>=(int)sizeof(name)){
+		fprintf(stderr,"basicContigInfo: file name too long for %s.ContigIndex\n",infile);
+		exit(-1);
+	}
+	fp = ckopen(name,"r");
+
+	// without a valid header the allocation size below would be garbage
+	if(fgets(lldne,sizeof(lldne),fp)==NULL||
+		sscanf(lldne+8,"%d %d",&num_all,&num_long)<1||num_all<0){
+		fprintf(stderr,"basicContigInfo: cannot parse header of %s\n",name);
+		exit(-1);
+	}
+	//printf("%d edges in graph\n",num_all);
+	num_ctg = num_all;
+	contig_array = (CONTIG *)ckalloc((num_all+1)*sizeof(CONTIG));
+
+	fgets(lldne,sizeof(lldne),fp);	// second line is not used
+	num_long = 0;
+	while(fgets(lldne,sizeof(lldne),fp)!=NULL){
+		// skip lines that do not hold a full record instead of storing stale values
+		if(sscanf(lldne,"%d %d %d",&index,&length,&bal_ed)!=3)
+			continue;
+
+		// slots 1..num_all were allocated; refuse to write past them
+		if(num_long+(bal_ed==0 ? 1:2)>num_all){
+			fprintf(stderr,"basicContigInfo: %s holds more contigs than the %d declared\n",name,num_all);
+			exit(-1);
+		}
+
+		contig_array[++num_long].length = length;
+		contig_array[num_long].bal_edge = bal_ed+1;
+		if(index!=num_long)
+			printf("basicContigInfo: %d vs %d\n",index,num_long);
+		if(bal_ed==0)
+			continue;
+		// non-zero bal_ed: the next slot gets the same length
+		contig_array[++num_long].length = length;
+		contig_array[num_long].bal_edge = -bal_ed+1;
+
+	}
+
+	fclose(fp);
+}
+
+void prlRead2Ctg(char *libfile,char *outfile)
+{
+ long long i;
+ char *src_name,*next_name,name[256];
+ FILE *fo,*outfp2=NULL;
+ int maxReadNum,libNo,prevLibNo,insSize;
+ boolean flag,pairs=1;
+ pthread_t threads[thrd_num];
+ unsigned char thrdSignal[thrd_num+1];
+ PARAMETER paras[thrd_num];
+
+ maxReadLen = 0;
+ maxNameLen = 256;
+ scan_libInfo(libfile);
+ alloc_pe_mem(num_libs);
+ if(!maxReadLen)
+ maxReadLen = 100;
+ //printf("In file: %s, max seq len %d, max name len %d\n\n",
+ //libfile,maxReadLen,maxNameLen);
+ if(maxReadLen>maxReadLen4all)
+ maxReadLen4all = maxReadLen;
+
+//////////////////////////////////////////// spcSet
+ fflush(stdout);
+
+ ubyte2 spc_i,spc_j;
+ for(spc_i=0;spc_i<16384;spc_i++)
+ {
+ binLight[spc_i]=0;
+ for(spc_j=spc_i;spc_j;spc_j=spc_j&(spc_j-1))
+ ++binLight[spc_i];
+ }
+
+ spcSet = init_spckmerset(KmerSets[thrd_num-1]->size*thrd_num, 0.77f);
+
+
+ for(i=0;i1000)
+ ALIGNLEN = ALIGNLEN < 35 ? 35: ALIGNLEN;
+ else
+ ALIGNLEN = ALIGNLEN < 32 ? 32: ALIGNLEN;
+ //printf("current insert size %d, map_len %d\n",insSize,ALIGNLEN);
+
+ }
+
+ if(insSize>1000)
+ ALIGNLEN = ALIGNLEN < (lenBuffer[read_c]/2+1) ? (lenBuffer[read_c]/2+1):ALIGNLEN;
+
+// if((++i)%100000000==0)
+// printf("[%s]%lld reads processed.\n",__FUNCTION__,i);
+ indexArray[read_c] = kmer_c;
+ if(lenBuffer[read_c] >= overlaplen+1)
+ kmer_c += lenBuffer[read_c] - overlaplen + 1;
+ read_c++;
+ if(read_c==maxReadNum){
+ //mvnv(0,"Start processing reads.");
+
+ indexArray[read_c] = kmer_c;
+
+ sendWorkSignal(2,thrdSignal);
+ //mvnv(0,"chop finished one buffer.");
+ sendWorkSignal(1,thrdSignal);
+ //mvnv(0,"search finished one buffer.");
+ sendWorkSignal(3,thrdSignal);
+ //mvnv(0,"parse finished one buffer.");
+
+ recordAlldgn(fo,insSize,outfp2);
+ kmer_c = 0;
+ read_c = 0;
+ }
+ }
+
+ if(read_c){
+ indexArray[read_c] = kmer_c;
+ sendWorkSignal(2,thrdSignal);
+ sendWorkSignal(1,thrdSignal);
+ sendWorkSignal(3,thrdSignal);
+ recordAlldgn(fo,insSize,outfp2);
+ //printf("Output %lld out of %lld (%.1f)%% reads in gaps\n",readsInGap,readCounter,
+ // (float)readsInGap/readCounter*100);
+ }
+ if(readCounter)
+ printf("[%s]total %llu reads , map-rate (%.1f)%%\n",__FUNCTION__,
+ readCounter,(float)mapCounter/readCounter*100);
+ sendWorkSignal(5,thrdSignal);
+
+ thread_wait(threads);
+ fclose(fo);
+
+ sprintf(name,"%s.peGrads",outfile);
+ fo = ckopen(name,"w");
+ fprintf(fo,"grads&num: %d\t%lld\t%d\n",gradsCounter,n_solexa,maxReadLen4all);
+ if(pairs){
+ if(gradsCounter)
+ ;
+ //printf("%d pe insert size, the largest boundary is %lld\n\n",
+ //gradsCounter,pes[gradsCounter-1].PE_bound);
+ else
+ printf("no paired reads found\n");
+ for(i=0;isearchCnt;
+ foundCntTot += KmerSets[i]->foundCnt;
+ delCntTot += KmerSets[i]->delCnt;
+ searchSpcSeedCntTot += KmerSets[i]->searchSpcSeedCnt;
+ getSpcSeedCntTot += KmerSets[i]->getSpcSeedCnt;
+ levelGet1 += KmerSets[i]->levelGet[0];
+ levelGet2 += KmerSets[i]->levelGet[1];
+ levelGet3 += KmerSets[i]->levelGet[2];
+ }
+ fprintf(stderr,"SEARCH: Search %llu, get %llu, deleted %llu\n",
+ searchCntTot, foundCntTot, delCntTot);
+ fprintf(stderr,"SPACED SEED: Search %llu, get %llu, LVnum %llu, LVpos %llu, LVpro %llu\n",
+ searchSpcSeedCntTot, getSpcSeedCntTot, levelGet1, levelGet2, levelGet3);
+
+ free((void *)rcSeq);
+ free((void *)deletion);
+ for(i=0;i= overlaplen+1)
+ kmer_c += lenBuffer[read_c] - overlaplen + 1;
+ read_c++;
+ if(read_c==maxReadNum){
+ indexArray[read_c] = kmer_c;
+
+ sendWorkSignal(2,thrdSignal);
+ sendWorkSignal(1,thrdSignal);
+ sendWorkSignal(3,thrdSignal);
+
+ recordLongRead(outfp2);
+ kmer_c = 0;
+ read_c = 0;
+ }
+ }
+
+ if(read_c){
+ indexArray[read_c] = kmer_c;
+ sendWorkSignal(2,thrdSignal);
+ sendWorkSignal(1,thrdSignal);
+ sendWorkSignal(3,thrdSignal);
+ recordLongRead(outfp2);
+ //printf("Output %lld out of %lld (%.1f)%% reads in gaps\n",readsInGap,readCounter,
+ // (float)readsInGap/readCounter*100);
+ }
+
+ sendWorkSignal(5,thrdSignal);
+
+ thread_wait(threads);
+
+ fclose(outfp2);
+
+ free_libs();
+ if(1){ // multi-threads
+ for(i=0;iread\n");
+ for(j=0;jlen = len;
+ rd->dis = pos;
+ rd->seqStarter = starter;
+}
+
+/*
+ * Invert the global index_array in place: afterwards index_array[v] holds the
+ * contig number i for which index_array[i] was v before the call, or 0 when no
+ * contig carried the value v. Entries 1..num_ctg are processed.
+ */
+static void convertIndex()
+{
+	unsigned int ctg;
+	int *inverse;
+
+	inverse = (int *)ckalloc((num_ctg+1)*sizeof(int));
+
+	// start from an all-zero inverse table
+	ctg = 1;
+	while(ctg<=num_ctg)
+		inverse[ctg++] = 0;
+
+	// record, for every value in use, which contig carries it
+	for(ctg=1;ctg<=num_ctg;ctg++){
+		if(!(index_array[ctg]>0))
+			continue;
+		inverse[index_array[ctg]] = ctg;
+	}
+
+	// write the inverted mapping back over the original
+	ctg = 1;
+	while(ctg<=num_ctg){
+		index_array[ctg] = inverse[ctg];
+		ctg++;
+	}
+
+	free((void *)inverse);
+
+}
+
+/*
+ * Load binary read records from fp, one at a time.
+ *
+ * Each record is three ints (len, ctgID, pos) followed by len/4+1 bytes of
+ * sequence. The bytes are appended to readSeqInGap and the read is registered
+ * through attach1read2contig().
+ *
+ * Returns the number of records loaded; 0 when fp is NULL. Loading stops at
+ * the first short read, at the first record whose length would not fit the
+ * staging buffer, or when the dynamic array cannot grow.
+ */
+static long long getRead1by1(FILE *fp,DARRAY *readSeqInGap)
+{
+	long long readCounter=0;
+	if(!fp)
+		return readCounter;
+	int len,ctgID,pos;
+	long long starter;
+	char *pt;
+	char *freadBuf = (char *)ckalloc((maxReadLen/4+1)*sizeof(char));
+
+	while(fread(&len,sizeof(int),1,fp)==1){
+		if(fread(&ctgID,sizeof(int),1,fp)!=1)
+			break;
+		if(fread(&pos,sizeof(int),1,fp)!=1)
+			break;
+		// len comes straight from the file: never read more bytes than freadBuf holds
+		if(len<0||len/4>maxReadLen/4){
+			fprintf(stderr,"getRead1by1: read length %d does not fit buffer for max length %d\n",len,maxReadLen);
+			break;
+		}
+		if(fread(freadBuf,sizeof(char),len/4+1,fp)!=(unsigned)(len/4+1))
+			break;
+		//put seq to dynamic array
+		starter = readSeqInGap->item_c;
+		if(!darrayPut(readSeqInGap,starter+len/4)) // make sure there's room for this seq
+			break;
+		pt = (char *)darrayPut(readSeqInGap,starter);
+		bcopy(freadBuf,pt,len/4+1);
+		attach1read2contig(ctgID,len,pos,starter);
+		readCounter++;
+	}
+
+	free((void *)freadBuf);
+	return readCounter;
+}
+// Darray *readSeqInGap
+/*
+ * Load the reads stored in "<graphfile>.readInGap" and
+ * "<graphfile>.longReadInGap" into the global readSeqInGap array.
+ *
+ * Returns 0 when neither file can be opened, 1 otherwise. When orig2new is
+ * not yet set, index_array is converted first and orig2new is set to 1.
+ */
+static boolean loadReads4gap(char *graphfile)
+{
+	FILE *fp,*fp2;
+	char name[1024];
+
+	// snprintf keeps an over-long prefix from running past name[]
+	snprintf(name,sizeof(name),"%s.readInGap",graphfile);
+	fp = fopen(name,"rb");
+	snprintf(name,sizeof(name),"%s.longReadInGap",graphfile);
+	fp2 = fopen(name,"rb");
+	if(!fp&&!fp2)
+		return 0;
+
+	if(!orig2new){
+		convertIndex();
+		orig2new = 1;
+	}
+
+	readSeqInGap = (DARRAY *)createDarray(1000000,sizeof(char));
+	if(fp){
+		// the per-file read count is not needed here
+		(void)getRead1by1(fp,readSeqInGap);
+		fclose(fp);
+	}
+	if(fp2){
+		(void)getRead1by1(fp2,readSeqInGap);
+		fclose(fp2);
+	}
+	return 1;
+}
+
+/*
+ * Debug helper: print every read attached to the contig whose index_array
+ * value is 735 (distance, length, sequence offset, then the bases).
+ *
+ * When orig2new is set, index_array is inverted first and orig2new is
+ * cleared. The closeReads stack of the contig is backed up before popping and
+ * recovered afterwards, so it is left as it was found.
+ */
+static void debugging1()
+{
+	unsigned int i;
+	if(orig2new){
+		unsigned int *length_array = (unsigned int *)ckalloc((num_ctg+1)*sizeof(unsigned int));
+		//use length_array to change info in index_array
+		for(i=1;i<=num_ctg;i++)
+			length_array[i] = 0;
+
+		for(i=1;i<=num_ctg;i++){
+			if(index_array[i]>0)
+				length_array[index_array[i]] = i;
+		}
+		for(i=1;i<=num_ctg;i++)
+			index_array[i] = length_array[i]; //contig i with original index: index_array[i]
+		orig2new = 0;
+		// the scratch table is only needed for the inversion above
+		free((void *)length_array);
+	}
+	READNEARBY *rd;
+	int j;
+	char *pt;
+	for(i=1;i<=num_ctg;i++){
+		if(!contig_array[i].closeReads)
+			continue;
+		if(index_array[i]!=735)
+			continue;
+		//printf("contig %d, len %d: \n",index_array[i],contig_array[i].length);
+		stackBackup(contig_array[i].closeReads);
+		while((rd=(READNEARBY *)stackPop(contig_array[i].closeReads))!=NULL){
+			printf("%d\t%d\t%lld\t",rd->dis,rd->len,rd->seqStarter);
+			pt = (char *)darrayGet(readSeqInGap,rd->seqStarter);
+			// print one base per position of the read
+			for(j=0;j<rd->len;j++)
+				printf("%c",int2base((int)getCharInTightString(pt,j)));
+			printf("\n");
+		}
+		stackRecover(contig_array[i].closeReads);
+	}
+
+}
+
+/*
+ * Give a scaffold contig record its default trimming state: no gap sequence,
+ * nothing cut from the tail, and overlaplen bases cut from the head.
+ */
+static void initiateCtgInScaf(CTGinSCAF *actg)
+{
+	actg->gapSeqLen = 0;
+	actg->cutHead = overlaplen;
+	actg->cutTail = 0;
+}
+
+static int procGap(char *line,STACK *ctgsStack)
+{
+ char *tp;
+ int length,i,seg;
+ unsigned int ctg;
+ CTGinSCAF *ctgPt;
+
+ tp = strtok(line, " ");
+ tp = strtok(NULL," "); //length
+ length = atoi(tp);
+ tp = strtok(NULL," "); //seg
+ seg = atoi(tp);
+ if(!seg)
+ return length;
+ for(i=0;ictgID = ctg;
+ ctgPt->start = 0;
+ ctgPt->end = 0;
+ ctgPt->scaftig_start = 0;
+ ctgPt->mask = 1;
+ }
+ return length;
+}
+
+/*
+ * Debug helper: print a scaffold header followed by one line per contig on
+ * ctgsStack (ctgID, start, end, scaftig_start). The stack is backed up before
+ * popping and recovered afterwards.
+ */
+static void debugging2(int index,STACK *ctgsStack)
+{
+	CTGinSCAF *member;
+
+	stackBackup(ctgsStack);
+	printf(">scaffold%d\t%d 0.0\n",index,ctgsStack->item_c);
+	for(member=stackPop(ctgsStack);member!=NULL;member=stackPop(ctgsStack)){
+		printf("%d\t%d\t%d\t%d\n",
+			member->ctgID,member->start,member->end,member->scaftig_start);
+	}
+	stackRecover(ctgsStack);
+}
+
+/*
+ * qsort comparator for READNEARBY records: orders by ascending dis.
+ * Returns -1, 0 or 1.
+ */
+static int cmp_reads(const void *a,const void *b)
+{
+	READNEARBY *lhs = (READNEARBY *)a;
+	READNEARBY *rhs = (READNEARBY *)b;
+
+	if(lhs->dis>rhs->dis)
+		return 1;
+	return lhs->dis==rhs->dis ? 0:-1;
+}
+
+/*
+ * Copy into cutArray the reads of rdArray that can touch the window
+ * [gapStart, gapEnd].
+ *
+ * rdArray   - reads to scan, arrayLen entries; scanning stops at the first
+ *             read whose dis exceeds gapEnd (the caller sorts by dis first)
+ * cutArray  - output buffer; receives every scanned read with
+ *             dis+len >= gapStart
+ * count     - set to the number of reads copied
+ */
+static void cutRdArray(READNEARBY *rdArray,int gapStart,int gapEnd,int *count,int arrayLen,READNEARBY *cutArray)
+{
+	int i;
+	int num = 0;
+
+	for(i=0;i<arrayLen;i++){
+		// past the right edge of the window: nothing further can match
+		if(rdArray[i].dis>gapEnd)
+			break;
+		if((rdArray[i].dis+rdArray[i].len)>=gapStart){
+			cutArray[num].dis = rdArray[i].dis;
+			cutArray[num].len = rdArray[i].len;
+			cutArray[num++].seqStarter = rdArray[i].seqStarter;
+		}
+	}
+	*count = num;
+}
+
+static void outputTightStr(FILE *fp,char *tightStr,int start,int length, int outputlen,int revS,int *col)
+{
+ int i;
+ int end;
+ int column = *col;
+
+ if(!revS){
+ end = start+outputlen <= length ? start+outputlen:length;
+ for(i=start;i=0 ? length-start-outputlen:0;
+ for(i=length-1-start;i>=end;i--){
+ fprintf(fp,"%c",int2compbase((int)getCharInTightString(tightStr,i)));
+ if((++column)%100==0){
+ fprintf(fp,"\n");
+ //column = 0;
+ }
+ }
+ }
+ *col = column;
+}
+
+static void outputTightStrLowerCase(FILE *fp,char *tightStr,int start,int length, int outputlen,int revS,int *col)
+{
+ int i;
+ int end;
+ int column = *col;
+
+ if(!revS){
+ end = start+outputlen <= length ? start+outputlen:length;
+ for(i=start;i=0 ? length-start-outputlen:0;
+ for(i=length-1-start;i>=end;i--){
+ fprintf(fp,"%c","tgac"[(int)getCharInTightString(tightStr,i)]);
+ if((++column)%100==0){
+ fprintf(fp,"\n");
+ //column = 0;
+ }
+ }
+ }
+ *col = column;
+}
+
+static void outputNs(FILE *fp,int gapN,int *col)
+{
+ int i,column=*col;
+ for(i=0;ictgID;
+ bal_ctg1 = getTwinCtg(ctg1);
+ start1 = prevCtg->cutHead;
+ length1 = contig_array[ctg1].length + overlaplen;
+ if(length1-prevCtg->cutTail-start1>CTGappend){
+ outputlen1 = CTGappend;
+ start1 = length1-prevCtg->cutTail-outputlen1;
+ }else
+ outputlen1 = length1-prevCtg->cutTail-start1;
+
+ ctg2 = actg->ctgID;
+ bal_ctg2 = getTwinCtg(ctg2);
+ start2 = actg->cutHead;
+ length2 = contig_array[ctg2].length + overlaplen;
+ if(length2-actg->cutTail-start2>CTGappend){
+ outputlen2 = CTGappend;
+ }else
+ outputlen2 = length2-actg->cutTail-start2;
+ if(isLargerThanTwin(ctg1))
+ fprintf(fo,">S%d_C%d_L%d_G%d",scafIndex,index_array[bal_ctg1],outputlen1,prevCtg->gapSeqLen);
+ else
+ fprintf(fo,">S%d_C%d_L%d_G%d",scafIndex,index_array[ctg1],outputlen1,prevCtg->gapSeqLen);
+
+ if(isLargerThanTwin(ctg2))
+ fprintf(fo,"_C%d_L%d\n",index_array[bal_ctg2],outputlen2);
+ else
+ fprintf(fo,"_C%d_L%d\n",index_array[ctg2],outputlen2);
+
+ if(contig_array[ctg1].seq)
+ outputTightStr(fo,contig_array[ctg1].seq,start1,length1, outputlen1,0,&column);
+ else if(contig_array[bal_ctg1].seq)
+ outputTightStr(fo,contig_array[bal_ctg1].seq,start1,length1, outputlen1,1,&column);
+
+ pt = (char *)darrayPut(gapSeqArray,prevCtg->gapSeqOffset);
+ outputTightStrLowerCase(fo,pt,0,prevCtg->gapSeqLen, prevCtg->gapSeqLen,0,&column);
+
+ if(contig_array[ctg2].seq)
+ outputTightStr(fo,contig_array[ctg2].seq,start2,length2, outputlen2,0,&column);
+ else if(contig_array[bal_ctg2].seq)
+ outputTightStr(fo,contig_array[bal_ctg2].seq,start2,length2, outputlen2,1,&column);
+
+ fprintf(fo,"\n");
+}
+
+/*
+ * Walk the contigs of one scaffold and, for every adjacent pair whose first
+ * member carries a gap sequence (gapSeqLen > 0), emit that gap through
+ * output1gap(). The stack is recovered before the walk.
+ */
+static void outputGapSeq(FILE *fo,int index,STACK *ctgsStack,DARRAY *gapSeqArray)
+{
+	CTGinSCAF *left=NULL,*right;
+
+	stackRecover(ctgsStack);
+
+	for(right=stackPop(ctgsStack);right!=NULL;left=right,right=stackPop(ctgsStack)){
+		// the first contig has no predecessor; pairs without gap sequence are skipped
+		if(!left||left->gapSeqLen<=0)
+			continue;
+		output1gap(fo,index,left,right,gapSeqArray);
+	}
+
+}
+
+/*
+ * Write one scaffold to fo: a ">scaffold<index> <coverage>" header line
+ * followed by the contig sequences, with runs produced by outputNs() between
+ * contigs flagged scaftig_start and any stored gap sequence appended after
+ * the contig that owns it.
+ *
+ * fo          - output stream
+ * index       - scaffold number printed in the header
+ * ctgsStack   - contigs of the scaffold; recovered and popped twice here
+ * gapSeqArray - array the gap sequences are fetched from via gapSeqOffset
+ *
+ * Side effect: increments the global Ncounter once per outputNs() call.
+ */
+static void outputScafSeq(FILE *fo,int index,STACK *ctgsStack,DARRAY *gapSeqArray)
+{
+ CTGinSCAF *actg,*prevCtg=NULL;
+ unsigned int ctg,bal_ctg,length;
+ int start,outputlen,gapN;
+ char *pt;
+ int column = 0;
+ long long cvgSum=0;
+ int lenSum=0;
+
+ // first pass: length-weighted coverage over contigs with cvg > 0
+ stackRecover(ctgsStack);
+ while((actg=stackPop(ctgsStack))!=NULL){
+ if(!(contig_array[actg->ctgID].cvg>0))
+ continue;
+ lenSum += contig_array[actg->ctgID].length;
+ cvgSum += contig_array[actg->ctgID].length*contig_array[actg->ctgID].cvg;
+ }
+ // header reports cvgSum/lenSum, or 0.0 when no contig contributed
+ if(lenSum>0)
+ fprintf(fo,">scaffold%d %4.1f\n",index,(double)cvgSum/lenSum);
+ else
+ fprintf(fo,">scaffold%d 0.0\n",index);
+
+ // second pass: emit the sequence contig by contig
+ stackRecover(ctgsStack);
+ while((actg=stackPop(ctgsStack))!=NULL){
+ ctg = actg->ctgID;
+ bal_ctg = getTwinCtg(ctg);
+ length = contig_array[ctg].length + overlaplen;
+ if(prevCtg&&actg->scaftig_start){
+ // gap size from the start coordinates and the previous contig length; at least 1
+ gapN = actg->start - prevCtg->start - contig_array[prevCtg->ctgID].length;
+ gapN = gapN > 0 ? gapN:1;
+ outputNs(fo,gapN,&column);
+ //outputGapInfo(prevCtg->ctgID,ctg);
+ Ncounter++;
+ }
+ // the first contig is written from position 0, later ones from cutHead
+ if(!prevCtg)
+ start = 0;
+ else
+ start = actg->cutHead;
+ outputlen = length-start-actg->cutTail;
+ // use the contig's own sequence when present, otherwise the twin's with the reverse flag set
+ if(contig_array[ctg].seq)
+ outputTightStr(fo,contig_array[ctg].seq,start,length, outputlen,0,&column);
+ else if(contig_array[bal_ctg].seq)
+ outputTightStr(fo,contig_array[bal_ctg].seq,start,length, outputlen,1,&column);
+ // no gap sequence stored for this contig: move on
+ if(actg->gapSeqLen<1){
+ prevCtg = actg;
+ continue;
+ }
+
+ // append the stored gap sequence through the lower-case writer
+ pt = (char *)darrayPut(gapSeqArray,actg->gapSeqOffset);
+ outputTightStrLowerCase(fo,pt,0,actg->gapSeqLen, actg->gapSeqLen,0,&column);
+
+ prevCtg = actg;
+ }
+ fprintf(fo,"\n");
+
+}
+
+static void fill1scaf(int index,STACK *ctgsStack,int thrdID);
+/*
+ * Claim buffered scaffold t for thread thrdID and fill its gaps.
+ *
+ * flagBuf[t] marks a scaffold as taken. It is tested once without the lock
+ * as a quick exit and once more under the mutex; only the thread that flips
+ * it records itself in thrdNoBuf[t], bumps its counter and runs fill1scaf().
+ */
+static void check1scaf(int t,int thrdID)
+{
+	boolean claimed=0;
+
+	if(flagBuf[t])
+		return;
+
+	pthread_mutex_lock(&mutex);
+	if(!flagBuf[t]){
+		flagBuf[t] = 1;
+		thrdNoBuf[t] = thrdID;
+		claimed = 1;
+	}
+	pthread_mutex_unlock(&mutex);
+
+	// another thread took this scaffold between the two checks
+	if(!claimed)
+		return;
+
+	counters[thrdID]++;
+	fill1scaf(scafCounter+t+1,ctgStackBuffer[t],thrdID);
+}
+
+/*
+ * Try to close the gaps of one scaffold with the reads attached to its contigs.
+ *
+ * index     - scaffold number, used only in the mismatch message
+ * ctgsStack - contigs of the scaffold; popped and recovered several times
+ * thrdID    - selects the per-thread array darrayBuf[thrdID] passed to localGraph()
+ *
+ * Steps: count the attached reads, collect them into an array with positions
+ * computed from each contig's start, sort by position, then for every contig
+ * flagged scaftig_start cut out the reads near the gap and call localGraph()
+ * with decreasing overlap values until it reports success.
+ * Returns early, allocating nothing, when no reads are attached.
+ */
+static void fill1scaf(int index,STACK *ctgsStack,int thrdID)
+{
+
+ CTGinSCAF *actg,*prevCtg=NULL;
+ READNEARBY *rdArray,*rdArray4gap,*rd;
+ int numRd=0,count,maxGLen=0;
+ unsigned int ctg,bal_ctg;
+ STACK *rdStack;
+
+ // pass 1: find the largest start-to-previous-end distance and count reads on unmasked contigs
+ while((actg=stackPop(ctgsStack))!=NULL){
+ if(prevCtg)
+ maxGLen = maxGLen<(actg->start-prevCtg->end) ? (actg->start-prevCtg->end):maxGLen;
+ ctg = actg->ctgID;
+ bal_ctg = getTwinCtg(ctg);
+ if(actg->mask){
+ prevCtg = actg;
+ continue;
+ }
+ // reads hang on either the contig or its twin
+ if(contig_array[ctg].closeReads)
+ numRd += contig_array[ctg].closeReads->item_c;
+ else if(contig_array[bal_ctg].closeReads)
+ numRd += contig_array[bal_ctg].closeReads->item_c;
+ prevCtg = actg;
+ }
+ // NOTE(review): ctgsStack is not recovered on this early return - confirm callers recover it
+ if(numRd<1)
+ return;
+ rdArray = (READNEARBY *)ckalloc(numRd*sizeof(READNEARBY));
+ rdArray4gap = (READNEARBY *)ckalloc(numRd*sizeof(READNEARBY));
+ //fprintf(stderr,"scaffold%d reads4gap %d\n",index,numRd);
+
+ // collect reads appended to contigs in this scaffold
+ int numRd2 = 0;
+ stackRecover(ctgsStack);
+ while((actg=stackPop(ctgsStack))!=NULL){
+ ctg = actg->ctgID;
+ bal_ctg = getTwinCtg(ctg);
+ if(actg->mask)
+ continue;
+ if(contig_array[ctg].closeReads)
+ rdStack = contig_array[ctg].closeReads;
+ else if(contig_array[bal_ctg].closeReads)
+ rdStack = contig_array[bal_ctg].closeReads;
+ else
+ continue;
+
+ // back up and recover the read stack so popping here leaves it unchanged
+ stackBackup(rdStack);
+ while((rd=(READNEARBY *)stackPop(rdStack))!=NULL){
+ rdArray[numRd2].len = rd->len;
+ rdArray[numRd2].seqStarter = rd->seqStarter;
+ // position depends on whether ctg is the smaller of the twin pair
+ if(isSmallerThanTwin(ctg))
+ rdArray[numRd2++].dis = actg->start - overlaplen + rd->dis;
+ else
+ rdArray[numRd2++].dis = actg->start -overlaplen +
+ contig_array[ctg].length - rd->len - rd->dis;
+ }
+ stackRecover(rdStack);
+ }
+ // the two passes are expected to agree; a mismatch is only reported
+ if(numRd2!=numRd)
+ printf("##reads numbers doesn't match, %d vs %d when scaffold %d\n",numRd,numRd2,index);
+ // sort by dis so cutRdArray() can scan in position order
+ qsort(rdArray,numRd,sizeof(READNEARBY),cmp_reads);
+ //fill gap one by one
+ int gapStart,gapEnd;
+ int numIn=0;
+ boolean flag;
+ int buffer_size=maxReadLen > 100 ? maxReadLen:100;
+ int maxGSLen = maxGLen+GLDiff < 10 ? 10:maxGLen+GLDiff;
+ //fprintf(stderr,"maxGlen %d, maxGSlen %d\n",maxGLen,maxGSLen);
+
+ // scratch buffers handed to localGraph() for every gap
+ char *seqGap = (char *)ckalloc(maxGSLen*sizeof(char)); // temp array for gap sequence
+ Kmer *kmerCtg1 = (Kmer *)ckalloc(buffer_size*sizeof(Kmer));
+ Kmer *kmerCtg2 = (Kmer *)ckalloc(buffer_size*sizeof(Kmer));
+ char *seqCtg1 = (char *)ckalloc(buffer_size*sizeof(char));
+ char *seqCtg2 = (char *)ckalloc(buffer_size*sizeof(char));
+ prevCtg = NULL;
+ stackRecover(ctgsStack);
+ while((actg=stackPop(ctgsStack))!=NULL){
+ // only contigs with a predecessor and scaftig_start set are processed
+ if(!prevCtg||!actg->scaftig_start){
+ prevCtg = actg;
+ continue;
+ }
+ // read window: 100 positions either side of the gap
+ gapStart = prevCtg->end - 100;
+ gapEnd = actg->start - overlaplen + 100;
+
+ cutRdArray(rdArray,gapStart,gapEnd,&count,numRd,rdArray4gap);
+
+ numIn += count;
+ /*
+ if(!count){
+ prevCtg = actg;
+ continue;
+ }
+ */
+ // retry with overlap reduced by 2 each time, down to 15, until localGraph() returns 1
+ int overlap;
+ for(overlap=overlaplen;overlap>14;overlap-=2){
+
+ flag = localGraph(rdArray4gap,count,prevCtg,actg,
+ overlaplen,kmerCtg1,kmerCtg2,overlap,darrayBuf[thrdID],
+ seqCtg1,seqCtg2,seqGap);
+
+ //free_kmerset(kmerSet);
+
+ if(flag==1){
+ /*
+ fprintf(stderr,"Between ctg %d and %d, Found with %d\n",prevCtg->ctgID
+ ,actg->ctgID,overlap);
+ */
+ break;
+ }
+ }
+ /*
+ if(count==0)
+ printf("Gap closed without reads\n");
+ if(!flag)
+ fprintf(stderr,"Between ctg %d and %d, NO routes found\n",prevCtg->ctgID,actg->ctgID);
+ */
+
+ prevCtg = actg;
+ }
+
+ //fprintf(stderr,"____scaffold%d reads in gap %d\n",index,numIn);
+ // release every buffer allocated above
+ free((void *)seqGap);
+ free((void *)kmerCtg1);
+ free((void *)kmerCtg2);
+ free((void *)seqCtg1);
+ free((void *)seqCtg2);
+ free((void *)rdArray);
+ free((void *)rdArray4gap);
+}
+
+/*
+ * Empty dStack, then move every record of sStack onto it by popping from one
+ * and pushing onto the other, so the order ends up reversed. All nine
+ * CTGinSCAF fields listed below are copied. dStack is backed up at the end.
+ */
+static void reverseStack(STACK *dStack,STACK *sStack)
+{
+	CTGinSCAF *src,*dst;
+
+	emptyStack(dStack);
+
+	for(;;){
+		src = (CTGinSCAF *)stackPop(sStack);
+		if(src==NULL)
+			break;
+		dst = (CTGinSCAF *)stackPush(dStack);
+		// identity and placement
+		dst->ctgID = src->ctgID;
+		dst->start = src->start;
+		dst->end = src->end;
+		dst->scaftig_start = src->scaftig_start;
+		dst->mask = src->mask;
+		// trimming and gap sequence
+		dst->cutHead = src->cutHead;
+		dst->cutTail = src->cutTail;
+		dst->gapSeqLen = src->gapSeqLen;
+		dst->gapSeqOffset = src->gapSeqOffset;
+	}
+	stackBackup(dStack);
+}
+
+static Kmer tightStr2Kmer(char *tightStr,int start,int length,int revS)
+{
+ int i;
+ Kmer word=0;
+
+ if(!revS){
+ if(start+overlaplen>length){
+ printf("tightStr2Kmer A: no enough bases for kmer\n");
+ return word;
+ }
+ for(i=start;ilength-1-start-overlaplen;i--){
+ word <<= 2;
+ word += int_comp(getCharInTightString(tightStr,i));
+ }
+ }
+ return word;
+}
+
+static Kmer maxKmer()
+{
+ Kmer word = 0;
+ int i;
+ for(i=0;i>= 2;
+ kmerAtEnd &= MaxKmer;
+ kmerAtStart >>= 2;
+ }
+ if(i<10){
+ return overlaplen - i;
+ }
+ else
+ return 0;
+}
+
+
+static void initStackBuf(STACK **ctgStackBuffer,int scafBufSize)
+{
+ int i;
+ for(i=0;iselfSignal)==1){
+ emptyDarray(darrayBuf[prm->threadID]);
+ for(i=0;ithreadID);
+
+ *(prm->selfSignal) = 0;
+ }else if(*(prm->selfSignal)==2){
+ *(prm->selfSignal) = 0;
+ break;
+ }
+ usleep(1);
+ }
+}
+
+static void creatThrds(pthread_t *threads,PARAMETER *paras)
+{
+ unsigned char i;
+ int temp;
+
+ for(i=0;iC%d %4.1f\n",ctg,(double)contig_array[ctg].cvg);
+ outputTightStr(fo,contig_array[ctg].seq,0,len,len,0,&col);
+ }
+ else if(contig_array[bal_ctg].seq){
+ fprintf(fo,">C%d %4.1f\n",bal_ctg,(double)contig_array[ctg].cvg);
+ outputTightStr(fo,contig_array[bal_ctg].seq,0,len,len,0,&col);
+ }
+ contig_array[ctg].flag = 1;
+ contig_array[bal_ctg].flag = 1;
+ fprintf(fo,"\n");
+}
+
+void prlReadsCloseGap(char *graphfile)
+{
+ //thrd_num=1;
+ /*if(fillGap){
+ boolean flag;
+ //printf("\nStart to load reads for gap filling. %d length discrepancy is allowed\n",GLDiff);
+ //printf("...\n");
+ flag = loadReads4gap(graphfile);
+ if(!flag)
+ return;
+ }*/
+
+ if(orig2new){
+ convertIndex();
+ orig2new = 0;
+ }
+ FILE *fp,*fo,*fo2;
+ char line[1024];
+ CTGinSCAF *actg;
+ STACK *ctgStack,*aStack;
+ int index=0,offset=0,counter,overallLen;
+ int i,starter,prev_start,gapLen,catchable;
+ unsigned int ctg,prev_ctg=0;
+ boolean IsPrevGap;
+ pthread_t threads[thrd_num];
+ unsigned char thrdSignal[thrd_num+1];
+ PARAMETER paras[thrd_num];
+
+ for(ctg=1;ctg<=num_ctg;ctg++)
+ contig_array[ctg].flag = 0;
+
+ MAXKMER = maxKmer();
+
+ ctgStack = (STACK *)createStack(1000,sizeof(CTGinSCAF));
+
+ sprintf(line, "%s.scaf_gap", graphfile);
+ fp = ckopen(line, "r");
+ sprintf(line, "%s.scafSeq", graphfile);
+ fo = ckopen(line, "w");
+
+ sprintf(line, "%s.gapSeq", graphfile);
+ fo2 = ckopen(line, "w");
+
+ pthread_mutex_init(&mutex,NULL);
+
+ flagBuf = (boolean *)ckalloc(scafBufSize*sizeof(boolean));;
+ thrdNoBuf = (unsigned char *)ckalloc(scafBufSize*sizeof(unsigned char));;
+ memset(thrdNoBuf,0,scafBufSize*sizeof(char));
+
+ ctgStackBuffer = (STACK **)ckalloc(scafBufSize*sizeof(STACK *));
+ initStackBuf(ctgStackBuffer,scafBufSize);
+
+ darrayBuf = (DARRAY **)ckalloc(thrd_num*sizeof(DARRAY *));
+ counters = (int *)ckalloc(thrd_num*sizeof(int));
+
+ /*for(i=0;i'){
+ if(index){
+ aStack = ctgStackBuffer[scafInBuf];
+ flagBuf[scafInBuf++] = 0;
+ reverseStack(aStack,ctgStack);
+ if(scafInBuf==scafBufSize){
+ /*if(fillGap)
+ sendWorkSignal(1,thrdSignal);*/
+
+ outputSeqs(fo,fo2,scafInBuf);
+ scafCounter += scafInBuf;
+ scafInBuf = 0;
+ }
+ //if(index%1000==0)
+ //printf("Processed %d scaffolds\n",index);
+
+ }
+ //read next scaff
+ emptyStack(ctgStack);
+ IsPrevGap = offset = prev_ctg = 0;
+ sscanf(line+9,"%d %d %d",&index,&counter,&overallLen);
+ continue;
+ }
+ if(line[0]=='G'){ // gap appears
+ /*if(fillGap){
+ gapLen = procGap(line,ctgStack);
+ IsPrevGap = 1;
+ }*/
+ continue;
+ }
+ if(line[0]>='0'&&line[0]<='9'){ // a contig line
+ sscanf(line,"%d %d",&ctg,&starter);
+ actg = (CTGinSCAF *)stackPush(ctgStack);
+ actg->ctgID = ctg;
+ if(contig_array[ctg].flag)
+ MaskContig(ctg);
+ else
+ MarkCtgOccu(ctg);
+ initiateCtgInScaf(actg);
+ if(!prev_ctg)
+ actg->cutHead = 0;
+ else if(!IsPrevGap)
+ allGaps++;
+ if(!IsPrevGap){
+ if(prev_ctg&&(starter-prev_start-(int)contig_array[prev_ctg].length)
+ <((int)overlaplen*4)){
+ /*
+ if(fillGap)
+ catchable = contigCatch(prev_ctg,ctg);
+ else
+ */
+ catchable = 0;
+ if(catchable){ // prev_ctg and ctg overlap **bp
+ allGaps--;
+ /*
+ if(isLargerThanTwin(prev_ctg))
+ fprintf(stderr,"%d ####### by_overlap\n",getTwinCtg(prev_ctg));
+ else
+ fprintf(stderr,"%d ####### by_overlap\n",prev_ctg);
+ */
+ actg->scaftig_start = 0;
+ actg->cutHead = catchable;
+ offset += - (starter-prev_start-contig_array[prev_ctg].length) +
+ (overlaplen - catchable);
+ }else
+ actg->scaftig_start = 1;
+
+ }else
+ actg->scaftig_start = 1;
+ }else{
+ offset += - (starter-prev_start-contig_array[prev_ctg].length) + gapLen;
+ actg->scaftig_start = 0;
+ }
+ actg->start = starter + offset;
+ actg->end = actg->start + contig_array[ctg].length - 1;
+ actg->mask = contig_array[ctg].mask;
+ IsPrevGap = 0;
+ prev_ctg = ctg;
+ prev_start = starter;
+ }
+ }
+ if(index){
+ aStack = ctgStackBuffer[scafInBuf];
+ flagBuf[scafInBuf++] = 0;
+ reverseStack(aStack,ctgStack);
+ if(fillGap)
+ sendWorkSignal(1,thrdSignal);
+ outputSeqs(fo,fo2,scafInBuf);
+ }
+
+ /*if(fillGap){
+ sendWorkSignal(2,thrdSignal);
+ thread_wait(threads);
+ }*/
+ for(ctg=1;ctg<=num_ctg;ctg++){
+ if((contig_array[ctg].length+overlaplen)<100||
+ contig_array[ctg].flag)
+ continue;
+ output_ctg(ctg,fo);
+
+ }
+ //printf("Done with %d scaffolds, %d gaps finished, %d gaps overall\n",index,allGaps-Ncounter,allGaps);
+ //printf("scaffolds outputted : %d.\n",index);
+ index = 0;
+ for(i=0;i0)
+ length_array[index_array[i]] = i;
+ }
+ for(i=1;i<=num_ctg;i++)
+ index_array[i] = length_array[i]; //contig i with new index: index_array[i]
+ free((void *)length_array);
+
+}
+
+/*
+ * Empty dStack, then move every record of sStack onto it by popping from one
+ * and pushing onto the other, so the order ends up reversed. Only ctgID,
+ * start and end are copied. dStack is backed up at the end.
+ */
+static void reverseStack(STACK *dStack,STACK *sStack)
+{
+	CTGinSCAF *src,*dst;
+
+	emptyStack(dStack);
+
+	for(;;){
+		src = (CTGinSCAF *)stackPop(sStack);
+		if(src==NULL)
+			break;
+		dst = (CTGinSCAF *)stackPush(dStack);
+		dst->ctgID = src->ctgID;
+		dst->start = src->start;
+		dst->end = src->end;
+	}
+	stackBackup(dStack);
+}
+
+static void initStackBuf(STACK **ctgStackBuffer,int scafBufSize)
+{
+ int i;
+ for(i=0;ictgID;
+ bal_ctg = getTwinCtg(ctg);
+
+ if(contig_array[ctg].from_vt!=0){
+ contig_array[ctg].multi = 1;
+ contig_array[bal_ctg].multi = 1;
+ continue;
+ }
+
+ contig_array[ctg].from_vt = scafID;
+ contig_array[ctg].to_vt = actg->start;
+ contig_array[ctg].flag = 0; //ctg and scaf on the same strand
+ contig_array[bal_ctg].from_vt = scafID;
+ contig_array[bal_ctg].to_vt = actg->start;
+ contig_array[bal_ctg].flag = 1;
+ }
+ }
+
+}
+
+/*
+ * Read "<graphfile>.scaf_gap" and compute, for every contig line, its start
+ * and end coordinate on the scaffold it belongs to. Scaffolds are collected
+ * in ctgStackBuffer and handed to mapCtg2Scaf() whenever the buffer is full,
+ * and once more at end of file.
+ *
+ * Resets from_vt and multi of every contig first. Writes the scaffold index
+ * and every start/end pair to stderr while parsing.
+ */
+static void locateContigOnscaff(char *graphfile)
+{
+
+ FILE *fp;
+ char line[1024];
+ CTGinSCAF *actg;
+ STACK *ctgStack,*aStack;
+ int index=0,counter,overallLen;
+ int starter,prev_start,gapN,scafLen;
+ unsigned int ctg,prev_ctg=0;
+
+ // clear the placement fields this function fills in
+ for(ctg=1;ctg<=num_ctg;ctg++){
+ contig_array[ctg].from_vt = 0;
+ contig_array[ctg].multi = 0;
+ }
+
+ ctgStack = (STACK *)createStack(1000,sizeof(CTGinSCAF));
+
+ // NOTE(review): line[] is 1024 bytes and sprintf is unbounded - confirm graphfile is always short enough
+ sprintf(line, "%s.scaf_gap", graphfile);
+ fp = ckopen(line, "r");
+
+ ctgStackBuffer = (STACK **)ckalloc(scafBufSize*sizeof(STACK *));
+ initStackBuf(ctgStackBuffer,scafBufSize);
+
+
+ Ncounter = scafCounter = scafInBuf = allGaps = 0;
+ while(fgets(line,sizeof(line),fp)!=NULL){
+ // '>' opens a new scaffold: store the previous one first
+ if(line[0]=='>'){
+ if(index){
+ aStack = ctgStackBuffer[scafInBuf++];
+ reverseStack(aStack,ctgStack);
+ // buffer full: process the batch and start over
+ if(scafInBuf==scafBufSize){
+ mapCtg2Scaf(scafInBuf);
+ scafCounter += scafInBuf;
+ scafInBuf = 0;
+ }
+ //if(index%1000==0)
+ //printf("Processed %d scaffolds\n",index);
+ }
+ //read next scaff
+ scafLen = prev_ctg = 0;
+ emptyStack(ctgStack);
+ // the three header numbers are parsed 9 characters into the line
+ sscanf(line+9,"%d %d %d",&index,&counter,&overallLen);
+ fprintf(stderr,">%d\n",index);
+ continue;
+ }
+ if(line[0]=='G'){ // gap appears
+ continue;
+ }
+ if(line[0]>='0'&&line[0]<='9'){ // a contig line
+ // NOTE(review): ctg is unsigned int but is scanned with %d - confirm this is intended
+ sscanf(line,"%d %d",&ctg,&starter);
+ actg = (CTGinSCAF *)stackPush(ctgStack);
+ actg->ctgID = ctg;
+ // first contig of the scaffold: its span includes overlaplen
+ if(!prev_ctg){
+ actg->start = scafLen;
+ actg->end = actg->start + overlaplen + contig_array[ctg].length - 1;
+ }else{
+ // later contigs: gap from the start positions in the file, at least 1
+ gapN = starter - prev_start-(int)contig_array[prev_ctg].length;
+ gapN = gapN < 1 ? 1:gapN;
+ actg->start = scafLen + gapN;
+ actg->end = actg->start + contig_array[ctg].length - 1;
+ }
+ fprintf(stderr,"%d\t%d\n",actg->start,actg->end);
+ scafLen = actg->end+1;
+ prev_ctg = ctg;
+ prev_start = starter;
+ }
+ }
+ // store and process the last scaffold together with any batch still buffered
+ if(index){
+ aStack = ctgStackBuffer[scafInBuf++];
+ reverseStack(aStack,ctgStack);
+ mapCtg2Scaf(scafInBuf);
+ }
+ // count contigs with from_vt set and multi clear; used only by the disabled printf below
+ gapN = 0;
+ for(ctg=1;ctg<=num_ctg;ctg++){
+ if(contig_array[ctg].from_vt==0||contig_array[ctg].multi==1)
+ continue;
+ gapN++;
+ }
+ //printf("\nDone with %d scaffolds, %d contigs in Scaffolld\n",index,gapN);
+ fclose(fp);
+ freeStack(ctgStack);
+ freeStackBuf(ctgStackBuffer,scafBufSize);
+ free((void*)ctgStackBuffer);
+}
+
+/*
+ * Tell whether reads on contig number contigno may be reported: the contig
+ * (looked up through index_array) must have from_vt set and must not be
+ * flagged multi. Returns 1 when eligible, 0 otherwise.
+ */
+static boolean contigElligible(unsigned int contigno)
+{
+	unsigned int ctg = index_array[contigno];
+
+	return (contig_array[ctg].from_vt!=0&&contig_array[ctg].multi!=1) ? 1:0;
+}
+/*
+ * Write one "read  scaffold  position  orientation" line to fo.
+ *
+ * The contig is looked up through index_array. Its from_vt is printed as the
+ * scaffold, to_vt serves as the contig's offset, and flag selects the
+ * orientation and the position formula. A negative pos is treated as 0.
+ */
+static void output1read(FILE *fo,long long readno,unsigned int contigno,int pos)
+{
+
+	unsigned int ctg = index_array[contigno];
+	char orien;
+	int posOnScaf;
+
+	if(pos<0)
+		pos = 0;
+
+	if(contig_array[ctg].flag!=0){
+		// flag set: position is measured back from the contig's far end
+		orien = '-';
+		posOnScaf = contig_array[ctg].to_vt + contig_array[ctg].length - pos;
+	}else{
+		orien = '+';
+		posOnScaf = contig_array[ctg].to_vt + pos - overlaplen;
+	}
+
+	fprintf(fo,"%lld\t%d\t%d\t%c\n",readno,contig_array[ctg].from_vt,posOnScaf,orien);
+}
+
+/*
+ * Translate read-on-contig placements into read-on-scaffold placements.
+ *
+ * Places the contigs on their scaffolds first, then reads
+ * "<graphfile>.readOnContig" (first line discarded) and writes
+ * "<graphfile>.readOnScaf". A pair is written only when an even read number
+ * directly follows its odd partner and both contigs pass contigElligible().
+ * Prints the number of pairs written.
+ */
+void locateReadOnScaf(char *graphfile)
+{
+	char name[1024],line[1024];
+	FILE *fp,*fo;
+	long long readno,counter=0,pre_readno=0;
+	unsigned int contigno,pre_contigno=0;
+	int pre_pos=0,pos;
+
+	locateContigOnscaff(graphfile);
+
+	// snprintf keeps an over-long prefix from running past name[]
+	snprintf(name,sizeof(name),"%s.readOnContig",graphfile);
+	fp = ckopen(name,"r");
+	snprintf(name,sizeof(name),"%s.readOnScaf",graphfile);
+	fo = ckopen(name,"w");
+
+	if(!orig2new){
+		convertIndex();
+		orig2new = 1;
+	}
+	fgets(line,sizeof(line),fp);	// first line is not a record
+	while(fgets(line,sizeof(line),fp)!=NULL){
+		// contigno is unsigned, so it is scanned with %u; incomplete lines are skipped
+		if(sscanf(line,"%lld %u %d",&readno,&contigno,&pos)!=3)
+			continue;
+		if((readno%2==0)&&(pre_readno==readno-1) // they are a pair of reads
+			&&contigElligible(pre_contigno)&&contigElligible(contigno)){
+			output1read(fo,pre_readno,pre_contigno,pre_pos);
+			output1read(fo,readno,contigno,pos);
+			counter++;
+		}
+		pre_readno = readno;
+		pre_contigno = contigno;
+		pre_pos = pos;
+	}
+	printf("%lld pairs on contig\n",counter);
+	fclose(fp);
+	fclose(fo);
+}
diff --git a/fusion/readseq1by1.c b/fusion/readseq1by1.c
new file mode 100755
index 0000000..ee3f35b
--- /dev/null
+++ b/fusion/readseq1by1.c
@@ -0,0 +1,465 @@
+#include "stdinc.h"
+#include "newhash.h"
+#include "extfunc.h"
+#include "extvab.h"
+
+static char src_rc_seq[1024];
+extern long long single_count;
+extern long long single_map;
+void readseq1by1(char *src_seq, char *src_name, int *len_seq, FILE *fp,long long num_seq) // read lines from fp until the next '>' header or end of file
+{
+ int i,k, n,strL;
+ char c;
+ char str[5000];
+ // src_seq receives base2int() codes and *len_seq their count; src_name is set from the '>' line that ends the call
+ n = 0;
+ k = num_seq; // narrowed to int; only tested (k >= 0) once fgets() reports end of file
+ while(fgets(str, 4950, fp)) {
+ if(str[0] == '#') continue; // lines starting with '#' are ignored
+ if(str[0] == '>') {
+ /*
+ if(k >= 0) { // if this isn't the first '>' in the file
+ *len_seq = n;
+ }
+ */
+ *len_seq = n; // count collected before this header (0 if the header came first)
+ n = 0;
+ sscanf(&str[1],"%s",src_name); // first whitespace-delimited word after '>'
+ return;
+ } else {
+ strL = strlen(str);
+ if(strL+n>maxReadLen) // caps the number of stored codes at maxReadLen
+ strL = maxReadLen - n;
+ for(i = 0; i < strL; i ++) {
+ if(str[i] >= 'a' && str[i] <= 'z') {
+ c = base2int(str[i]-'a'+'A'); // lower case is upper-cased before encoding
+ src_seq[n ++] = c;
+ } else if(str[i] >= 'A' && str[i] <= 'Z') {
+ c = base2int(str[i]);
+ src_seq[n ++] = c;
+ // after pre-process all the symbols would be a,g,c,t,n in lower or upper case.
+ } else if(str[i]=='.') {
+ c = base2int('A'); // '.' is stored as if it were 'A'
+ src_seq[n ++] = c;
+ } // any other character (including the newline) is dropped
+ }
+ //printf("%d: %d\n",k,n);
+ }
+ }
+ // end of file reached without meeting another '>' line
+ if(k >= 0){
+ *len_seq = n;
+ return;
+ }
+ *len_seq = 0; // num_seq < 0: whatever was read is reported as length 0
+}
+
+
+/* Load the first FASTA record of fp: *X receives a ckalloc'ed array of base codes and
+ * *T its length. When the file holds no sequence, *T is 0 and *X is left untouched.
+ */
+void read_one_sequence(FILE *fp, long long *T, char **X)
+{
+	char *fasta,*src_name; //point to fasta array
+	/* readseqpar() only compares against and updates these, so they must start
+	   initialised (same seeds as multiFileParse); they used to be indeterminate. */
+	int num_seq,len=1,name_len=1,min_len=1000;
+	num_seq = readseqpar(&len,&min_len,&name_len,fp);
+	if(num_seq<1){
+		printf("no fasta sequence in file\n");
+		*T = 0;
+		return;
+	}
+	fasta = (char *)ckalloc((len+1)*sizeof(char));	/* +1: slack for a last line that has no newline */
+	src_name = (char *)ckalloc((name_len+1)*sizeof(char));
+	rewind(fp);
+	readseq1by1(fasta,src_name,&len,fp,-1);	/* consumes input up to the first '>' header */
+	readseq1by1(fasta,src_name,&len,fp,0);	/* reads the record that follows it */
+	*X = fasta;
+	*T = len;
+	free((void *)src_name);
+}
+
+/* Treat fp as a list of FASTA file names, one per line, and run readseqpar() on each of them.
+ * Seeds the max_leg and max_name_leg targets with 1 and min_leg with 1000; returns the summed counts. */
+long long multiFileParse(int *max_leg, int *min_leg,int *max_name_leg, FILE *fp)
+{
+	char str[5000];
+	FILE *freads;
+	long long counter = 0;
+	*max_name_leg = *max_leg = 1;
+	*min_leg = 1000;
+	while(fgets(str,4950,fp)){
+		str[strcspn(str,"\r\n")] = '\0';	/* cut only a real line ending: the last line may have none */
+		if(str[0]=='\0') continue;	/* blank line: there is no file to open */
+		freads = ckopen(str,"r");
+		counter += readseqpar(max_leg,min_leg,max_name_leg,freads);
+		fclose(freads);
+	}
+	return counter;
+}
+
+/*
+ * Scan a FASTA stream and return the number of '>' header lines it contains.
+ * Along the way the max_leg and min_leg targets are pushed out to the longest and
+ * shortest sequence length, and max_name_leg to the longest header name; callers seed them.
+ */
+long long readseqpar(int *max_leg, int *min_leg,int *max_name_leg, FILE *fp)
+{
+	int l, n = 0;
+	long long k = -1;	/* index of the current record; -1 until the first '>' */
+	char str[5000], src_name[5000];
+	while(fgets(str, 4950, fp)) {
+		if(str[0] == '>') {
+			if(k >= 0) {
+				if(n > *max_leg)
+					*max_leg = n;
+				if(n < *min_leg)
+					*min_leg = n;
+			}
+			n = 0;
+			k ++;
+			src_name[0] = '\0';	/* a bare ">" gives sscanf nothing to store */
+			sscanf(&str[1], "%s", src_name);
+			if((l = strlen(src_name)) > *max_name_leg)
+				*max_name_leg = l;
+		} else {
+			/* count the payload only: strlen-1 lost a base when the last line had no newline */
+			n += strcspn(str,"\r\n");
+		}
+	}
+	if(n > *max_leg)
+		*max_leg = n;
+	if(n < *min_leg)
+		*min_leg = n;
+	k ++;
+	return(k);
+}
+
+/*
+ * Read the next FASTQ record from fp. Lines are skipped until one starts with
+ * '@' (its first word becomes src_name); the following lines are encoded into
+ * src_seq until a line starting with '+' is met, after which one more line
+ * (the quality string) is consumed. *len_seq is the number of codes stored,
+ * or 0 when no '@' line remains.
+ */
+void read1seqfq(char *src_seq, char *src_name, int *len_seq, FILE *fp)
+{
+	char buf[5000];
+	char ch;
+	int stored = 0,limit,j;
+	boolean gotHeader = 0;
+
+	/* locate the header line of the next record */
+	while(!gotHeader && fgets(buf, 4950, fp)) {
+		if(buf[0]=='@'){
+			sscanf(&buf[1],"%s",src_name);
+			gotHeader = 1;
+		}
+	}
+	if(!gotHeader){	//last time reading fq file get this
+		*len_seq = 0;
+		return;
+	}
+
+	/* encode sequence lines until the '+' separator shows up */
+	while(fgets(buf, 4950, fp)){
+		if(buf[0] == '+') {
+			fgets(buf,4950,fp);	// pass quality value line
+			break;
+		}
+		limit = strlen(buf);
+		/* keep the number of stored codes within maxReadLen */
+		if(limit+stored>maxReadLen)
+			limit = maxReadLen - stored;
+		for(j = 0; j < limit; j ++) {
+			ch = buf[j];
+			// after pre-process all the symbols would be a,g,c,t,n in lower or upper case.
+			if(ch >= 'a' && ch <= 'z')
+				src_seq[stored ++] = base2int(ch-'a'+'A');
+			else if(ch >= 'A' && ch <= 'Z')
+				src_seq[stored ++] = base2int(ch);
+			else if(ch=='.')
+				src_seq[stored ++] = base2int('A');
+		}
+	}
+
+	*len_seq = stored;
+}
+
+// find the next file to open in libs
+static int nextValidIndex(int libNo,boolean pair,unsigned char asm_ctg)
+{
+ int i=libNo;
+
+ while(i1&&lib_array[i].asm_flag!=asm_ctg){ // reads for other purpose
+ i++;
+ continue;
+ }
+ if(lib_array[i].curr_type==1&&
+ lib_array[i].curr_index3&&strcmp(fname+strlen(fname)-3,".gz")==0){
+ char *cmd = (char *)ckalloc((strlen(fname)+20)*sizeof(char));
+ sprintf(cmd,"gzip -dc %s",fname);
+ fp = popen(cmd,"r");
+ free(cmd);
+ return fp;
+ }else{
+ return ckopen(fname,"r");
+ }
+
+}
+
+/* Open the current input file(s) of library libNo as selected by curr_type (1..5), then advance curr_index and set paired. */
+void openFileInLib(int libNo)
+{
+	int i = libNo;
+	if(lib_array[i].curr_type==1){	/* type 1: a1_fname/a2_fname pair */
+		printf("[%s]opened file:\n %s\n",
+			__FUNCTION__,lib_array[i].a1_fname[lib_array[i].curr_index]);
+		printf("[%s]opened file:\n %s\n",
+			__FUNCTION__,lib_array[i].a2_fname[lib_array[i].curr_index]);
+		lib_array[i].fp1 = openFile4read(lib_array[i].a1_fname[lib_array[i].curr_index]);
+		lib_array[i].fp2 = openFile4read(lib_array[i].a2_fname[lib_array[i].curr_index]);
+		lib_array[i].curr_index++;
+		lib_array[i].paired = 1;
+	}else if(lib_array[i].curr_type==2){	/* type 2: q1_fname/q2_fname pair */
+		printf("[%s]opened file:\n %s\n",
+			__FUNCTION__,lib_array[i].q1_fname[lib_array[i].curr_index]);
+		printf("[%s]opened file:\n %s\n",
+			__FUNCTION__,lib_array[i].q2_fname[lib_array[i].curr_index]);
+		lib_array[i].fp1 = openFile4read(lib_array[i].q1_fname[lib_array[i].curr_index]);
+		lib_array[i].fp2 = openFile4read(lib_array[i].q2_fname[lib_array[i].curr_index]);
+		lib_array[i].curr_index++;
+		lib_array[i].paired = 1;
+	}else if(lib_array[i].curr_type==3){	/* type 3: one p_fname file */
+		printf("[%s]opened file:\n %s\n",	/* __FUNCTION__ supplied: two %s need two arguments */
+			__FUNCTION__,lib_array[i].p_fname[lib_array[i].curr_index]);
+		lib_array[i].fp1 = openFile4read(lib_array[i].p_fname[lib_array[i].curr_index]);
+		lib_array[i].curr_index++;
+		lib_array[i].paired = 0;
+	}else if(lib_array[i].curr_type==4){	/* type 4: one s_a_fname file */
+		printf("[%s]opened file:\n %s\n",
+			__FUNCTION__,lib_array[i].s_a_fname[lib_array[i].curr_index]);
+		lib_array[i].fp1 = openFile4read(lib_array[i].s_a_fname[lib_array[i].curr_index]);
+		lib_array[i].curr_index++;
+		lib_array[i].paired = 0;
+	}else if(lib_array[i].curr_type==5){	/* type 5: one s_q_fname file */
+		printf("[%s]opened file:\n %s\n",
+			__FUNCTION__,lib_array[i].s_q_fname[lib_array[i].curr_index]);
+		lib_array[i].fp1 = openFile4read(lib_array[i].s_q_fname[lib_array[i].curr_index]);
+		lib_array[i].curr_index++;
+		lib_array[i].paired = 0;
+	}
+}
+
+static void reverse2k(char *src_seq,int len_seq)
+{
+ if(!len_seq)
+ return;
+
+ int i;
+ reverseComplementSeq(src_seq,len_seq,src_rc_seq);
+
+ for(i=0;i3&&strcmp(fname+strlen(fname)-3,".gz")==0)
+ pclose(lib_array[libNo].fp1);
+ else
+ fclose(lib_array[libNo].fp1);
+}
+
+/* Close fp2 of library libNo. Only curr_type 1 and 2 are handled; a name ending
+ * in ".gz" is closed with pclose(), any other with fclose(). */
+static void closeFp2InLab(int libNo)
+{
+	int prevIdx = lib_array[libNo].curr_index-1;	/* the entry just before curr_index */
+	char *path;
+	size_t plen;
+	switch(lib_array[libNo].curr_type){
+	case 1: path = lib_array[libNo].a2_fname[prevIdx]; break;
+	case 2: path = lib_array[libNo].q2_fname[prevIdx]; break;
+	default: return;
+	}
+	plen = strlen(path);
+	if(plen>3&&strcmp(path+plen-3,".gz")==0) pclose(lib_array[libNo].fp2);
+	else fclose(lib_array[libNo].fp2);
+}
+
+boolean read1seqInLib(char *src_seq, char *src_name, int *len_seq, int *libNo,boolean pair,unsigned char asm_ctg)
+{
+ int i = *libNo;
+ int prevLib = i;
+
+ if(!lib_array[i].fp1 // file1 does not exist
+ ||(lib_array[i].curr_type!=1&&feof(lib_array[i].fp1)) // file1 reaches end and not type1
+ ||(lib_array[i].curr_type==1&&feof(lib_array[i].fp1)&&feof(lib_array[i].fp2))){//f1&f2 reaches end
+ if(lib_array[i].fp1&&feof(lib_array[i].fp1)){
+ closeFp1InLab(i);
+ //printf("[%s]%d reads in current file , (%.1f) map-rate .\n",__FUNCTION__,single_count,single_map/single_count);
+ single_count=single_map=0;
+ }
+ if(lib_array[i].fp2&&feof(lib_array[i].fp2)){
+ closeFp2InLab(i);
+ //printf("[%s]%d reads in current file , (%.1f) map-rate .\n",__FUNCTION__,single_count,single_map/single_count);
+ single_count=single_map=0;
+ }
+
+ *libNo = nextValidIndex(i,pair,asm_ctg);
+ i = *libNo;
+ if(lib_array[i].rd_len_cutoff>0)
+ maxReadLen = lib_array[i].rd_len_cutoff=num_libs)
+ return 0;
+ openFileInLib(i);
+
+ if(lib_array[i].curr_type==1){
+ readseq1by1(src_seq, src_name,len_seq, lib_array[i].fp1,-1);
+ readseq1by1(src_seq, src_name,len_seq, lib_array[i].fp2,-1);
+ }else if(lib_array[i].curr_type==3||lib_array[i].curr_type==4)
+ readseq1by1(src_seq, src_name,len_seq, lib_array[i].fp1,-1);
+
+ }
+ if(lib_array[i].curr_type==1){
+ if(lib_array[i].paired==1){
+ readseq1by1(src_seq, src_name,len_seq, lib_array[i].fp1,1);
+ if(lib_array[i].reverse)
+ reverse2k(src_seq,*len_seq);
+ lib_array[i].paired = 2;
+ if(*len_seq>0||!feof(lib_array[i].fp1)){
+ n_solexa++;
+ return 1;
+ }
+ else
+ return read1seqInLib(src_seq,src_name,len_seq,libNo,pair,asm_ctg);
+ }else{
+ readseq1by1(src_seq, src_name,len_seq, lib_array[i].fp2,1);
+
+ if(lib_array[i].reverse)
+ reverse2k(src_seq,*len_seq);
+ lib_array[i].paired = 1;
+ n_solexa++;
+ return 1; //can't fail to read a read2
+ }
+ }
+ if(lib_array[i].curr_type==2){
+ if(lib_array[i].paired==1){
+ read1seqfq(src_seq, src_name,len_seq, lib_array[i].fp1);
+ /*
+ if(*len_seq>0){
+ for(j=0;j<*len_seq;j++)
+ printf("%c",int2base(src_seq[j]));
+ printf("\n");
+ }
+ */
+ if(lib_array[i].reverse)
+ reverse2k(src_seq,*len_seq);
+ lib_array[i].paired = 2;
+ if(*len_seq>0||!feof(lib_array[i].fp1)){
+ n_solexa++;
+ return 1;
+ }else
+ return read1seqInLib(src_seq,src_name,len_seq,libNo,pair,asm_ctg);
+ }else{
+ read1seqfq(src_seq, src_name,len_seq, lib_array[i].fp2);
+ if(lib_array[i].reverse)
+ reverse2k(src_seq,*len_seq);
+ lib_array[i].paired = 1;
+ n_solexa++;
+ return 1; //can't fail to read a read2
+ }
+ }
+ if(lib_array[i].curr_type==5)
+ read1seqfq(src_seq, src_name,len_seq, lib_array[i].fp1);
+ else{
+ readseq1by1(src_seq, src_name,len_seq, lib_array[i].fp1,1);
+ }
+ /*
+ int t;
+ for(t=0;t<*len_seq;t++)
+ printf("%d",src_seq[t]);
+ printf("\n");
+ */
+ if(lib_array[i].reverse)
+ reverse2k(src_seq,*len_seq);
+ if(*len_seq>0||!feof(lib_array[i].fp1)){
+ n_solexa++;
+ return 1;
+ }else
+ return read1seqInLib(src_seq,src_name,len_seq,libNo,pair,asm_ctg);
+}
diff --git a/fusion/scaffold.c b/fusion/scaffold.c
new file mode 100755
index 0000000..001c505
--- /dev/null
+++ b/fusion/scaffold.c
@@ -0,0 +1,60 @@
+#include "stdinc.h"
+#include "newhash.h"
+#include "extfunc.h"
+#include "extvab.h"
+
+static void initenv(int argc, char **argv);
+static void display_scaff_usage();
+
+static boolean LINK,SCAFF;
+
+
+int call_scaffold() // run the scaffolding steps on graphfile (declared outside this function); always returns 0
+{
+ time_t start_t,stop_t,time_bef,time_aft;
+ time(&start_t);
+
+ //initenv(argc, argv);
+
+ loadPEgrads(graphfile);
+
+ time(&time_bef); // time_bef/time_aft only feed the commented-out timing printfs
+ loadUpdatedEdges(graphfile);
+ time(&time_aft);
+ //printf("time spent on loading edges %ds\n",(int)(time_aft-time_bef));
+ // NOTE(review): SCAFF is a file-scope static that nothing in this file assigns, so this branch always runs
+ if(!SCAFF){
+ time(&time_bef);
+ PE2Links(graphfile);
+ time(&time_aft);
+ //printf("time spent on loading pair end info %ds\n",(int)(time_aft-time_bef));
+
+ time(&time_bef);
+ Links2Scaf(graphfile);
+ time(&time_aft);
+ //printf("time spent on creating scaffolds %ds\n",(int)(time_aft-time_bef));
+
+ scaffolding(100,graphfile);
+ }
+
+ prlReadsCloseGap(graphfile);
+
+
+// locateReadOnScaf(graphfile);
+ // release what the steps above left allocated
+ free_pe_mem();
+ if(index_array)
+ free((void *)index_array); // NOTE(review): index_array is not reset to NULL afterwards
+
+ freeContig_array();
+
+ //destroyPreArcMem();
+ destroyConnectMem();
+ deleteCntLookupTable();
+
+ time(&stop_t);
+ //printf("time elapsed: %dm\n",(int)(stop_t-start_t)/60);
+ printf("[%s]total time on scaffolding : %d minute(s).\n",__FUNCTION__,(int)(stop_t-start_t)/60);
+
+ return 0;
+}
diff --git a/fusion/searchPath.c b/fusion/searchPath.c
new file mode 100755
index 0000000..1f0015c
--- /dev/null
+++ b/fusion/searchPath.c
@@ -0,0 +1,169 @@
+#include "stdinc.h"
+#include "newhash.h"
+#include "extfunc.h"
+#include "extvab.h"
+
+static int trace_limit = 5000; //the times function is called in a search
+/*
+ search connection paths which were masked along related contigs
+ start from one contig, end with another
+ path length includes the length of the last contig
+*/
+void traceAlongMaskedCnt(unsigned int destE,unsigned int currE,int max_steps,int min,int max,
+ int index,int len,int *num_route)
+{
+ num_trace++;
+ if(num_trace>trace_limit||*num_route>=max_n_routes){
+ return;
+ }
+
+ unsigned int *array;
+ int num,i,length;
+ CONNECT *ite_cnt;
+
+ if(index>0)// there're at most max_steps edges stored in this array including the destination edge
+ length = len + contig_array[currE].length;
+ else
+ length = 0;
+ if(index>max_steps||length>max)
+ return; // this is the only situation we stop
+ if(index>0)// there're at most max_steps edges stored in this array including the destination edge
+ so_far[index-1] = currE;
+
+ if(currE==destE&&index==0){
+ printf("traceAlongMaskedCnt: start and destination are the same\n");
+ return;
+ }
+
+ if(currE==destE && length>=min &&length<=max){
+ num = *num_route;
+ array = found_routes[num];
+ for(i=0;imask||ite_cnt->deleted){
+ ite_cnt = ite_cnt->next;
+ continue;
+ }
+ traceAlongMaskedCnt(destE,ite_cnt->contigID,max_steps,min,max,
+ index+1,length + ite_cnt->gapLen,num_route);
+ ite_cnt = ite_cnt->next;
+ }
+
+}
+// search connection paths from one connect to a contig
+// path length includes the length of the last contig
+void traceAlongConnect(unsigned int destE,CONNECT *currCNT,int max_steps,int min,int max,int index,int len,int *num_route)
+{
+ num_trace++;
+ if(num_trace>trace_limit||*num_route>=max_n_routes){
+ return;
+ }
+
+ unsigned int *array,currE;
+ int num,i,length;
+ CONNECT *ite_cnt;
+
+ currE = currCNT->contigID;
+ length = len + currCNT->gapLen;
+ length += contig_array[currE].length;
+
+ if(index>max_steps||length>max)
+ return; // this is the only situation we stop
+ /*
+ if(globalFlag)
+ printf("B: step %d, ctg %d, length %d\n",index,currCNT->contigID,length);
+ */
+ if(currE==destE&&index==1){
+ printf("traceAlongConnect: start and destination are the same\n");
+ return;
+ }
+
+ so_far[index-1] = currE; // there're at most max_steps edges stored in this array including the destination edge
+
+ if(currE==destE && length>=min &&length<=max){
+ num = *num_route;
+ array = found_routes[num];
+ for(i=0;inextInScaf){
+ traceAlongConnect(destE,currCNT->nextInScaf,max_steps,min,max,index+1,length,num_route);
+ return;
+ }
+
+ ite_cnt = contig_array[currE].downwardConnect;
+ while(ite_cnt){
+ if(ite_cnt->mask||ite_cnt->deleted){
+ ite_cnt = ite_cnt->next;
+ continue;
+ }
+ traceAlongConnect(destE,ite_cnt,max_steps,min,max,index+1,length,num_route);
+ ite_cnt = ite_cnt->next;
+ }
+
+}
+
+//find paths in the graph from currE to destE, its length does not include length of both end contigs
+void traceAlongArc(unsigned int destE,unsigned int currE,int max_steps,int min,int max,int index,int len,int *num_route)
+{
+ num_trace++;
+ if(num_trace>trace_limit||*num_route>=max_n_routes){
+ return;
+ }
+
+ unsigned int *array,out_ed,vt;
+ int num,i,pos,length;
+ preARC *parc;
+
+ pos = index;
+ if(pos>max_steps||len>max)
+ return; // this is the only situation we stop
+ if(currE==destE&&pos==0){
+ printf("traceAlongArc: start and destination are the same\n");
+ return;
+ }
+
+ if(pos>0) // pos starts with 0 for the starting edge
+ so_far[pos-1] = currE; // there're at most max_steps edges stored in this array including the destination edge
+
+ if(currE==destE && len>=min){
+ num = *num_route;
+ array = found_routes[num];
+ for(i=0;i0) //not the starting edge
+ length = len + contig_array[currE].length;
+ else
+ length = len;
+
+
+ vt = contig_array[currE].to_vt;
+
+ parc = contig_array[currE].arcs;
+ while(parc){
+ out_ed = parc->to_ed;
+ traceAlongArc(destE,out_ed,max_steps,min,max,pos,length,num_route);
+ parc = parc->next;
+ }
+
+}
diff --git a/fusion/seq.c b/fusion/seq.c
new file mode 100755
index 0000000..664d4a7
--- /dev/null
+++ b/fusion/seq.c
@@ -0,0 +1,169 @@
+#include "stdinc.h"
+#include "newhash.h"
+#include "extfunc.h"
+#include "extvab.h"
+
+/*
+put an insertSize in the grads array,
+if all grads have been entered and all the boundaries have been set, return 0
+*/
+
+/* Print kmer to fp in hexadecimal ("0x0" when kmer is zero), followed by the character c. */
+void print_kmer(FILE *fp,Kmer kmer,char c)
+{
+	if(!kmer)
+		fprintf(fp,"0x0");
+	else
+		fprintf(fp,"%llx",kmer);
+	fprintf(fp,"%c",c);
+}
+
+void printTightString(char *tightSeq,int len)
+{
+ int i;
+
+ for(i=0;i> 2);
+ seq = ((seq & 0x0F0F0F0F0F0F0F0FLLU)<< 4) | ((seq & 0xF0F0F0F0F0F0F0F0LLU)>> 4);
+ seq = ((seq & 0x00FF00FF00FF00FFLLU)<< 8) | ((seq & 0xFF00FF00FF00FF00LLU)>> 8);
+ seq = ((seq & 0x0000FFFF0000FFFFLLU)<<16) | ((seq & 0xFFFF0000FFFF0000LLU)>>16);
+ seq = ((seq & 0x00000000FFFFFFFFLLU)<<32) | ((seq & 0xFFFFFFFF00000000LLU)>>32);
+ return seq >> (64 - (seq_size<<1));
+}
+
+Kmer reverseComplementVerbose(Kmer word,int overlap) // reverse complement of word; simply forwards to fastReverseComp()
+{
+ return fastReverseComp(word,overlap); // everything below this return is the older, disabled loop version
+ /*
+ int index;
+ Kmer revComp = 0;
+ Kmer copy = word;
+ unsigned char nucleotide;
+
+ for (index = 0; index < overlap; index++) {
+ nucleotide = copy & 3;
+ revComp <<= 2;
+ revComp += int_comp(nucleotide);//3 - nucleotide;
+ copy >>= 2;
+ }
+ return revComp;
+ */
+}
+
+Kmer reverseComplement(Kmer word,int overlap) // reverse complement of word; thin wrapper
+{
+ return fastReverseComp(word,overlap); // same call as reverseComplementVerbose()
+}
+
+/*
+ * Store the code nt at position pos of a packed sequence: four 2-bit slots
+ * per byte, with position 0 in the two highest bits. The slot is cleared and
+ * nt is then added in (presumably nt is always 0..3 - confirm with callers).
+ */
+void writeChar2tightString(char nt,char *tightSeq,int pos)
+{
+	char *cell = tightSeq + pos/4;
+	int slot = pos%4;
+	int shift;
+
+	/* a negative pos gives a negative remainder: the buffer is left untouched */
+	if(slot<0||slot>3)
+		return;
+
+	/* slots 0..3 sit at bit offsets 6,4,2,0; the masks work out to 63,207,243,252 */
+	shift = 6 - 2*slot;
+	*cell &= 0xFF ^ (3 << shift);
+
+	/* the cleared slot now takes the code */
+	*cell += nt << shift;
+	return;
+}
+
+/* Fetch the 2-bit code stored at position pos of a packed sequence
+ * (four slots per byte, position 0 in the two highest bits). */
+char getCharInTightString(char *tightSeq,int pos)
+{
+	char *cell = tightSeq+pos/4;
+	int slot = pos%4;
+	int shift = 6 - 2*slot;	/* bit offset of the slot inside its byte */
+
+	/* negative pos: remainder below zero, report code 0 without reading the byte */
+	if(slot<0||slot>3)
+		return 0;
+
+	/* isolate the two bits of the slot (masks 192,48,12,3) and shift them down */
+	return (*cell & (3 << shift)) >> shift;
+}
+
+// reverse complement of a sequence denoted 0, 1, 2, 3:
+// bal_seq[j] receives int_comp(seq[len-1-j]); nothing is written when len < 1
+void reverseComplementSeq(char *seq, int len,char *bal_seq)
+{
+	int j;
+
+	if(len<1)
+		return;
+
+	for(j=0;j<len;j++)
+		bal_seq[j] = int_comp(seq[len-1-j]);
+	return;
+}
+
+// reverse complement of a sequence denoted 0, 1, 2, 3, returned in a
+// newly ckalloc'ed array of len codes; codes of 4 and above are copied
+// through unchanged. Returns NULL when len < 1.
+char *compl_int_seq(char *seq, int len)
+{
+	char *out,code;
+	int j;
+
+	if(len<1)
+		return NULL;
+
+	out = (char *)ckalloc(len*sizeof(char));
+	/* out[j] mirrors seq[len-1-j] */
+	for(j=0;j<len;j++){
+		code = seq[len-1-j];
+		if(code>=4)
+			out[j] = code;
+		else
+			out[j] = int_comp(code);//3-c;
+	}
+	return out;
+}
+
+/* Fold len codes into one integer in base 4, first code most significant. */
+long long trans_seq(char *seq, int len)
+{
+	long long packed = 0;
+	char *cur = seq;
+	int left;
+
+	for(left=len; left>0; left--)
+		packed = packed * 4 + *cur++;
+
+	return(packed);
+}
+
+/* Expand word into overlaplen 2-bit codes (last code taken from the lowest bits)
+ * stored in a newly ckalloc'ed array, which is returned and not freed here. */
+char *kmer2seq(Kmer word)
+{
+	char *codes = (char *)ckalloc(overlaplen*sizeof(char));
+	int j = overlaplen;
+
+	while(j-->0){
+		codes[j] = word & 3;
+		word >>= 2;
+	}
+	return codes;
+}
diff --git a/fusion/stack.c b/fusion/stack.c
new file mode 100755
index 0000000..707dd9e
--- /dev/null
+++ b/fusion/stack.c
@@ -0,0 +1,113 @@
+#include "stack.h"
+
+/* Allocate an empty stack whose blocks hold num_items items of unit_size bytes each; NULL when out of memory. */
+STACK *createStack(int num_items,size_t unit_size)
+{
+	STACK *newStack = (STACK *)calloc(1,sizeof(STACK));	/* zeroed: item_c and index_in_block start at 0 */
+	if(!newStack) return NULL;	/* the allocation used to be dereferenced unchecked */
+	newStack->block_list = NULL;
+	newStack->items_per_block = num_items;
+	newStack->item_size = unit_size;
+	return newStack;
+}
+
+/* Reset the stack to empty while keeping every allocated block for reuse by later pushes. */
+void emptyStack(STACK *astack)
+{
+	BLOCK_STARTER *block;
+
+	if(!astack||!astack->block_list)
+		return;
+	block = astack->block_list;
+	while(block->next)	/* rewind to the end of the next-chain; one step left older blocks unused once 3+ existed */
+		block = block->next;
+	astack->block_list = block;
+	astack->item_c = 0;
+	astack->index_in_block = 0;
+}
+
+/* Release every block of the stack and then the stack header itself; NULL is ignored. */
+void freeStack(STACK *astack)
+{
+	BLOCK_STARTER *cursor,*doomed;
+
+	if(!astack)
+		return;
+
+	/* seek the far end of the next-chain ... */
+	for(cursor=astack->block_list; cursor&&cursor->next; cursor=cursor->next)
+		;
+	/* ... then free backwards along prev, which also covers blocks on the prev side of block_list */
+	while(cursor){
+		doomed = cursor;
+		cursor = cursor->prev;
+		free((void *)doomed);
+	}
+
+	free((void *)astack);
+}
+
+void stackBackup(STACK *astack)	/* save block_list, index_in_block and item_c into the *_backup fields */
+{
+	astack->item_c_backup = astack->item_c;
+	astack->index_backup = astack->index_in_block;
+	astack->block_backup = astack->block_list;
+}
+
+void stackRecover(STACK *astack)	/* restore block_list, index_in_block and item_c from the *_backup fields */
+{
+	astack->item_c = astack->item_c_backup;
+	astack->index_in_block = astack->index_backup;
+	astack->block_list = astack->block_backup;
+}
+
+/* Pop the most recently pushed item and return its address inside the block storage
+ * (nothing is freed here), or NULL when the stack is missing or holds no items. */
+void *stackPop(STACK *astack)
+{
+	BLOCK_STARTER *top;
+	char *items;
+
+	if(!astack||!astack->block_list||!astack->item_c)
+		return NULL;
+	top = astack->block_list;
+	items = (char *)top + sizeof(BLOCK_STARTER);	/* item storage follows the block header */
+	astack->item_c--;
+	if(astack->index_in_block!=1)
+		return (void *)(items + astack->item_size*(--astack->index_in_block));
+	/* taking the first item of a block: step to the next block, or mark the stack empty */
+	if(top->next){
+		astack->block_list = top->next;
+		astack->index_in_block = astack->items_per_block;
+	}else
+		astack->index_in_block = astack->item_c = 0;
+	return (void *)items;
+}
+
+/* Reserve the next item slot and return its address (item_size bytes, not initialised here).
+ * A block is allocated only when the current one is full and no block is waiting on the prev
+ * side. Returns NULL for a NULL stack or when a block cannot be allocated.
+ */
+void *stackPush(STACK *astack)
+{
+	BLOCK_STARTER *block;
+	if(!astack)
+		return NULL;
+	if(!astack->block_list||(astack->index_in_block==astack->items_per_block&&!astack->block_list->prev)){
+		block = malloc(sizeof(BLOCK_STARTER)+astack->items_per_block*astack->item_size);
+		if(!block)	/* used to be dereferenced unchecked; the stack is left unchanged */
+			return NULL;
+		block->prev = NULL;
+		if(astack->block_list)
+			astack->block_list->prev = block;
+		block->next = astack->block_list;
+		astack->block_list = block;
+		astack->index_in_block = 0;
+	}else if(astack->index_in_block==astack->items_per_block){
+		astack->block_list = astack->block_list->prev;	/* reuse a block that is already allocated */
+		astack->index_in_block = 0;
+	}
+	astack->item_c++;	/* counted only once a slot is certain to exist */
+	block = astack->block_list;
+	return (void *)((char *)block+sizeof(BLOCK_STARTER)+astack->item_size*astack->index_in_block++);
+}
diff --git a/sparsePregraph/Makefile b/sparsePregraph/Makefile
index 9c88a75..980a616 100644
--- a/sparsePregraph/Makefile
+++ b/sparsePregraph/Makefile
@@ -1,8 +1,8 @@
-CC= g++ # /opt/blc/gcc-4.5.0/bin/gcc #gcc
+CC= g++
ifdef debug
-CFLAGS= -O0 -g -fomit-frame-pointer #-static #-mcrc32 -march=core2 -msse4.1 -msse4.2
+CFLAGS= -O0 -g -fomit-frame-pointer #-mcrc32
else
-CFLAGS= -O4 -fomit-frame-pointer #-static #-mcrc32 -march=core2 -msse4.1 -msse4.2
+CFLAGS= -O3 -fomit-frame-pointer -w #-mcrc32
endif
DFLAGS=
@@ -37,15 +37,6 @@ EXTRA_FLAGS += -Wl,--hash-style=both
LIBS += -lbam
endif
-ifneq (,$(findstring Unix,$(shell uname)))
-EXTRA_FLAGS += -Wl,--hash-style=both
-LIBS += -lbam -lrt
-endif
-
-ifneq (,$(findstring Darwin,$(shell uname)))
-LIBS += -lbammac
-endif
-
ifneq (,$(findstring $(shell uname -m), x86_64))
CFLAGS += -m64
endif
@@ -61,26 +52,23 @@ endif
.SUFFIXES:.cpp .o
.cpp.o:
- @printf "Compiling $<... \r"; \
- $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< || echo "Error in command: $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $<"
+# A compiler failure must fail the rule: with a bare "|| echo" the recipe reports success to make.
+	@printf "Compiling $<... \r"; $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< || { echo "Error in command: $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $<"; exit 1; }
all: clean $(OBJS)
+ @printf "$(PROG) objects generated. \n"
#pregraph_sparse
-.PHONY:all clean install
+.PHONY:all clean
envTest:
@test $(BIT_ERR) != 1 || sh -c 'echo "Fatal: 64bit CPU and Operating System required!";false;'
-pregraph_sparse: clean envTest $(OBJS)
- @printf "Linking... \r"
- #@$(CC) $(CFLAGS)$(INCLUDES) -o $(PROG) $(OBJS) $(LIBPATH) $(LIBS) $(ENTRAFLAGS)
- @printf "$(PROG) compilation done.\n";
+# Standalone link: CFLAGS and INCLUDES need a separating space; ENTRAFLAGS looked like a typo for EXTRA_FLAGS.
+pregraph_sparse: clean envTest $(OBJS)
+	@printf "Linking... \r"; $(CC) $(CFLAGS) $(INCLUDES) -o $(PROG) $(OBJS) $(LIBPATH) $(LIBS) $(EXTRA_FLAGS)
+	@printf "$(PROG) compilation done. \n"
clean:
- @rm -fr gmon.out *.o a.out *.exe *.dSYM $(PROG) *~ *.a *.so.* *.so *.dylib
- @printf "$(PROG) cleaning done.\n";
-
-install:
- @cp $(PROG) ../bin/
- @printf "$(PROG) installed at ../bin/$(PROG)\n"
+ @rm -fr gmon.out *.o a.out $(PROG)
+ @printf "$(PROG) cleaning done. \n"
diff --git a/sparsePregraph/build_edge.cpp b/sparsePregraph/build_edge.cpp
index 2b9ceab..f6639bb 100644
--- a/sparsePregraph/build_edge.cpp
+++ b/sparsePregraph/build_edge.cpp
@@ -1,7 +1,7 @@
/*
* build_edge.cpp
*
- * Copyright (c) 2008-2012 BGI-Shenzhen .
+ * Copyright (c) 2008-2016 Ruibang Luo .
*
* This file is part of SOAPdenovo.
*
diff --git a/sparsePregraph/build_preArc.cpp b/sparsePregraph/build_preArc.cpp
index ad4f2ca..c271e26 100644
--- a/sparsePregraph/build_preArc.cpp
+++ b/sparsePregraph/build_preArc.cpp
@@ -1,7 +1,7 @@
/*
* build_preArc.cpp
*
- * Copyright (c) 2008-2012 BGI-Shenzhen .
+ * Copyright (c) 2008-2016 Ruibang Luo .
*
* This file is part of SOAPdenovo.
*
diff --git a/sparsePregraph/change.log b/sparsePregraph/change.log
deleted file mode 100644
index 99a26ff..0000000
--- a/sparsePregraph/change.log
+++ /dev/null
@@ -1,24 +0,0 @@
-1.change the edge node
-
-old:
-struct edge_node
-{
- uint64_t edge£º50£¬edge_cov:7,len:6,used:1£»
- struct edge_node *nxt_edge;
-};
-
-now:
-struct edge_node
-{
- uint64_t edge;
- uint64_t edge_cov:7,len:6,used:1,deleted:1;
- struct edge_node *nxt_edge;
-};
-
-so, the LoadGraph... function can't work when performed on an old hash data set.
-
-
-2. support bam format
-3. support -R
-4. support 127mer
-5. build vertex K_size -> gap .
\ No newline at end of file
diff --git a/sparsePregraph/convert_soapdenovo.cpp b/sparsePregraph/convert_soapdenovo.cpp
index 89852c8..f9ec777 100644
--- a/sparsePregraph/convert_soapdenovo.cpp
+++ b/sparsePregraph/convert_soapdenovo.cpp
@@ -1,7 +1,7 @@
/*
* convert_soapdenovo.cpp
*
- * Copyright (c) 2008-2012 BGI-Shenzhen .
+ * Copyright (c) 2008-2016 Ruibang Luo .
*
* This file is part of SOAPdenovo.
*
diff --git a/sparsePregraph/global.cpp b/sparsePregraph/global.cpp
index 2a49afd..18c9a23 100644
--- a/sparsePregraph/global.cpp
+++ b/sparsePregraph/global.cpp
@@ -1,7 +1,7 @@
/*
* global.cpp
*
- * Copyright (c) 2008-2012 BGI-Shenzhen .
+ * Copyright (c) 2008-2016 Ruibang Luo .
*
* This file is part of SOAPdenovo.
*
diff --git a/sparsePregraph/inc/build_edge.h b/sparsePregraph/inc/build_edge.h
index 2b6fa03..adb27c6 100644
--- a/sparsePregraph/inc/build_edge.h
+++ b/sparsePregraph/inc/build_edge.h
@@ -1,7 +1,7 @@
/*
* inc/sparse_kmer.h
*
- * Copyright (c) 2008-2012 BGI-Shenzhen .
+ * Copyright (c) 2008-2016 Ruibang Luo .
*
* This file is part of SOAPdenovo.
*
diff --git a/sparsePregraph/inc/build_graph.h b/sparsePregraph/inc/build_graph.h
index 6ff5818..b603331 100644
--- a/sparsePregraph/inc/build_graph.h
+++ b/sparsePregraph/inc/build_graph.h
@@ -1,7 +1,7 @@
/*
* inc/build_graph.h
*
- * Copyright (c) 2008-2012 BGI-Shenzhen .
+ * Copyright (c) 2008-2016 Ruibang Luo .
*
* This file is part of SOAPdenovo.
*
diff --git a/sparsePregraph/inc/build_preArc.h b/sparsePregraph/inc/build_preArc.h
index df683ca..7513739 100644
--- a/sparsePregraph/inc/build_preArc.h
+++ b/sparsePregraph/inc/build_preArc.h
@@ -1,7 +1,7 @@
/*
* inc/build_preArc.h
*
- * Copyright (c) 2008-2012 BGI-Shenzhen .
+ * Copyright (c) 2008-2016 Ruibang Luo .
*
* This file is part of SOAPdenovo.
*
diff --git a/sparsePregraph/inc/convert_soapdenovo.h b/sparsePregraph/inc/convert_soapdenovo.h
index 05fe016..fb7768c 100644
--- a/sparsePregraph/inc/convert_soapdenovo.h
+++ b/sparsePregraph/inc/convert_soapdenovo.h
@@ -1,7 +1,7 @@
/*
* inc/convert_soapdenovo.h
*
- * Copyright (c) 2008-2012 BGI-Shenzhen .
+ * Copyright (c) 2008-2016 Ruibang Luo .
*
* This file is part of SOAPdenovo.
*
diff --git a/sparsePregraph/inc/global.h b/sparsePregraph/inc/global.h
index eebaeca..94ee249 100644
--- a/sparsePregraph/inc/global.h
+++ b/sparsePregraph/inc/global.h
@@ -1,7 +1,7 @@
/*
* inc/global.h
*
- * Copyright (c) 2008-2012 BGI-Shenzhen .
+ * Copyright (c) 2008-2016 Ruibang Luo .
*
* This file is part of SOAPdenovo.
*
diff --git a/sparsePregraph/inc/io_func.h b/sparsePregraph/inc/io_func.h
index 24066fd..105eb3f 100644
--- a/sparsePregraph/inc/io_func.h
+++ b/sparsePregraph/inc/io_func.h
@@ -1,7 +1,7 @@
/*
* inc/io_func.h
*
- * Copyright (c) 2008-2012 BGI-Shenzhen .
+ * Copyright (c) 2008-2016 Ruibang Luo .
*
* This file is part of SOAPdenovo.
*
diff --git a/sparsePregraph/inc/libcurses.a b/sparsePregraph/inc/libcurses.a
deleted file mode 100644
index a3863b8..0000000
Binary files a/sparsePregraph/inc/libcurses.a and /dev/null differ
diff --git a/sparsePregraph/inc/multi_threads.h b/sparsePregraph/inc/multi_threads.h
index 2155d91..68e8f8e 100644
--- a/sparsePregraph/inc/multi_threads.h
+++ b/sparsePregraph/inc/multi_threads.h
@@ -1,7 +1,7 @@
/*
* inc/multi_threads.h
*
- * Copyright (c) 2008-2012 BGI-Shenzhen .
+ * Copyright (c) 2008-2016 Ruibang Luo .
*
* This file is part of SOAPdenovo.
*
diff --git a/sparsePregraph/inc/sparse_kmer.h b/sparsePregraph/inc/sparse_kmer.h
index d013ded..650d376 100644
--- a/sparsePregraph/inc/sparse_kmer.h
+++ b/sparsePregraph/inc/sparse_kmer.h
@@ -1,7 +1,7 @@
/*
* inc/sparse_kmer.h
*
- * Copyright (c) 2008-2012 BGI-Shenzhen .
+ * Copyright (c) 2008-2016 Ruibang Luo .
*
* This file is part of SOAPdenovo.
*
diff --git a/sparsePregraph/inc/stdinc.h b/sparsePregraph/inc/stdinc.h
index 5cd64c0..3a02528 100644
--- a/sparsePregraph/inc/stdinc.h
+++ b/sparsePregraph/inc/stdinc.h
@@ -1,7 +1,7 @@
/*
* inc/stdinc.h
*
- * Copyright (c) 2008-2012 BGI-Shenzhen .
+ * Copyright (c) 2008-2016 Ruibang Luo .
*
* This file is part of SOAPdenovo.
*
diff --git a/sparsePregraph/inc/xcurses.h.gch b/sparsePregraph/inc/xcurses.h.gch
deleted file mode 100644
index e01c34a..0000000
Binary files a/sparsePregraph/inc/xcurses.h.gch and /dev/null differ
diff --git a/sparsePregraph/io_func.cpp b/sparsePregraph/io_func.cpp
index 6832bce..349ff87 100644
--- a/sparsePregraph/io_func.cpp
+++ b/sparsePregraph/io_func.cpp
@@ -1,7 +1,7 @@
/*
* io_func.cpp
*
- * Copyright (c) 2008-2012 BGI-Shenzhen .
+ * Copyright (c) 2008-2016 Ruibang Luo .
*
* This file is part of SOAPdenovo.
*
diff --git a/sparsePregraph/multi_threads.cpp b/sparsePregraph/multi_threads.cpp
index 5aa755d..0847f51 100644
--- a/sparsePregraph/multi_threads.cpp
+++ b/sparsePregraph/multi_threads.cpp
@@ -1,7 +1,7 @@
/*
* multi_threads.cpp
*
- * Copyright (c) 2008-2012 BGI-Shenzhen .
+ * Copyright (c) 2008-2016 Ruibang Luo .
*
* This file is part of SOAPdenovo.
*
diff --git a/sparsePregraph/pregraph_sparse.cpp b/sparsePregraph/pregraph_sparse.cpp
index cd422f3..d425c97 100644
--- a/sparsePregraph/pregraph_sparse.cpp
+++ b/sparsePregraph/pregraph_sparse.cpp
@@ -1,7 +1,7 @@
/*
* pregraph_sparse.cpp
*
- * Copyright (c) 2008-2012 BGI-Shenzhen .
+ * Copyright (c) 2008-2016 Ruibang Luo .
*
* This file is part of SOAPdenovo.
*
diff --git a/standardPregraph/Makefile b/standardPregraph/Makefile
index 9716391..421f400 100644
--- a/standardPregraph/Makefile
+++ b/standardPregraph/Makefile
@@ -9,9 +9,9 @@ CC= gcc
GCCVERSIONMAJOR := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
GCCVERSIONMINOR := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 4)
ifdef debug
-CFLAGS= -O0 -g -fomit-frame-pointer #-static #-mcrc32 -march=core2 -msse4.1 -msse4.2
+CFLAGS= -O0 -g -fomit-frame-pointer #-mcrc32
else
-CFLAGS= -O4 -fomit-frame-pointer #-static #-mcrc32 -march=core2 -msse4.1 -msse4.2
+CFLAGS= -O3 -fomit-frame-pointer -w #-mcrc32
endif
DFLAGS=
OBJS= arc.o attachPEinfo.o bubble.o check.o compactEdge.o \
@@ -49,15 +49,6 @@ EXTRA_FLAGS += -Wl,--hash-style=both
LIBS += -lbam -lrt
endif
-ifneq (,$(findstring Unix,$(shell uname)))
-EXTRA_FLAGS += -Wl,--hash-style=both
-LIBS += -lbam -lrt
-endif
-
-ifneq (,$(findstring Darwin,$(shell uname)))
-LIBS += -lbammac
-endif
-
ifneq (,$(findstring $(shell uname -m), x86_64))
CFLAGS += -m64
endif
@@ -73,12 +64,11 @@ endif
.SUFFIXES:.c .o
.c.o:
- @printf "Compiling $<... \r"; \
- $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< || echo "Error in command: $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $<"
+# A compiler failure must fail the rule: with a bare "|| echo" the recipe reports success to make.
+	@printf "Compiling $<... \r"; $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< || { echo "Error in command: $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $<"; exit 1; }
all: clean $(OBJS)
-
-#SOAPdenovo
+ @printf "$(PROG) objects generated. \n"
.PHONY:all clean install
@@ -88,14 +78,10 @@ envTest:
@test $(GCCVERSIONMINOR) == 1 || sh -c 'echo "GCC version lower than 4.4.0";false;'
SOAPdenovo: envTest $(OBJS)
- @printf "Linking... \r"
+ @printf "Linking... \r"
- @$(CC) $(CFLAGS) -o $(PROG) $(OBJS) $(LIBPATH) $(LIBS) $(ENTRAFLAGS)
+ @$(CC) $(CFLAGS) -o $(PROG) $(OBJS) $(LIBPATH) $(LIBS) $(EXTRA_FLAGS)
- @printf "$(PROG) compilation done.\n";
+ @printf "$(PROG) compilation done. \n"
clean:
@rm -fr gmon.out *.o a.out *.exe *.dSYM $(PROG) *~ *.a *.so.* *.so *.dylib
- @printf "$(PROG) cleaning done.\n";
-
-install:
- @cp $(PROG) ../bin/
- @printf "$(PROG) installed at ../bin/$(PROG)\n"
+ @printf "$(PROG) cleaning done. \n"
diff --git a/standardPregraph/arc.c b/standardPregraph/arc.c
index 32cf554..08649a6 100644
--- a/standardPregraph/arc.c
+++ b/standardPregraph/arc.c
@@ -1,7 +1,7 @@
/*
* arc.c
*
- * Copyright (c) 2008-2012 BGI-Shenzhen .
+ * Copyright (c) 2008-2016 Ruibang Luo .
*
* This file is part of SOAPdenovo.
*
diff --git a/standardPregraph/attachPEinfo.c b/standardPregraph/attachPEinfo.c
index 77105d5..f1ac7cf 100644
--- a/standardPregraph/attachPEinfo.c
+++ b/standardPregraph/attachPEinfo.c
@@ -1,7 +1,7 @@
/*
* attachPEinfo.c
*
- * Copyright (c) 2008-2012 BGI-Shenzhen .
+ * Copyright (c) 2008-2016 Ruibang Luo .
*
* This file is part of SOAPdenovo.
*
diff --git a/standardPregraph/bubble.c b/standardPregraph/bubble.c
index dd83a4c..c7abf7f 100644
--- a/standardPregraph/bubble.c
+++ b/standardPregraph/bubble.c
@@ -1,7 +1,7 @@
/*
* bubble.c
*
- * Copyright (c) 2008-2012 BGI-Shenzhen .
+ * Copyright (c) 2008-2016 Ruibang Luo .
*
* This file is part of SOAPdenovo.
*
diff --git a/standardPregraph/check.c b/standardPregraph/check.c
index be06fba..cf39402 100644
--- a/standardPregraph/check.c
+++ b/standardPregraph/check.c
@@ -1,7 +1,7 @@
/*
* check.c
*
- * Copyright (c) 2008-2012 BGI-Shenzhen .
+ * Copyright (c) 2008-2016 Ruibang Luo .
*
* This file is part of SOAPdenovo.
*
diff --git a/standardPregraph/compactEdge.c b/standardPregraph/compactEdge.c
index edf9824..6979760 100644
--- a/standardPregraph/compactEdge.c
+++ b/standardPregraph/compactEdge.c
@@ -1,7 +1,7 @@
/*
* compactEdge.c
*
- * Copyright (c) 2008-2012 BGI-Shenzhen .
+ * Copyright (c) 2008-2016 Ruibang Luo .
*
* This file is part of SOAPdenovo.
*
diff --git a/standardPregraph/concatenateEdge.c b/standardPregraph/concatenateEdge.c
index c795e46..18fa761 100644
--- a/standardPregraph/concatenateEdge.c
+++ b/standardPregraph/concatenateEdge.c
@@ -1,7 +1,7 @@
/*
* concatenateEdge.c
*
- * Copyright (c) 2008-2012 BGI-Shenzhen .
+ * Copyright (c) 2008-2016 Ruibang Luo .
*
* This file is part of SOAPdenovo.
*
diff --git a/standardPregraph/connect.c b/standardPregraph/connect.c
index 1f10a8c..60ac7bb 100644
--- a/standardPregraph/connect.c
+++ b/standardPregraph/connect.c
@@ -1,7 +1,7 @@
/*
* connect.c
*
- * Copyright (c) 2008-2012 BGI-Shenzhen .
+ * Copyright (c) 2008-2016 Ruibang Luo .
*
* This file is part of SOAPdenovo.
*
diff --git a/standardPregraph/contig.c b/standardPregraph/contig.c
index f1ce229..6d8c0c1 100644
--- a/standardPregraph/contig.c
+++ b/standardPregraph/contig.c
@@ -1,7 +1,7 @@
/*
* contig.c
*
- * Copyright (c) 2008-2012 BGI-Shenzhen .
+ * Copyright (c) 2008-2016 Ruibang Luo .
*
* This file is part of SOAPdenovo.
*
diff --git a/standardPregraph/cutTipPreGraph.c b/standardPregraph/cutTipPreGraph.c
index b1fb2c8..1594be9 100644
--- a/standardPregraph/cutTipPreGraph.c
+++ b/standardPregraph/cutTipPreGraph.c
@@ -1,7 +1,7 @@
/*
* cutTipPreGraph.c
*
- * Copyright (c) 2008-2012 BGI-Shenzhen .
+ * Copyright (c) 2008-2016 Ruibang Luo .
*
* This file is part of SOAPdenovo.
*
diff --git a/standardPregraph/cutTip_graph.c b/standardPregraph/cutTip_graph.c
index 651c1f8..4cfc0be 100644
--- a/standardPregraph/cutTip_graph.c
+++ b/standardPregraph/cutTip_graph.c
@@ -1,7 +1,7 @@
/*
* cutTip_graph.c
*
- * Copyright (c) 2008-2012 BGI-Shenzhen .
+ * Copyright (c) 2008-2016 Ruibang Luo .
*
* This file is part of SOAPdenovo.
*
diff --git a/standardPregraph/cutTip_graph2.c b/standardPregraph/cutTip_graph2.c
index 9ab776c..12da91b 100644
--- a/standardPregraph/cutTip_graph2.c
+++ b/standardPregraph/cutTip_graph2.c
@@ -1,7 +1,7 @@
/*
* cutTip_graph2.c
*
- * Copyright (c) 2008-2012 BGI-Shenzhen .
+ * Copyright (c) 2008-2016 Ruibang Luo .
*
* This file is part of SOAPdenovo.
*
diff --git a/standardPregraph/darray.c b/standardPregraph/darray.c
index 66ca668..46fdad0 100644
--- a/standardPregraph/darray.c
+++ b/standardPregraph/darray.c
@@ -1,7 +1,7 @@
/*
* darray.c
*
- * Copyright (c) 2008-2012 BGI-Shenzhen .
+ * Copyright (c) 2008-2016 Ruibang Luo .
*
* This file is part of SOAPdenovo.
*
diff --git a/standardPregraph/dfibHeap.c b/standardPregraph/dfibHeap.c
index 900171d..5399b70 100644
--- a/standardPregraph/dfibHeap.c
+++ b/standardPregraph/dfibHeap.c
@@ -1,7 +1,7 @@
/*
* dfibHeap.c
*
- * Copyright (c) 2008-2012 BGI-Shenzhen .
+ * Copyright (c) 2008-2016 Ruibang Luo .
*
* This file is part of SOAPdenovo.
*
diff --git a/standardPregraph/fibHeap.c b/standardPregraph/fibHeap.c
index f690fb8..51f4e21 100644
--- a/standardPregraph/fibHeap.c
+++ b/standardPregraph/fibHeap.c
@@ -1,7 +1,7 @@
/*
* fibHeap.c
*
- * Copyright (c) 2008-2012 BGI-Shenzhen .
+ * Copyright (c) 2008-2016 Ruibang Luo .
*
* This file is part of SOAPdenovo.
*
diff --git a/standardPregraph/hashFunction.c b/standardPregraph/hashFunction.c
index bc7065d..6de56cc 100644
--- a/standardPregraph/hashFunction.c
+++ b/standardPregraph/hashFunction.c
@@ -1,7 +1,7 @@
/*
* hashFunction.c
*
- * Copyright (c) 2008-2012 BGI-Shenzhen .
+ * Copyright (c) 2008-2016 Ruibang Luo