Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

This commit was manufactured by cvs2svn to create branch

'bioperl-ext-branch-1-5-1'.

svn path=/bioperl-ext/branches/bioperl-ext-branch-1-5-1/; revision=658
  • Loading branch information...
commit 8695d5b2014b91e934c4550ae222196bcb202091 1 parent cab3ef9
nobody authored
Showing with 3,786 additions and 36 deletions.
  1. +219 −1 Bio/Ext/Align/Align.xs
  2. +3 −3 Bio/Ext/Align/Makefile.PL
  3. +30 −0 Bio/Ext/Align/blosum45.mat
  4. +29 −0 Bio/Ext/Align/blosum50.mat
  5. +30 −0 Bio/Ext/Align/blosum62.mat
  6. +298 −0 Bio/Ext/Align/libs/dpalign.c
  7. +66 −0 Bio/Ext/Align/libs/dpalign.h
  8. +779 −0 Bio/Ext/Align/libs/linspc.c
  9. +5 −3 Bio/Ext/Align/libs/makefile
  10. +24 −0 Bio/Ext/Align/md_20.mat
  11. +74 −14 Bio/Ext/Align/test.pl
  12. +21 −0 Bio/Ext/Align/typemap
  13. +49 −0 Bio/Ext/HMM/HMM.pm
  14. +207 −0 Bio/Ext/HMM/HMM.xs
  15. +13 −0 Bio/Ext/HMM/Makefile.PL
  16. +36 −0 Bio/Ext/HMM/hmm.h
  17. +779 −0 Bio/Ext/HMM/hmmlib.c
  18. +102 −0 Bio/Ext/HMM/test.pl
  19. +10 −0 Bio/Ext/HMM/typemap
  20. +76 −0 Bio/SeqIO/staden/Makefile.PL
  21. +318 −0 Bio/SeqIO/staden/read.pm
  22. +145 −0 Bio/SeqIO/staden/test.pl
  23. +17 −0 Makefile.PL
  24. +82 −15 README
  25. +253 −0 t/Test.pm
  26. +14 −0 t/basic.t
  27. BIN  t/data/readtest.abi
  28. BIN  t/data/readtest.ctf
  29. +74 −0 t/data/readtest.exp
  30. +19 −0 t/data/readtest.pln
  31. BIN  t/data/readtest.ztr
  32. +14 −0 t/data/readtestabi.fa
  33. BIN  t/data/readtestref.scf
View
220 Bio/Ext/Align/Align.xs
@@ -10,7 +10,7 @@ extern "C" {
#endif
#include "sw.h"
-
+#include "dpalign.h"
static int
not_here(s)
@@ -3307,7 +3307,225 @@ Align_Proteins_SmithWaterman(one,two,comp,gap,ext)
OUTPUT:
RETVAL
+MODULE = Bio::Ext::Align PACKAGE = Bio::Ext::Align
+
+dpAlign_AlignOutput *
+Align_DNA_Sequences(seq1, seq2, match, mismatch, gap, ext, alg)
+ char * seq1
+ char * seq2
+ int match
+ int mismatch
+ int gap
+ int ext
+ int alg
+ CODE:
+ switch (alg) {
+ case 1:
+ RETVAL = dpAlign_Local_DNA_MillerMyers(seq1, seq2, match, mismatch, gap, ext);
+ break;
+ case 2:
+ RETVAL = dpAlign_Global_DNA_MillerMyers(seq1, seq2, match, mismatch, gap, ext);
+ break;
+ default:
+ RETVAL = dpAlign_Local_DNA_MillerMyers(seq1, seq2, match, mismatch, gap, ext);
+ break;
+ }
+ OUTPUT:
+ RETVAL
+
+dpAlign_AlignOutput *
+Align_Protein_Sequences(seq1, seq2, matrix, alg)
+ char * seq1
+ char * seq2
+ dpAlign_ScoringMatrix * matrix
+ int alg
+ CODE:
+ switch (alg) {
+ case 1:
+ RETVAL = dpAlign_Local_Protein_MillerMyers(seq1, seq2, matrix);
+ break;
+ case 2:
+ RETVAL = dpAlign_Global_Protein_MillerMyers(seq1, seq2, matrix);
+ break;
+ default:
+ RETVAL = dpAlign_Local_Protein_MillerMyers(seq1, seq2, matrix);
+ break;
+ }
+ OUTPUT:
+ RETVAL
+
+int
+Score_DNA_Sequences(sp, seq2)
+ dpAlign_SequenceProfile * sp
+ char * seq2
+ CODE:
+ RETVAL = dpAlign_Local_DNA_PhilGreen(sp, seq2);
+ ST(0) = sv_newmortal();
+ sv_setiv(ST(0), (IV)RETVAL);
+ XSRETURN(1);
+
+int
+Score_Protein_Sequences(sp, seq2)
+ dpAlign_SequenceProfile * sp
+ char * seq2
+ CODE:
+ RETVAL = dpAlign_Local_Protein_PhilGreen(sp, seq2);
+ ST(0) = sv_newmortal();
+ sv_setiv(ST(0), (IV)RETVAL);
+ XSRETURN(1);
+
+MODULE = Bio::Ext::Align PACKAGE = Bio::Ext::Align::ScoringMatrix
+
+dpAlign_ScoringMatrix *
+new(class, alphabet, gap, ext)
+ char * class
+ char * alphabet
+ int gap
+ int ext
+ PPCODE:
+ dpAlign_ScoringMatrix * out;
+ out = new_dpAlign_ScoringMatrix(alphabet, gap, ext);
+ ST(0) = sv_newmortal();
+ sv_setref_pv(ST(0), class, (void *) out);
+ XSRETURN(1);
+
+void
+set_entry(class, matrix, row, col, val)
+ char * class
+ dpAlign_ScoringMatrix * matrix
+ char * row
+ char * col
+ int val
+ PPCODE:
+ set_dpAlign_ScoringMatrix(matrix, row, col, val);
+ XSRETURN(1);
+
+void
+DESTROY(obj)
+ dpAlign_ScoringMatrix * obj
+ PPCODE:
+ int i;
+ for (i = 0; i < obj->sz; ++i)
+ free(obj->s[i]);
+ free(obj->s);
+
+MODULE = Bio::Ext::Align PACKAGE = Bio::Ext::Align::SequenceProfile
+
+dpAlign_SequenceProfile *
+dna_new(class, seq1, match, mismatch, gap, ext)
+ char * class
+ char * seq1
+ int match
+ int mismatch
+ int gap
+ int ext
+ PPCODE:
+ dpAlign_SequenceProfile * out;
+ out = dpAlign_DNA_Profile(seq1, match, mismatch, gap, ext);
+ ST(0) = sv_newmortal();
+ sv_setref_pv(ST(0), class, (void *) out);
+ XSRETURN(1);
+
+dpAlign_SequenceProfile *
+protein_new(class, seq1, matrix)
+ char * class
+ char * seq1
+ dpAlign_ScoringMatrix * matrix
+ PPCODE:
+ dpAlign_SequenceProfile * out;
+ out = dpAlign_Protein_Profile(seq1, matrix);
+ ST(0) = sv_newmortal();
+ sv_setref_pv(ST(0), class, (void *) out);
+ XSRETURN(1);
+
+char *
+alphabet(obj)
+ dpAlign_SequenceProfile * obj
+ CODE:
+ switch (obj->type) {
+ case 1:
+ RETVAL = "dna";
+ break;
+ case 2:
+ RETVAL = "protein";
+ break;
+ case 3:
+ RETVAL = "rna";
+ break;
+ default:
+ RETVAL = "unknown";
+ }
+ OUTPUT:
+ RETVAL
+
+void
+DESTROY(obj)
+ dpAlign_SequenceProfile * obj
+ CODE:
+ free(obj->waa);
+
+MODULE = Bio::Ext::Align PACKAGE = Bio::Ext::Align::AlignOutput
+char *
+aln1(obj)
+ dpAlign_AlignOutput * obj
+ CODE:
+ RETVAL = obj->aln1;
+ OUTPUT:
+ RETVAL
+char *
+aln2(obj)
+ dpAlign_AlignOutput * obj
+ CODE:
+ RETVAL = obj->aln2;
+ OUTPUT:
+ RETVAL
+int
+start1(obj)
+ dpAlign_AlignOutput * obj
+ CODE:
+ RETVAL = obj->start1;
+ OUTPUT:
+ RETVAL
+
+int
+end1(obj)
+ dpAlign_AlignOutput * obj
+ CODE:
+ RETVAL = obj->end1;
+ OUTPUT:
+ RETVAL
+
+int
+start2(obj)
+ dpAlign_AlignOutput * obj
+ CODE:
+ RETVAL = obj->start2;
+ OUTPUT:
+ RETVAL
+
+int
+end2(obj)
+ dpAlign_AlignOutput * obj
+ CODE:
+ RETVAL = obj->end2;
+ OUTPUT:
+ RETVAL
+
+int
+score(obj)
+ dpAlign_AlignOutput * obj
+ CODE:
+ RETVAL = obj->score;
+ OUTPUT:
+ RETVAL
+
+void
+DESTROY(obj)
+ dpAlign_AlignOutput * obj
+ CODE:
+ free(obj->aln1);
+ free(obj->aln2);
View
6 Bio/Ext/Align/Makefile.PL
@@ -3,7 +3,7 @@ use ExtUtils::MakeMaker;
# the contents of the Makefile that is written.
WriteMakefile(
'NAME' => 'Bio::Ext::Align',
- 'VERSION' => '0.1',
+ 'VERSION' => '1.5.1',
'LIBS' => ['-lm'], # e.g., '-lm'
'DEFINE' => '-DPOSIX -DNOERROR', # e.g., '-DHAVE_SOMETHING'
'INC' => '-I./libs', # e.g., '-I/usr/include/other'
@@ -14,8 +14,8 @@ WriteMakefile(
sub MY::postamble{
'
$(MYEXTLIB):
- DEFINE=\'$(DEFINE)\'; CC=\'$(PERLMAINCC)\'; export DEFINE INC CC; \
- cd libs && $(MAKE) CC=$(CC) libsw$(LIB_EXT) -e
+ DEFINE=\'$(DEFINE)\'; CC=\'$(PERLMAINCC)\'; CFLAGS=\'$(CCFLAGS)\'; export DEFINE INC CC CFLAGS; \
+ cd libs && $(MAKE) CC=\'$(PERLMAINCC)\' CFLAGS=\'$(CCFLAGS) $(DEFINE)\' DEFINE=\'$(DEFINE)\' libsw$(LIB_EXT) -e
';
}
View
30 Bio/Ext/Align/blosum45.mat
@@ -0,0 +1,30 @@
+# Matrix made by matblas from blosum45.iij
+# BLOSUM Clustered Scoring Matrix in 1/3 Bit Units
+# Blocks Database = /data/blocks_5.0/blocks.dat
+# Cluster Percentage: >= 45
+# Entropy = 0.3795, Expected = -0.2789
+ A R N D C Q E G H I L K M F P S T W Y V B Z X
+A 5 -2 -1 -2 -1 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -2 -2 0 -1 -1 0
+R -2 7 0 -1 -3 1 0 -2 0 -3 -2 3 -1 -2 -2 -1 -1 -2 -1 -2 -1 0 -1
+N -1 0 6 2 -2 0 0 0 1 -2 -3 0 -2 -2 -2 1 0 -4 -2 -3 4 0 -1
+D -2 -1 2 7 -3 0 2 -1 0 -4 -3 0 -3 -4 -1 0 -1 -4 -2 -3 5 1 -1
+C -1 -3 -2 -3 12 -3 -3 -3 -3 -3 -2 -3 -2 -2 -4 -1 -1 -5 -3 -1 -2 -3 -2
+Q -1 1 0 0 -3 6 2 -2 1 -2 -2 1 0 -4 -1 0 -1 -2 -1 -3 0 4 -1
+E -1 0 0 2 -3 2 6 -2 0 -3 -2 1 -2 -3 0 0 -1 -3 -2 -3 1 4 -1
+G 0 -2 0 -1 -3 -2 -2 7 -2 -4 -3 -2 -2 -3 -2 0 -2 -2 -3 -3 -1 -2 -1
+H -2 0 1 0 -3 1 0 -2 10 -3 -2 -1 0 -2 -2 -1 -2 -3 2 -3 0 0 -1
+I -1 -3 -2 -4 -3 -2 -3 -4 -3 5 2 -3 2 0 -2 -2 -1 -2 0 3 -3 -3 -1
+L -1 -2 -3 -3 -2 -2 -2 -3 -2 2 5 -3 2 1 -3 -3 -1 -2 0 1 -3 -2 -1
+K -1 3 0 0 -3 1 1 -2 -1 -3 -3 5 -1 -3 -1 -1 -1 -2 -1 -2 0 1 -1
+M -1 -1 -2 -3 -2 0 -2 -2 0 2 2 -1 6 0 -2 -2 -1 -2 0 1 -2 -1 -1
+F -2 -2 -2 -4 -2 -4 -3 -3 -2 0 1 -3 0 8 -3 -2 -1 1 3 0 -3 -3 -1
+P -1 -2 -2 -1 -4 -1 0 -2 -2 -2 -3 -1 -2 -3 9 -1 -1 -3 -3 -3 -2 -1 -1
+S 1 -1 1 0 -1 0 0 0 -1 -2 -3 -1 -2 -2 -1 4 2 -4 -2 -1 0 0 0
+T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -1 -1 2 5 -3 -1 0 0 -1 0
+W -2 -2 -4 -4 -5 -2 -3 -2 -3 -2 -2 -2 -2 1 -3 -4 -3 15 3 -3 -4 -2 -2
+Y -2 -1 -2 -2 -3 -1 -2 -3 2 0 0 -1 0 3 -3 -2 -1 3 8 -1 -2 -2 -1
+V 0 -2 -3 -3 -1 -3 -3 -3 -3 3 1 -2 1 0 -3 -1 0 -3 -1 5 -3 -3 -1
+B -1 -1 4 5 -2 0 1 -1 0 -3 -3 0 -2 -3 -2 0 0 -4 -2 -3 4 2 -1
+Z -1 0 0 1 -3 4 4 -2 0 -3 -2 1 -1 -3 -1 0 -1 -2 -2 -3 2 4 -1
+X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 0 0 -2 -1 -1 -1 -1 -1
+
View
29 Bio/Ext/Align/blosum50.mat
@@ -0,0 +1,29 @@
+# Matrix made by matblas from blosum50.iij
+# BLOSUM Clustered Scoring Matrix in 1/3 Bit Units
+# Blocks Database = /data/blocks_5.0/blocks.dat
+# Cluster Percentage: >= 50
+# Entropy = 0.4808, Expected = -0.3573
+ A R N D C Q E G H I L K M F P S T W Y V B Z X
+A 5 -2 -1 -2 -1 -1 -1 0 -2 -1 -2 -1 -1 -3 -1 1 0 -3 -2 0 -2 -1 -1
+R -2 7 -1 -2 -4 1 0 -3 0 -4 -3 3 -2 -3 -3 -1 -1 -3 -1 -3 -1 0 -1
+N -1 -1 7 2 -2 0 0 0 1 -3 -4 0 -2 -4 -2 1 0 -4 -2 -3 4 0 -1
+D -2 -2 2 8 -4 0 2 -1 -1 -4 -4 -1 -4 -5 -1 0 -1 -5 -3 -4 5 1 -1
+C -1 -4 -2 -4 13 -3 -3 -3 -3 -2 -2 -3 -2 -2 -4 -1 -1 -5 -3 -1 -3 -3 -2
+Q -1 1 0 0 -3 7 2 -2 1 -3 -2 2 0 -4 -1 0 -1 -1 -1 -3 0 4 -1
+E -1 0 0 2 -3 2 6 -3 0 -4 -3 1 -2 -3 -1 -1 -1 -3 -2 -3 1 5 -1
+G 0 -3 0 -1 -3 -2 -3 8 -2 -4 -4 -2 -3 -4 -2 0 -2 -3 -3 -4 -1 -2 -2
+H -2 0 1 -1 -3 1 0 -2 10 -4 -3 0 -1 -1 -2 -1 -2 -3 2 -4 0 0 -1
+I -1 -4 -3 -4 -2 -3 -4 -4 -4 5 2 -3 2 0 -3 -3 -1 -3 -1 4 -4 -3 -1
+L -2 -3 -4 -4 -2 -2 -3 -4 -3 2 5 -3 3 1 -4 -3 -1 -2 -1 1 -4 -3 -1
+K -1 3 0 -1 -3 2 1 -2 0 -3 -3 6 -2 -4 -1 0 -1 -3 -2 -3 0 1 -1
+M -1 -2 -2 -4 -2 0 -2 -3 -1 2 3 -2 7 0 -3 -2 -1 -1 0 1 -3 -1 -1
+F -3 -3 -4 -5 -2 -4 -3 -4 -1 0 1 -4 0 8 -4 -3 -2 1 4 -1 -4 -4 -2
+P -1 -3 -2 -1 -4 -1 -1 -2 -2 -3 -4 -1 -3 -4 10 -1 -1 -4 -3 -3 -2 -1 -2
+S 1 -1 1 0 -1 0 -1 0 -1 -3 -3 0 -2 -3 -1 5 2 -4 -2 -2 0 0 -1
+T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 2 5 -3 -2 0 0 -1 0
+W -3 -3 -4 -5 -5 -1 -3 -3 -3 -3 -2 -3 -1 1 -4 -4 -3 15 2 -3 -5 -2 -3
+Y -2 -1 -2 -3 -3 -1 -2 -3 2 -1 -1 -2 0 4 -3 -2 -2 2 8 -1 -3 -2 -1
+V 0 -3 -3 -4 -1 -3 -3 -4 -4 4 1 -3 1 -1 -3 -2 0 -3 -1 5 -4 -3 -1
+B -2 -1 4 5 -3 0 1 -1 0 -4 -4 0 -3 -4 -2 0 0 -5 -3 -4 5 2 -1
+Z -1 0 0 1 -3 4 5 -2 0 -3 -3 1 -1 -4 -1 0 -1 -2 -2 -3 2 5 -1
+X -1 -1 -1 -1 -2 -1 -1 -2 -1 -1 -1 -1 -1 -2 -2 -1 0 -3 -1 -1 -1 -1 -1
View
30 Bio/Ext/Align/blosum62.mat
@@ -0,0 +1,30 @@
+# Matrix made by matblas from blosum62.iij
+# BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
+# Blocks Database = /data/blocks_5.0/blocks.dat
+# Cluster Percentage: >= 62
+# Entropy = 0.6979, Expected = -0.5209
+ A R N D C Q E G H I L K M F P S T W Y V B Z X
+A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0
+R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1
+N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1
+D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1
+C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2
+Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1
+E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1
+G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1
+H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1
+I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1
+L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1
+K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1
+M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1
+F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1
+P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2
+S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0
+T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0
+W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2
+Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1
+V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1
+B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1
+Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1
+X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1
+
View
298 Bio/Ext/Align/libs/dpalign.c
@@ -0,0 +1,298 @@
+#include "dpalign.h"
+
+int hoxd[4][4] = {
+{ 91,-114, -31,-123},
+{-114, 100,-125, -31},
+{ -31,-125, 100,-114},
+{-123, -31,-114, 91}
+};
+
+int blosum62[24][24] = {
+{ 4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0, -2, -1, 0, -4},
+{-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3, -1, 0, -1, -4},
+{-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3, 3, 0, -1, -4},
+{-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3, 4, 1, -1, -4},
+{ 0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1, -3, -3, -2, -4},
+{-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2, 0, 3, -1, -4},
+{-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4},
+{ 0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3, -1, -2, -1, -4},
+{-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3, 0, 0, -1, -4},
+{-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3, -3, -3, -1, -4},
+{-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1, -4, -3, -1, -4},
+{-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2, 0, 1, -1, -4},
+{-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1, -3, -1, -1, -4},
+{-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1, 3, -1, -3, -3, -1, -4},
+{-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4, -3, -2, -2, -1, -2, -4},
+{ 1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2, 0, 0, 0, -4},
+{ 0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 0, -1, -1, 0, -4},
+{-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, 2, -3, -4, -3, -2, -4},
+{-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7, -1, -3, -2, -1, -4},
+{ 0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4, -3, -2, -1, -4},
+{-2, -1, 3, 4, -3, 0, 1, -1, 0, -3, -4, 0, -3, -3, -2, 0, -1, -4, -3, -3, 4, 1, -1, -4},
+{-1, 0, 0, 1, -3, 3, 4, -2, 0, -3, -3, 1, -1, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4},
+{ 0, -1, -1, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, 0, 0, -2, -1, -1, -1, -1, -1, -4},
+{-4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, 1}
+};
+
+/*
+ dpAlign_fatal prints the string s to STDERR and then exit
+ */
+void
+dpAlign_fatal(char * s)
+{
+ fprintf(stderr, s);
+ exit(-1);
+}
+
+/*
+ align is an implementation of Miller-Myers' dynamic programming alignment
+ algorithm using the gap array to represent the alignment result.
+ There are two gap arrays, one for each sequence. Each of them consists of
+ M+1 and N+1 cells where M is the length of sequence A and N is the length
+ of sequence B. In general, spc1[i] will contain an integer that indicates
+ how many gaps need to be inserted between A[i-1] and A[i]. For example,
+ spc1[1] will represent the number of gaps inserted between residues A[0]
+ and A[1]. s is the scoring matrix. g is the gap opening cost. e is the
+ gap extension cost. F is the forward Gotoh array. R is the backward
+ Gotoh array. Note that spc1 and spc2 gap arrays should be initilized to
+ all zeros when you first call this function.
+ align returns the score of the resulting alignment. At the end, the gap
+ arrays spc1 and spc2 will also be set to the proper values.
+ */
+int
+align(unsigned char * A, unsigned char * B, int M, int N, int ** s, int g, int e, struct swstr * F, struct swstr * R, int * spc1, int * spc2)
+{
+ int midi, midj, type;
+ int midc;
+ register struct swstr * swp;
+ register int i, j;
+ register int from, P, t;
+ int c; /* score of a cell */
+ int d; /* down value in Q array */
+ int * ss;
+ int h = g + e;
+
+/* if the mstrix to divide and conquer has size <= 1 */
+ if (N <= 0) {
+ if (M > 0) *spc2 += M;
+ return -gap(M, g, e);
+ }
+
+ if (M <= 1) {
+ if (M <= 0) {
+ *spc1 += N;
+ return -gap(N, g, e);
+ }
+ midc = -h - gap(N, g, e);
+ midj = 0;
+ ss = s[A[0]];
+ for (j = 1; j <= N; ++j) {
+ c = -gap(j-1, g, e) + ss[B[j-1]] - gap(N-j, g, e);
+ if (c > midc) {
+ midc = c;
+ midj = j;
+ }
+ }
+ if (midj == 0) {
+ *(spc1+1) += N;
+ ++(*spc2);
+ }
+ else {
+ if (midj > 1) *spc1 += (midj-1);
+ if (midj < N) *(spc1+1) += (N-midj);
+ }
+ return midc;
+ }
+
+ swp = F;
+ F[0].H = 0;
+ t = -g;
+ for (swp = F+1; swp <= F+N; ++swp)
+ swp->E = swp->H = t = t - e;
+
+/* Calculate forward matrix cost */
+
+ t = -g;
+ midi = M/2;
+ for (i = 0; i < midi; ++i) {
+ from = F[0].H;
+ F[0].H = c = P = t = t - e;
+ ss = s[A[i]];
+ for (swp = F+1, j = 0; j < N; ++swp, ++j) {
+ if ((c = c - h) > (P = P - e)) P = c;
+ if ((c = swp->H - h) > (d = swp->E - e)) d = c;
+ c = from + ss[B[j]];
+ if (P > c) c = P;
+ if (d > c) c = d;
+ swp->E = d;
+ from = swp->H;
+ swp->H = c;
+ }
+ }
+ F[0].E = F[0].H;
+
+ R[0].H = 0;
+ t = -g;
+ for (swp = R+1; swp <= R+N; ++swp)
+ swp->E = swp->H = t = t - e;
+
+/* Calculate backward matrix cost */
+
+ t = -g;
+ for (i = M-1; i >=midi; --i) {
+ from = R[0].H;
+ R[0].H = P = c = t = t - e;
+ ss = s[A[i]];
+ for (swp = R+1, j = N-1; j >= 0; ++swp, --j) {
+ if ((c = c - h) > (P = P - e)) P = c;
+ if ((c = swp->H - h) > (d = swp->E - e)) d = c;
+ c = from + ss[B[j]];
+ if (P > c) c = P;
+ if (d > c) c = d;
+ swp->E = d;
+ from = swp->H;
+ swp->H = c;
+ }
+ }
+ R[0].E = R[0].H;
+
+ midc = F[N].H + R[0].H;
+ midj = N;
+ type = 1;
+/* see if it is type I or type II */
+
+ for (j = N-1; j >= 0; --j) {
+ c = F[j].H + R[N-j].H;
+ if (c > midc) {
+ midc = c;
+ midj = j;
+ }
+ }
+
+ for (j = N; j >= 0; --j) {
+ c = F[j].E + R[N-j].E + g;
+ if (c > midc) {
+ midc = c;
+ midj = j;
+ type = 2;
+ }
+ }
+
+ if (type == 1) {
+ align(A, B, midi, midj, s, g, e, F, R, spc1, spc2);
+ align(A+midi, B+midj, M-midi, N-midj, s, g, e, F, R, spc1+midi, spc2+midj);
+ }
+ else {
+ align(A, B, midi-1, midj, s, g, e, F, R, spc1, spc2);
+ *(spc2+midj) += 2;
+ align(A+midi+1, B+midj, M-midi-1, N-midj, s, g, e, F, R, spc1+midi+1, spc2+midj);
+ }
+ return midc;
+}
+
+/*
+ pgreen runs Phil Green's algorithm to calculate alignment score between
+ two sequences. The argument waa is a sequence profile derived from
+ the query sequence and the scoring matrix. The second argument M is
+ the length of query sequence. B is the sequence to compare to. N is
+ the length of the second sequence. gap is the gap opening cost. ext
+ is the gap extension cost. ss is a preallocated array of struct swstr
+ with a size of M.
+
+ Most of the code in this function is copied from the do_work function
+ in dropgsw.c inside the FASTA distribution.
+ */
+int
+pgreen(int * waa, int M, unsigned char * B, int N, int gap, int ext, struct swstr * ss)
+{
+ int score;
+ int e, f, h, p, i, temp;
+ struct swstr * ssj;
+
+ int * pwaa;
+
+ gap = gap + ext;
+ ext = -ext;
+ ss[M].H = -1;
+ ss[M].E = 1;
+ score = 0;
+ for (h = 0; h < M; ++h) {
+ ss[h].H = ss[h].E = 0;
+ }
+
+ for (i = 0; i < N; ++i) {
+ pwaa = waa + B[i]*M;
+ ssj = ss;
+
+ e = f = h = p = 0;
+ zero_f:
+ while (1) {
+ h = p + *pwaa++;
+ p = ssj->H;
+ if ((e = ssj->E) > 0) {
+ if (p == -1) goto next_row;
+ if (h < e) h = e;
+ else if (h > gap) {
+ e += ext;
+ goto transition;
+ }
+ e += ext;
+ ssj->E = (e > 0) ? e : 0;
+ ssj++->H = h;
+ }
+ else {
+ if (h > 0) {
+ if (h > gap) {
+ e = 0;
+ goto transition;
+ }
+ ssj++->H = h;
+ }
+ else ssj++->H = 0;
+ }
+ }
+
+ transition:
+ if (score < h) score = h;
+
+ temp = h - gap;
+ if (f < temp) f = temp;
+ if (e < temp) e = temp;
+ ssj->E = (e > 0) ? e : 0;
+ ssj++->H = h;
+ e = 0;
+
+ do {
+ h = p + *pwaa++;
+ p = ssj->H;
+
+ if (h < f) h = f;
+ f += ext;
+
+ if ((e = ssj->E) > 0) {
+ if (p == -1) goto next_row;
+ if (h < e) h = e;
+ else if (h > gap) {
+ e += ext;
+ goto transition;
+ }
+ e += ext;
+ ssj->E = (e > 0) ? e : 0;
+ ssj++->H = h;
+ }
+ else {
+ if (h > gap) {
+ e = 0;
+ goto transition;
+ }
+ ssj++->H = h;
+ }
+ } while (f > 0);
+ goto zero_f;
+ next_row:
+ ;
+ }
+
+ free(waa);
+ return score;
+}
View
66 Bio/Ext/Align/libs/dpalign.h
@@ -0,0 +1,66 @@
+#ifndef _DPALIGN_H_
+#define _DPALIGN_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+/* global variables */
+extern int hoxd[4][4];
+extern int blosum62[24][24];
+
+struct swstr {
+ int H; /* score for CC/RR matrix */
+ int E; /* score for DD/SS matrix */
+};
+
+#define gap(k, g, h) ((k) <= 0 ? 0 : (g)+(h)*(k)) /* gap cost */
+/* a naive macro to encode DNA nucleotide */
+#define dna_encode(c) (c == 'A' ? 0x00 : c == 'C' ? 0x01 : c == 'G' ? 0x02 : c == 'T' ? 0x03 : c == 'U' ? 0x04 : c == 'R' ? 0x05 : c == 'Y' ? 0x06 : c == 'M' ? 0x07 : c == 'W' ? 0x08 : c == 'S' ? 0x09 : c == 'K' ? 0x0a : c == 'D' ? 0x0b : c == 'H' ? 0x0c : c == 'V' ? 0x0d : c == 'B' ? 0x0e : c == 'N' ? 0x0f : c == 'X' ? 0x10 : -1)
+/* a naive macro to encode amino acids */
+#define prot_encode(c) (c == 'A' ? 0x00 : c == 'R' ? 0x01 : c == 'N' ? 0x02 : c == 'D' ? 0x03 : c == 'C' ? 0x04 : c == 'Q' ? 0x05 : c == 'E' ? 0x06 : c == 'G' ? 0x07 : c == 'H' ? 0x08 : c == 'I' ? 0x09 : c == 'L' ? 0x0a : c == 'K' ? 0x0b : c == 'M' ? 0x0c : c == 'F' ? 0x0d : c == 'P' ? 0x0e : c == 'S' ? 0x0f : c == 'T' ? 0x10 : c == 'W' ? 0x11 : c == 'Y' ? 0x12 : c == 'V' ? 0x13 : c == 'B' ? 0x14 : c == 'Z' ? 0x15 : c == 'X' ? 0x16 : c == '*' ? 0x17 : -1)
+
+typedef struct _dpAlign_alnoutput {
+ char * aln1; /* aligned subsequence of sequence 1 with space '-' inserted */
+ int start1; /* start point of aligned subsequence 1 */
+ int end1; /* end point of aligned subsequence 1 */
+ char * aln2; /* aligned subsequence of sequence 2 with space '-' inserted */
+ int start2; /* start point of aligned subsequence 2 */
+ int end2; /* end point of aligned subsequence 2 */
+ int score; /* score of this alignment */
+} dpAlign_AlignOutput;
+
+typedef struct _dpAlign_SequenceProfile {
+ int * waa; /* sequence profile */
+ int gap;
+ int ext;
+ int len; /* length of sequence */
+ int type; /* 1 is DNA, 2 is Protein, 3 is RNA */
+ int a[256]; /* alphabet array that maps a character in the alphabet to an integer index that indexes the columns and rows in the scoring matrix */
+} dpAlign_SequenceProfile;
+
+typedef struct _dpAlign_ScoringMatrix {
+ int ** s; /* scoring matrix pointer */
+ int a[256]; /* alphabet array that maps a character in the alphabet to an integer index that indexes the columns and rows in the scoring matrix */
+ int gap; /* gap opening penalty for this matrix */
+ int ext; /* gap extension penalty for this matrix */
+ int sz; /* size of alphabet */
+} dpAlign_ScoringMatrix;
+
+dpAlign_AlignOutput * dpAlign_Local_DNA_MillerMyers(char *, char *, int, int, int, int);
+dpAlign_AlignOutput * dpAlign_Global_DNA_MillerMyers(char *, char *, int, int, int, int);
+dpAlign_AlignOutput * dpAlign_Local_Protein_MillerMyers(char *, char *, dpAlign_ScoringMatrix *);
+dpAlign_AlignOutput * dpAlign_Global_Protein_MillerMyers(char *, char *, dpAlign_ScoringMatrix *);
+dpAlign_SequenceProfile * dpAlign_Protein_Profile(char *, dpAlign_ScoringMatrix *);
+int dpAlign_Local_Protein_PhilGreen(dpAlign_SequenceProfile *, char *);
+dpAlign_SequenceProfile * dpAlign_DNA_Profile(char *, int, int, int, int);
+int dpAlign_Local_DNA_PhilGreen(dpAlign_SequenceProfile *, char *);
+dpAlign_ScoringMatrix * new_dpAlign_ScoringMatrix(char *, int, int);
+void set_dpAlign_ScoringMatrix(dpAlign_ScoringMatrix *, char *, char *, int);
+dpAlign_AlignOutput * dpAlign_Local_DNA_Green(char *, char *, int, int, int, int);
+void dpAlign_fatal(char *);
+int align(unsigned char *, unsigned char *, int, int, int **, int, int, struct swstr *, struct swstr *, int *, int *);
+int pgreen(int *, int, unsigned char *, int, int, int, struct swstr *);
+#endif
View
779 Bio/Ext/Align/libs/linspc.c
@@ -0,0 +1,779 @@
+#include "dpalign.h"
+#include <sys/time.h>
+
+/* $Id$ */
+
+typedef struct sw_alignstruct {
+ char * seq1; /* NULL-terminated sequence 1 string */
+ int len1; /* length of sequence 1 */
+ char * seq2; /* NULL-terminated sequence 2 string */
+ int len2; /* length of sequence 2 */
+ unsigned char * s1; /* encoded sequence 1 */
+ unsigned char * s2; /* encoded sequence 2 */
+ int ** s; /* DNA/Protein scoring matrix */
+ int gap; /* gap opening penalty. default is 3 */
+ int ext; /* gap extension penalty, default is 1 for each gap */
+ int score; /* total score of the result alignment */
+ int start1; /* start point of aligned subsequence in sequence 1 */
+ int start2; /* start point of aligned subsequence in sequence 2 */
+ int end1; /* end point of aligned subsequence in sequence 1 */
+ int end2; /* end point of aligned subsequence in sequence 2 */
+ struct swstr * FF; /* forward Gotoh arrays */
+ struct swstr * RR; /* reverse Gotoh arrays */
+ int * spc1; /* gap array for sequence 1 */
+ int * spc2; /* gap array for sequence 2 */
+} sw_AlignStruct;
+
+/* static functions */
+static sw_AlignStruct * init_AlignStruct(char *, char *, int **, int, int);
+static void find_ends(sw_AlignStruct *);
+static dpAlign_AlignOutput * traceback(sw_AlignStruct *);
+
+/*
+ dpAlign_Local_DNA_MillerMyers implements the Miller-Myers algorithm
+ to align DNA sequences as defined in their 1988 paper. It takes two
+ character string sequences seq1 and seq2, together with the scoring
+ parameters for match, mismatch, gap opening and gap extension as
+ arguments. At the end, it returns the dpAlign_AlignOutput data
+ structure which can be translated into a Bio::SimpleAlign object
+ pretty easily.
+ */
+dpAlign_AlignOutput *
+dpAlign_Local_DNA_MillerMyers(char * seq1, char * seq2, int match, int mismatch, int gap, int ext)
+{
+ sw_AlignStruct * as;
+ int ** s;
+ int i, j;
+
+ if (seq1 == NULL)
+ dpAlign_fatal("Sequence 1 is a NULL pointer!\n");
+ if (seq2 == NULL)
+ dpAlign_fatal("Sequence 2 is a NULL pointer!\n");
+
+/* initialize DNA scoring matrix */
+ s = (int **) malloc(17*sizeof(int *));
+ if (s == NULL)
+ dpAlign_fatal("Cannot allocate memory for scoring matrix row!\n");
+ for (i = 0; i < 17; ++i) {
+ s[i] = (int *) malloc(17*sizeof(int));
+ if (s[i] == NULL)
+ dpAlign_fatal("Cannot allocate memory for scoring matrix col!\n");
+ for (j = 0; j < 17; ++j) {
+ if (i == 16 || j == 16) s[i][j] = mismatch; /* X mismatches all */
+ else if (i == j) s[i][j] = match;
+ else s[i][j] = mismatch;
+ }
+ }
+
+/* initialize the alignment data structure */
+ as = init_AlignStruct(seq1, seq2, s, gap, ext);
+
+/* uppercase the sequence and then encode it */
+ for (i = 0; i < as->len1; ++i) {
+ if (as->seq1[i] >= 'a' && as->seq1[i] <= 'z') as->seq1[i] -= 0x20;
+ as->s1[i] = dna_encode(as->seq1[i]);
+ }
+ for (i = 0; i < as->len2; ++i) {
+ if (as->seq2[i] >= 'a' && as->seq2[i] <= 'z') as->seq2[i] -= 0x20;
+ as->s2[i] = dna_encode(as->seq2[i]);
+ }
+
+/* locate the end points of the subsequences that gives you the maximal
+ score */
+ find_ends(as);
+
+ if (as->score <= 0)
+ return NULL;
+
+/* align the subsequences bounded by the end points */
+ as->score = align(as->s1 + as->start1 - 1, as->s2 + as->start2 - 1, as->end1 - as->start1 + 1, as->end2 - as->start2 + 1, as->s, as->gap, as->ext, as->FF, as->RR, as->spc1, as->spc2);
+/*
+ for (i = 0; i < 17; ++i) {
+ free(as->s[i]);
+ }
+ free(as->s);
+*/
+ return traceback(as);
+}
+
+/*
+ dpAlign_Global_DNA_MillerMyers implements the Miller-Myers algorithm
+ to align DNA sequences as defined in their 1988 paper. It takes two
+ character string sequences seq1 and seq2, together with the scoring
+ parameters for match, mismatch, gap opening and gap extension as
+ arguments. At the end, it returns the dpAlign_AlignOutput data
+ structure which can be translated into a Bio::SimpleAlign object
+ pretty easily.
+ */
+dpAlign_AlignOutput *
+dpAlign_Global_DNA_MillerMyers(char * seq1, char * seq2, int match, int mismatch, int gap, int ext)
+{
+ sw_AlignStruct * as;
+ int ** s;
+ int i, j;
+
+ if (seq1 == NULL)
+ dpAlign_fatal("Sequence 1 is a NULL pointer!\n");
+ if (seq2 == NULL)
+ dpAlign_fatal("Sequence 2 is a NULL pointer!\n");
+
+/* initialize DNA scoring matrix */
+ s = (int **) malloc(17*sizeof(int *));
+ if (s == NULL)
+ dpAlign_fatal("Cannot allocate memory for scoring matrix row!\n");
+ for (i = 0; i < 17; ++i) {
+ s[i] = (int *) malloc(17*sizeof(int));
+ if (s[i] == NULL)
+ dpAlign_fatal("Cannot allocate memory for scoring matrix col!\n");
+ for (j = 0; j < 17; ++j) {
+ if (i == 16 || j == 16) s[i][j] = mismatch; /* X mismatches all */
+ else if (i == j) s[i][j] = match;
+ else s[i][j] = mismatch;
+ }
+ }
+
+/* initialize the alignment data structure */
+ as = init_AlignStruct(seq1, seq2, s, gap, ext);
+
+/* uppercase the sequence and then encode it */
+ for (i = 0; i < as->len1; ++i) {
+ if (as->seq1[i] >= 'a' && as->seq1[i] <= 'z') as->seq1[i] -= 0x20;
+ as->s1[i] = dna_encode(as->seq1[i]);
+ }
+ for (i = 0; i < as->len2; ++i) {
+ if (as->seq2[i] >= 'a' && as->seq2[i] <= 'z') as->seq2[i] -= 0x20;
+ as->s2[i] = dna_encode(as->seq2[i]);
+ }
+
+/* initialize the spaces arrays */
+ as->spc1 = (int *) calloc(as->len1 + 1, sizeof(int));
+ if (as->spc1 == NULL)
+ dpAlign_fatal("Can't allocate memory for spaces array for seq 1!\n");
+ as->spc2 = (int *) calloc(as->len2 + 1, sizeof(int));
+ if (as->spc2 == NULL)
+ dpAlign_fatal("Can't allocate memory for spaces array for seq 2!\n");
+/* align the subsequences bounded by the end points */
+ as->score = align(as->s1, as->s2, as->len1, as->len2, as->s, as->gap, as->ext, as->FF, as->RR, as->spc1, as->spc2);
+/*
+ for (i = 0; i < 17; ++i) {
+ free(as->s[i]);
+ }
+ free(as->s);
+*/
+/* set start and end for global alignment */
+ as->start1 = 1;
+ as->end1 = as->len1;
+ as->start2 = 1;
+ as->end2 = as->len2;
+ return traceback(as);
+}
+
+/*
+ new_dpAlign_ScoringMatrix instantiate a dpAlign_ScoringMatrix object
+ with a alphabet string, gap opening penalty and gap extension penalty.
+ The dpAlign_ScoringMatrix object created will have alphabet mapping
+ array, gap penalities initialized. Memory will be allocated for the
+ scoring matrix s and initialized to zeros.
+ */
+dpAlign_ScoringMatrix *
+new_dpAlign_ScoringMatrix(char * alphabet, int gap, int ext)
+{
+ int i;
+ dpAlign_ScoringMatrix * matrix;
+
+ matrix = (dpAlign_ScoringMatrix *) malloc(sizeof(dpAlign_ScoringMatrix));
+ if (matrix == NULL)
+ dpAlign_fatal("Can't allocate memory for dpAlign_ScoringMatrix!\n");
+ matrix->sz = strlen(alphabet);
+ matrix->s = (int **) malloc(matrix->sz*sizeof(int *));
+ if (matrix->s == NULL)
+ dpAlign_fatal("Can't allocate memory for dpAlign_ScoringMatrix's s!\n");
+ for (i = 0; i < matrix->sz; ++i) {
+ matrix->s[i] = (int *) calloc(matrix->sz, sizeof(int));
+ if (matrix->s[i] == NULL)
+ dpAlign_fatal("Can't allocate memory for dpAlign_ScoringMatrix's s!\n");
+ }
+ matrix->gap = gap;
+ matrix->ext = ext;
+ for (i = 0; i < matrix->sz; ++i)
+ matrix->a[alphabet[i]] = i;
+ return matrix;
+}
+
+/*
+ set_dpAlign_ScoringMatrix initilizes the scoring matrix s of a
+ dpAlign_ScoringMatrix object. It only intializes one cell. Therefore
+ to use this function, you need to wrap it with two loops over the
+ alphabet set to initialize all the fields.
+ */
+void
+set_dpAlign_ScoringMatrix(dpAlign_ScoringMatrix * matrix, char * row, char * col, int val)
+{
+ matrix->s[matrix->a[row[0]]][matrix->a[col[0]]] = val;
+}
+
+/*
+ dpAlign_Protein_Profile creates a sequence profile from a protein
+ sequence.
+ */
+dpAlign_SequenceProfile *
+dpAlign_Protein_Profile(char * seq1, dpAlign_ScoringMatrix * matrix)
+{
+ int i, j;
+ unsigned char * s1;
+ int gap = 7;
+ int ext = 1;
+ int sz = 24;
+ int * smp; /* scoring matrix pointer */
+ int * pwaa;
+ dpAlign_SequenceProfile * sp;
+
+ if (seq1 == NULL)
+ dpAlign_fatal("Sequence 1 is a NULL pointer!\n");
+ sp = (dpAlign_SequenceProfile *) malloc(sizeof(dpAlign_SequenceProfile));
+ if (sp == NULL)
+ dpAlign_fatal("Can't allocate memory for Sequence Profile!\n");
+ sp->len = strlen(seq1);
+ s1 = (unsigned char *) malloc(sp->len*sizeof(unsigned char));
+ if (s1 == NULL)
+ dpAlign_fatal("Cannot allocate memory for encoded sequence 1!\n");
+ if (matrix == NULL) {
+ sp->a['A'] = 0x00; sp->a['R'] = 0x01; sp->a['N'] = 0x02; sp->a['D'] = 0x03;
+ sp->a['C'] = 0x04; sp->a['Q'] = 0x05; sp->a['E'] = 0x06; sp->a['G'] = 0x07;
+ sp->a['H'] = 0x08; sp->a['I'] = 0x09; sp->a['L'] = 0x0a; sp->a['K'] = 0x0b;
+ sp->a['M'] = 0x0c; sp->a['F'] = 0x0d; sp->a['P'] = 0x0e; sp->a['S'] = 0x0f;
+ sp->a['T'] = 0x10; sp->a['W'] = 0x11; sp->a['Y'] = 0x12; sp->a['V'] = 0x13;
+ sp->a['B'] = 0x14; sp->a['Z'] = 0x15; sp->a['X'] = 0x16; sp->a['*'] = 0x17;
+ }
+ else {
+ gap = matrix->gap;
+ ext = matrix->ext;
+ sz = matrix->sz;
+ for (i = 0; i < 256; ++i)
+ sp->a[i] = matrix->a[i];
+ }
+ for (i = 0; i < sp->len; ++i) {
+ if (seq1[i] >= 'a' && seq1[i] <= 'z') seq1[i] -= 0x20;
+ s1[i] = sp->a[seq1[i]];
+ }
+
+ sp->waa = (int *) malloc(sizeof(int)*sz*sp->len);
+ if (sp->waa == NULL)
+ dpAlign_fatal("Can't allocate memory for waa!\n");
+ pwaa = sp->waa;
+ if (matrix == NULL) {
+ for (i = 0; i < sz; ++i) {
+ smp = blosum62[i];
+ for (j = 0; j < sp->len; ++j)
+ *pwaa++ = smp[s1[j]];
+ }
+ }
+ else {
+ for (i = 0; i < sz; ++i) {
+ smp = matrix->s[i];
+ for (j = 0; j < sp->len; ++j) {
+//printf("hi1\t%d\t%d\t%d\t%d\n", i, j, s1[j], smp[s1[j]]);
+ *pwaa++ = smp[s1[j]];
+}
+ }
+ }
+ sp->gap = gap;
+ sp->ext = ext;
+ sp->type = 2;
+ free(s1);
+ return sp;
+}
+
+/*
+ dpAlign_Local_Protein_PhilGreen compares a protein sequence profile
+ with a protein sequence and return the optimal local alignment score.
+ */
+int
+dpAlign_Local_Protein_PhilGreen(dpAlign_SequenceProfile * sp, char * seq2)
+{
+ int i;
+ int N;
+ struct swstr * ss;
+ int score;
+ unsigned char * s2;
+
+ if (seq2 == NULL)
+ dpAlign_fatal("Sequence 2 is a NULL pointer!\n");
+
+ N = strlen(seq2);
+
+ s2 = (unsigned char *) malloc(N*sizeof(unsigned char));
+ if (s2 == NULL)
+ dpAlign_fatal("Cannot allocate memory for encoded sequence 2!\n");
+
+ for (i = 0; i < N; ++i) {
+ if (seq2[i] >= 'a' && seq2[i] <= 'z') seq2[i] -= 0x20;
+ s2[i] = sp->a[seq2[i]];
+ }
+
+ ss = (struct swstr *) malloc(sizeof(struct swstr)*(sp->len+1));
+ if (ss == NULL)
+ dpAlign_fatal("Cannot allocate memory for ss array!\n");
+ score = pgreen(sp->waa, sp->len, s2, N, sp->gap, sp->ext, ss);
+ free(ss);
+ free(s2);
+ return score;
+}
+
+/*
+ dpAlign_Protein_Profile creates a sequence profile from a DNA sequence.
+ */
+dpAlign_SequenceProfile *
+dpAlign_DNA_Profile(char * seq1, int match, int mismatch, int gap, int ext)
+{
+ int i, j;
+ unsigned char * s1;
+ int * pwaa;
+ dpAlign_SequenceProfile * sp;
+
+ if (seq1 == NULL)
+ dpAlign_fatal("Sequence 1 is a NULL pointer!\n");
+ sp = (dpAlign_SequenceProfile *) malloc(sizeof(dpAlign_SequenceProfile));
+ if (sp == NULL)
+ dpAlign_fatal("Can't allocate memory for Sequence Profile!\n");
+ sp->len = strlen(seq1);
+ s1 = (unsigned char *) malloc(sp->len*sizeof(unsigned char));
+ if (s1 == NULL)
+ dpAlign_fatal("Cannot allocate memory for encoded sequence 1!\n");
+ for (i = 0; i < sp->len; ++i) {
+ if (seq1[i] >= 'a' && seq1[i] <= 'z') seq1[i] -= 0x20;
+ s1[i] = dna_encode(seq1[i]);
+ }
+ sp->waa = (int *) malloc(sizeof(int)*24*sp->len);
+ if (sp->waa == NULL)
+ dpAlign_fatal("Can't allocate memory for waa!\n");
+ pwaa = sp->waa;
+ for (i = 0; i < 17; ++i)
+ for (j = 0; j < sp->len; ++j)
+ *pwaa++ = s1[j] == (unsigned char) i ? match : mismatch;
+ sp->gap = gap;
+ sp->ext = ext;
+ sp->type = 1;
+ free(s1);
+ return sp;
+}
+
+/*
+ dpAlign_Local_DNA_PhilGreen compares a DNA sequence profile
+ with a DNA sequence and return the optimal local alignment score.
+ */
+int
+dpAlign_Local_DNA_PhilGreen(dpAlign_SequenceProfile * sp, char * seq2)
+{
+ int i;
+ int N;
+ struct swstr * ss;
+ int score;
+ unsigned char * s2;
+
+ if (seq2 == NULL)
+ dpAlign_fatal("Sequence 2 is a NULL pointer!\n");
+
+ N = strlen(seq2);
+
+ s2 = (unsigned char *) malloc(N*sizeof(unsigned char));
+ if (s2 == NULL)
+ dpAlign_fatal("Cannot allocate memory for encoded sequence 2!\n");
+
+ for (i = 0; i < N; ++i) {
+ if (seq2[i] >= 'a' && seq2[i] <= 'z') seq2[i] -= 0x20;
+ s2[i] = dna_encode(seq2[i]);
+ }
+
+ ss = (struct swstr *) malloc(sizeof(struct swstr)*(sp->len+1));
+ if (ss == NULL)
+ dpAlign_fatal("Cannot allocate memory for ss array!\n");
+ score = pgreen(sp->waa, sp->len, s2, N, sp->gap, sp->ext, ss);
+ free(ss);
+ free(s2);
+ return score;
+}
+
+/*
+ dpAlign_Local_Protein_MillerMyers implements the Miller-Myers algorithm
+ that aligns protein sequences as defined in their 1988 paper. It takes
+ two character strings seq1 and seq2, the name of a scoring matrix.
+ Currently, we only support "BLOSUM62" matrix.
+ dpAlign_Local_Protein_MillerMyers returns a dpAlign_AlignOutput data
+ structure that can be easily converted into a Bio::SimpleAlign object.
+ */
+dpAlign_AlignOutput *
+dpAlign_Local_Protein_MillerMyers(char * seq1, char * seq2, dpAlign_ScoringMatrix * matrix)
+{
+ sw_AlignStruct * as;
+ int ** s;
+ int * a; /* alphabet array */
+ int gap = 7;
+ int ext = 1;
+ int sz = 24; /* size of alphabet */
+ int i, j;
+
+ if (seq1 == NULL)
+ dpAlign_fatal("Sequence 1 is a NULL pointer!\n");
+ if (seq2 == NULL)
+ dpAlign_fatal("Sequence 2 is a NULL pointer!\n");
+
+/* initialize the scoring matrix */
+ if (matrix == NULL) {
+ s = (int **) malloc(sz*sizeof(int *));
+ if (s == NULL)
+ dpAlign_fatal("Cannot allocate memory for scoring matrix row!\n");
+ for (i = 0; i < sz; ++i) {
+ s[i] = (int *) malloc(24*sizeof(int));
+ if (s[i] == NULL)
+ dpAlign_fatal("Cannot allocate memory for scoring matrix col!\n");
+ for (j = 0; j < sz; ++j) {
+ s[i][j] = blosum62[i][j];
+ }
+ }
+ a = (int *) malloc(256*sizeof(int));
+ if (a == NULL)
+ dpAlign_fatal("Cannot allocate memory for protein encoding array!\n");
+
+ a['A'] = 0x00; a['R'] = 0x01; a['N'] = 0x02; a['D'] = 0x03;
+ a['C'] = 0x04; a['Q'] = 0x05; a['E'] = 0x06; a['G'] = 0x07;
+ a['H'] = 0x08; a['I'] = 0x09; a['L'] = 0x0a; a['K'] = 0x0b;
+ a['M'] = 0x0c; a['F'] = 0x0d; a['P'] = 0x0e; a['S'] = 0x0f;
+ a['T'] = 0x10; a['W'] = 0x11; a['Y'] = 0x12; a['V'] = 0x13;
+ a['B'] = 0x14; a['Z'] = 0x15; a['X'] = 0x16; a['*'] = 0x17;
+ }
+ else {
+ a = matrix->a;
+ s = matrix->s;
+ gap = matrix->gap;
+ ext = matrix->ext;
+ sz = matrix->sz;
+ }
+
+/* initialize alignment data structure */
+ as = init_AlignStruct(seq1, seq2, s, gap, ext);
+
+/* uppercase the sequence and encode it */
+ for (i = 0; i < as->len1; ++i) {
+ if (as->seq1[i] >= 'a' && as->seq1[i] <= 'z') as->seq1[i] -= 0x20;
+ as->s1[i] = a[as->seq1[i]];
+ }
+ for (i = 0; i < as->len2; ++i) {
+ if (as->seq2[i] >= 'a' && as->seq2[i] <= 'z') as->seq2[i] -= 0x20;
+ as->s2[i] = a[as->seq2[i]];
+ }
+
+ if (matrix == NULL)
+ free(a); /* free array after encoding */
+
+/* locate the end points of the subsequence that results in the maximal score */
+ find_ends(as);
+
+/* align the subsequences bounded by the end points */
+ as->score = align(as->s1 + as->start1 - 1, as->s2 + as->start2 - 1, as->end1 - as->start1 + 1, as->end2 - as->start2 + 1, as->s, as->gap, as->ext, as->FF, as->RR, as->spc1, as->spc2);
+/* free scoring matrix
+ for (i = 0; i < sz; ++i) {
+ free(as->s[i]);
+ }
+ free(as->s);
+*/
+ return traceback(as);
+}
+
+/*
+ dpAlign_Global_Protein_MillerMyers implements the Miller-Myers algorithm
+ that aligns protein sequences as defined in their 1988 paper. It takes
+ two character strings seq1 and seq2, the name of a scoring matrix.
+ Currently, we only support "BLOSUM62" matrix.
+ dpAlign_Local_Protein_MillerMyers returns a dpAlign_AlignOutput data
+ structure that can be easily converted into a Bio::SimpleAlign object.
+ */
+dpAlign_AlignOutput *
+dpAlign_Global_Protein_MillerMyers(char * seq1, char * seq2, dpAlign_ScoringMatrix * matrix)
+{
+ sw_AlignStruct * as;
+ int ** s;
+ int * a; /* alphabet array */
+ int gap = 7;
+ int ext = 1;
+ int sz = 24; /* size of alphabet */
+ int i, j;
+
+ if (seq1 == NULL)
+ dpAlign_fatal("Sequence 1 is a NULL pointer!\n");
+ if (seq2 == NULL)
+ dpAlign_fatal("Sequence 2 is a NULL pointer!\n");
+
+/* initialize the scoring matrix */
+ if (matrix == NULL) {
+ s = (int **) malloc(sz*sizeof(int *));
+ if (s == NULL)
+ dpAlign_fatal("Cannot allocate memory for scoring matrix row!\n");
+ for (i = 0; i < sz; ++i) {
+ s[i] = (int *) malloc(sz*sizeof(int));
+ if (s[i] == NULL)
+ dpAlign_fatal("Cannot allocate memory for scoring matrix col!\n");
+ for (j = 0; j < sz; ++j) {
+ s[i][j] = blosum62[i][j];
+ }
+ }
+ a = (int *) malloc(256*sizeof(int));
+ if (a == NULL)
+ dpAlign_fatal("Cannot allocate memory for protein encoding array!\n");
+
+ a['A'] = 0x00; a['R'] = 0x01; a['N'] = 0x02; a['D'] = 0x03;
+ a['C'] = 0x04; a['Q'] = 0x05; a['E'] = 0x06; a['G'] = 0x07;
+ a['H'] = 0x08; a['I'] = 0x09; a['L'] = 0x0a; a['K'] = 0x0b;
+ a['M'] = 0x0c; a['F'] = 0x0d; a['P'] = 0x0e; a['S'] = 0x0f;
+ a['T'] = 0x10; a['W'] = 0x11; a['Y'] = 0x12; a['V'] = 0x13;
+ a['B'] = 0x14; a['Z'] = 0x15; a['X'] = 0x16; a['*'] = 0x17;
+ }
+ else {
+ a = matrix->a;
+ s = matrix->s;
+ gap = matrix->gap;
+ ext = matrix->ext;
+ sz = matrix->sz;
+ }
+
+/* initialize alignment data structure */
+ as = init_AlignStruct(seq1, seq2, s, gap, ext);
+
+/* uppercase the sequence and encode it */
+ for (i = 0; i < as->len1; ++i) {
+ if (as->seq1[i] >= 'a' && as->seq1[i] <= 'z') as->seq1[i] -= 0x20;
+ as->s1[i] = a[as->seq1[i]];
+ }
+ for (i = 0; i < as->len2; ++i) {
+ if (as->seq2[i] >= 'a' && as->seq2[i] <= 'z') as->seq2[i] -= 0x20;
+ as->s2[i] = a[as->seq2[i]];
+ }
+
+ if (matrix == NULL)
+ free(a); /* free array after encoding */
+
+/* initialize the spaces arrays */
+ as->spc1 = (int *) calloc(as->len1 + 1, sizeof(int));
+ if (as->spc1 == NULL)
+ dpAlign_fatal("Can't allocate memory for spaces array for seq 1!\n");
+ as->spc2 = (int *) calloc(as->len2 + 1, sizeof(int));
+ if (as->spc2 == NULL)
+ dpAlign_fatal("Can't allocate memory for spaces array for seq 2!\n");
+/* align the subsequences bounded by the end points */
+ as->score = align(as->s1, as->s2, as->len1, as->len2, as->s, as->gap, as->ext, as->FF, as->RR, as->spc1, as->spc2);
+
+/* free scoring matrix
+ for (i = 0; i < sz; ++i) {
+ free(as->s[i]);
+ }
+ free(as->s);
+*/
+
+/* set start and end for global alignment */
+ as->start1 = 1;
+ as->end1 = as->len1;
+ as->start2 = 1;
+ as->end2 = as->len2;
+
+ return traceback(as);
+}
+
+/*
+ init_AlignStruct initializes the alignment data structure by allocating
+ memory and setting values. It returns a sw_AlignStruct that is
+ initialized based on the two sequence strings seq1 and seq2, the
+ scoring matrix s, the gap opening cost gap and gap extension cost ext.
+ */
+static sw_AlignStruct *
+init_AlignStruct(char * seq1, char * seq2, int ** s, int gap, int ext)
+{
+ sw_AlignStruct * as = (sw_AlignStruct *) calloc(1, sizeof(sw_AlignStruct));
+
+ if (as == NULL)
+ dpAlign_fatal("Cannot allocate memory for dpAlign_AlignStruct!\n");
+
+ as->seq1 = seq1;
+ as->len1 = strlen(seq1);
+ if (as->len1 <= 0)
+ dpAlign_fatal("Sequence 1 is has non-positive length!\n");
+
+ as->seq2 = seq2;
+ as->len2 = strlen(seq2);
+ if (as->len2 <= 0)
+ dpAlign_fatal("Sequence 2 is has non-positive length!\n");
+
+/* allocate memory for Gotoh arrays */
+ as->FF = (struct swstr *) calloc((as->len2+1), sizeof(struct swstr));
+ if (as->FF == NULL)
+ dpAlign_fatal("Can't allocate memory for forward swstr array!\n");
+ as->RR = (struct swstr *) calloc((as->len2+1), sizeof(struct swstr));
+ if (as->RR == NULL)
+ dpAlign_fatal("Can't allocate memory for reverse swstr array!\n");
+
+/* allocate memory for encoded sequence string */
+ as->s1 = (unsigned char *) malloc(as->len1*sizeof(unsigned char));
+ if (as->s1 == NULL)
+ dpAlign_fatal("Cannot allocate memory for encoded sequence 1!\n");
+ as->s2 = (unsigned char *) malloc(as->len2*sizeof(unsigned char));
+ if (as->s2 == NULL)
+ dpAlign_fatal("Cannot allocate memory for encoded sequence 2!\n");
+
+ as->gap = gap;
+ as->ext = ext;
+ as->s = s;
+
+ return as;
+}
+
+/*
+ traceback takes a sw_AlignStruct with the gap arrays computed by
+ align and inserts gaps into the aligned subsequences. Then it
+ returns the dpAlign_AlignOutput that is to be converted into a
+ Bio::SimpleAlign object.
+ */
+static dpAlign_AlignOutput *
+traceback(sw_AlignStruct * as)
+{
+ dpAlign_AlignOutput * ao;
+ int aln1_len = as->end1 - as->start1 + 1;
+ int aln2_len = as->end2 - as->start2 + 1;
+ char aln_seq1[as->len1+as->len2+1];
+ char aln_seq2[as->len1+as->len2+1];
+ int i, j, k;
+
+/* free all allocated memory before we exit this module */
+ free(as->s1);
+ free(as->s2);
+ free(as->FF);
+ free(as->RR);
+
+ ao = (dpAlign_AlignOutput *) calloc(1, sizeof(dpAlign_AlignOutput));
+ if (ao == NULL)
+ dpAlign_fatal("Can't allocate memory for AlignOutput!\n");
+
+ ao->score = as->score;
+/* insert gaps to sequence 1 */
+ k = 0;
+ for (i = 0; i <= aln1_len; ++i) {
+ for (j = 0; j < as->spc1[i]; ++j)
+ aln_seq1[k++] = '-';
+ aln_seq1[k++] = as->seq1[i+as->start1-1];
+ }
+ aln_seq1[k-1] = '\0';
+
+ ao->aln1 = (char *) malloc(k*sizeof(char));
+ if (ao->aln1 == NULL)
+ dpAlign_fatal("Can't allocate memory for aln1!\n");
+ strcpy(ao->aln1, aln_seq1);
+
+/* insert gaps to sequence 2 */
+ k = 0;
+ for (i = 0; i <= aln2_len; ++i) {
+ for (j = 0; j < as->spc2[i]; ++j)
+ aln_seq2[k++] = '-';
+ aln_seq2[k++] = as->seq2[i+as->start2-1];
+ }
+ aln_seq2[k-1] = '\0';
+
+ ao->aln2 = (char *) malloc(k*sizeof(char));
+ if (ao->aln2 == NULL)
+ dpAlign_fatal("Can't allocate memory for aln1!\n");
+ strcpy(ao->aln2, aln_seq2);
+
+ ao->start1 = as->start1;
+ ao->start2 = as->start2;
+ ao->end1 = as->end1;
+ ao->end2 = as->end2;
+
+/* free the rest of allocated memory */
+ free(as->spc1);
+ free(as->spc2);
+ free(as);
+ return ao;
+}
+
+/*
+ find_ends set the end points in the sw_AlignStruct by employing
+ the Gotoh way of calculating alignment score. First it goes forward
+ to find the end points. Then from there it goes backwards to find
+ the start points. There are no return values, the end points
+ in sw_AlignStruct are set before the function returns.
+ */
+static void
+find_ends(sw_AlignStruct * as)
+{
+ struct swstr * F = as->FF;
+ struct swstr * R = as->RR;
+ int M = as->len1;
+ int N = as->len2;
+ unsigned char * A = as->s1;
+ unsigned char * B = as->s2;
+ struct swstr * swp;
+ int i, j;
+ int from, P;
+ int c; /* score of a cell */
+ int d; /* down value in Q array */
+ int ** s = as->s;
+ int * ss;
+ int g = as->gap;
+ int e = as->ext;
+ int h = g + e;
+ int score1 = 0, score2 = 0;
+
+/* find end points */
+ for (i = 0; i < M; ++i) {
+ F[0].H = P = from = c = 0;
+ ss = s[A[i]];
+ for (swp = F+1, j = 0; swp <= F+N; ++swp, ++j) {
+ if ((c = c - h) > (P = P - e)) P = c;
+ if ((c = swp->H - h) > (d = swp->E - e)) d = c;
+ c = from + ss[B[j]];
+ if (c < 0) c = 0;
+ if (P > c) c = P;
+ if (d > c) c = d;
+ swp->E = d;
+ from = swp->H;
+ swp->H = c;
+ if (c > score1) {
+ score1 = c;
+ as->end1 = i+1;
+ as->end2 = j+1;
+ }
+ }
+ }
+
+/* find start point */
+ M = as->end1;
+ N = as->end2;
+ for (i = M-1; i >= 0; --i) {
+ R[0].H = P = from = c = 0;
+ ss = s[A[i]];
+ for (swp = R+1, j = N-1; swp <= R+N; ++swp, --j) {
+ if ((c = c - h) > (P = P - e)) P = c;
+ if ((c = swp->H - h) > (d = swp->E - e)) d = c;
+ c = from + ss[B[j]];
+ if (c < 0) c = 0;
+ if (P > c) c = P;
+ if (d > c) c = d;
+ swp->E = d;
+ from = swp->H;
+ swp->H = c;
+ if (c > score2) {
+ score2 = c;
+ as->start1 = i+1;
+ as->start2 = j+1;
+ if (c >= score1) goto found; // score same as before, done!
+ }
+ }
+ }
+
+found:
+ as->score = score1;
+/* initialize the spaces arrays */
+ as->spc1 = (int *) calloc(as->end1 - as->start1 + 2, sizeof(int));
+ if (as->spc1 == NULL)
+ dpAlign_fatal("Can't allocate memory for spaces array for seq 1!\n");
+ as->spc2 = (int *) calloc(as->end2 - as->start2 + 2, sizeof(int));
+ if (as->spc2 == NULL)
+ dpAlign_fatal("Can't allocate memory for spaces array for seq 2!\n");
+}
View
8 Bio/Ext/Align/libs/makefile
@@ -35,13 +35,15 @@ OBJS = aln.o\
wiseoverlay.o\
wiserandom.o\
wisestring.o\
- wisetime.o
+ wisetime.o\
+ dpalign.o\
+ linspc.o
libsw.a : $(OBJS)
ar ru libsw.a $(OBJS)
-wisefile.o : wisefile.c
- $(CC) $(CFLAGS) -DNOERROR wisefile.c
+#wisefile.o : wisefile.c
+# $(CC) $(CFLAGS) -DNOERROR wisefile.c
#
# For NetBSD or Sun (solaris) installs, add -fPIC to the CFLAGS lines
View
24 Bio/Ext/Align/md_20.mat
@@ -0,0 +1,24 @@
+ A R N D C Q E G H I L K M F P S T W Y V B Z X
+A 10 -10 -9 -8 -10 -10 -7 -5 -12 -10 -12 -11 -9 -15 -5 -2 -1 -17 -16 -3 -9 -8 -1
+R -10 12 -10 -14 -7 -3 -11 -6 -3 -14 -12 0 -11 -18 -9 -7 -9 -6 -14 -14 -12 -7 -1
+N -9 -10 13 -1 -11 -8 -9 -8 -2 -11 -15 -4 -12 -16 -13 -1 -4 -18 -9 -14 6 -8 -1
+D -8 -14 -1 12 -16 -9 1 -6 -7 -16 -18 -11 -15 -20 -15 -9 -11 -20 -11 -12 6 -4 -1
+C -10 -7 -11 -16 17 -16 -19 -9 -9 -14 -13 -17 -12 -8 -14 -4 -11 -7 -4 -10 -14 -17 -1
+Q -10 -3 -8 -9 -16 13 -3 -12 0 -16 -9 -3 -11 -18 -5 -10 -10 -14 -12 -14 -9 5 -1
+E -7 -11 -9 1 -19 -3 11 -7 -12 -16 -17 -5 -14 -20 -14 -12 -12 -17 -18 -11 -4 4 -1
+G -5 -6 -8 -6 -9 -12 -7 11 -13 -17 -18 -12 -15 -19 -12 -5 -11 -10 -17 -11 -7 -9 -1
+H -12 -3 -2 -7 -9 0 -12 -13 15 -14 -10 -9 -12 -11 -7 -8 -10 -16 0 -15 -4 -6 -1
+I -10 -14 -11 -16 -14 -16 -16 -17 -14 12 -4 -14 -1 -8 -15 -11 -4 -16 -12 2 -13 -16 -1
+L -12 -11 -15 -18 -13 -9 -17 -18 -10 -4 10 -15 -2 -4 -7 -10 -12 -10 -13 -5 -17 -13 -1
+K -11 0 -4 -12 -17 -3 -5 -12 -9 -14 -15 12 -9 -21 -13 -10 -7 -16 -17 -15 -8 -4 -1
+M -9 -11 -12 -15 -12 -11 -15 -16 -12 -1 -2 -9 15 -10 -14 -12 -4 -13 -14 -3 -13 -13 -1
+F -15 -19 -16 -19 -8 -18 -20 -19 -11 -8 -4 -19 -10 13 -14 -8 -15 -10 0 -9 -17 -19 -1
+P -5 -9 -13 -15 -14 -5 -14 -12 -7 -15 -7 -13 -14 -14 12 -3 -7 -18 -16 -13 -14 -10 -1
+S -2 -8 -1 -9 -4 -10 -12 -5 -8 -11 -10 -10 -12 -8 -3 10 -1 -12 -9 -11 -5 -11 -1
+T -1 -9 -4 -11 -10 -10 -12 -11 -10 -4 -12 -7 -4 -15 -7 -1 11 -16 -14 -7 -7 -11 -1
+W -17 -6 -18 -18 -7 -14 -18 -10 -17 -17 -10 -17 -14 -10 -18 -12 -15 18 -9 -13 -18 -16 -1
+Y -16 -14 -9 -11 -4 -12 -18 -17 0 -12 -12 -17 -14 0 -16 -9 -13 -9 14 -15 -10 -15 -1
+V -3 -14 -14 -12 -9 -14 -11 -11 -15 2 -5 -15 -2 -9 -13 -11 -7 -13 -14 11 -13 -12 -1
+B -9 -12 6 6 -14 -9 -4 -7 -4 -13 -17 -8 -13 -18 -14 -5 -7 -19 -10 -13 12 -6 -1
+Z -12 -13 -13 -4 -27 4 10 -13 -12 -24 -21 -6 -20 -29 -17 -17 -17 -24 -24 -18 -6 12 -1
+X -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
View
88 Bio/Ext/Align/test.pl
@@ -1,37 +1,97 @@
-#!/usr/local/bin/perl
+# test framework rewritten by Jason Stajich
## Test framework for pSW XS stuff
## $Id$
## We start with some black magic to print on failure.
-BEGIN { $| = 1; print "1..2\n"; }
-END {print "not ok 1\n" unless $loaded;}
+my $DEBUG = $ENV{'BIOPERLDEBUG'} || 0;
+BEGIN {
+ eval { require Test; };
+ use Test;
+ plan tests => 9;
+}
use Bio::Ext::Align;
+use Bio::Tools::dpAlign;
+use Bio::Seq;
+use Bio::AlignIO;
$loaded = 1;
-print "ok 1\n"; # 1st test passes.
-
-print "\n2..2\nTesting bp_sw with a protein alignment...\n\n";
+ok(1); # modules loaded
&Bio::Ext::Align::change_max_BaseMatrix_kbytes(20000);
$cm = &Bio::Ext::Align::CompMat::read_Blast_file_CompMat("blosum62.bla");
+ok($cm);
$seq1 = &Bio::Ext::Align::new_Sequence_from_strings("one","WLGQRNLVSSTGGNLLNVWLKDW");
+ok($seq1);
$seq2 = &Bio::Ext::Align::new_Sequence_from_strings("two","WMGNRNVVNLLNVWFRDW");
+ok($seq2);
+
+$alb = &Bio::Ext::Align::Align_Sequences_ProteinSmithWaterman($seq1,$seq2,
+ $cm,-12,-2);
+&Bio::Ext::Align::write_pretty_str_align($alb,$seq1->name,
+ $seq1->seq,$seq2->name,
+ $seq2->seq,15,50,STDERR) if $DEBUG;
+
+
+warn( "Testing Local Alignment case...\n") if $DEBUg;
+
+$alnout = new Bio::AlignIO(-format => 'pfam', -fh => \*STDERR);
+$aln = &Bio::Ext::Align::Align_DNA_Sequences("AATGCCATTGACGG",
+ "CAGCCTCGCTTAG",3,-1,3,1,
+ Bio::Tools::dpAlign::DPALIGN_LOCAL_MILLER_MYERS);
+
+$out = Bio::SimpleAlign->new();
+
+$out->add_seq(Bio::LocatableSeq->new(-seq => $aln->aln1,
+ -start => $aln->start1,
+ -end => $aln->end1,
+ -id => "one"));
+
+$out->add_seq(Bio::LocatableSeq->new(-seq => $aln->aln2,
+ -start => $aln->start2,
+ -end => $aln->end2,
+ -id => "two"));
+$alnout->write_aln($out) if $DEBUG;
-$alb = &Bio::Ext::Align::Align_Sequences_ProteinSmithWaterman($seq1,$seq2,$cm,-12,-2);
+$aln = &Bio::Ext::Align::Align_Protein_Sequences("WLGQRNLVSSTGGNLLNVWLKDW","WMGNRNVVNLLNVWFRDW",0,
+ Bio::Tools::dpAlign::DPALIGN_LOCAL_MILLER_MYERS);
+$out = Bio::SimpleAlign->new();
+ok($aln);
-&Bio::Ext::Align::write_pretty_str_align($alb,$seq1->name,$seq1->seq,$seq2->name,$seq2->seq,15,50,STDERR);
+$out->add_seq(Bio::LocatableSeq->new(-seq => $aln->aln1,
+ -start => $aln->start1,
+ -end => $aln->end1,
+ -id => "one"));
-print "ok 2\n"; # Assume 2nd test worked.
+$out->add_seq(Bio::LocatableSeq->new(-seq => $aln->aln2,
+ -start => $aln->start2,
+ -end => $aln->end2,
+ -id => "two"));
+$alnout->write_aln($out) if $DEBUG;
+ok(1);
-## End of black magic.
-##
-## Insert additional test code below but remember to change
-## the print "1..x\n" in the BEGIN block to reflect the
-## total number of tests that will be run.
+warn( "Testing Global Alignment case...\n") if $DEBUG;
+$factory = new Bio::Tools::dpAlign('-alg' => Bio::Tools::dpAlign::DPALIGN_GLOBAL_MILLER_MYERS);
+$s1 = new Bio::Seq(-id => "one", -seq => "AATGCCATTGACGG", -alphabet => 'dna');
+$s2 = new Bio::Seq(-id => "two", -seq => "CAGCCTCGCTTAG", -alphabet => 'dna');
+$aln = $factory->pairwise_alignment($s1, $s2);
+$alnout->write_aln($aln) if $DEBUG;
+$factory->align_and_show($s1, $s2) if $DEBUG;
+ok(1);
+$s1 = new Bio::Seq(-id => "one", -seq => "WLGQRNLVSSTGGNLLNVWLKDW",
+ -alphabet => 'protein');
+$s2 = new Bio::Seq(-id => "two", -seq => "WMGNRNVVNLLNVWFRDW",
+ -alphabet => 'protein');
+$aln = $factory->pairwise_alignment($s1, $s2);
+$alnout->write_aln($aln) if $DEBUG;
+$factory->align_and_show($s1, $s2) if $DEBUG;
+ok(1);
+$prof = $factory->sequence_profile($s1);
+warn( "Optimal Alignment Score = %d\n", $factory->pairwise_alignment_score($prof, $s2)) if $DEBUG;
+ok($factory->pairwise_alignment_score($prof,$s2),77);
View
21 Bio/Ext/Align/typemap
@@ -349,3 +349,24 @@ T_bp_sw_Translation
OUTPUT
T_bp_sw_Translation
sv_setref_pv($arg, "Bio::Ext::Align::Translation", (void*) $var);
+
+TYPEMAP
+dpAlign_AlignOutput * T_AlignOutput
+dpAlign_SequenceProfile * T_SequenceProfile
+dpAlign_ScoringMatrix * T_ScoringMatrix
+
+INPUT
+T_AlignOutput
+ $var = ($type) (SvROK($arg) == 0 ? ($type) NULL : ($type) SvIV((SV*)SvRV($arg)))
+T_SequenceProfile
+ $var = ($type) (SvROK($arg) == 0 ? ($type) NULL : ($type) SvIV((SV*)SvRV($arg)))
+T_ScoringMatrix
+ $var = ($type) (SvROK($arg) == 0 ? ($type) NULL : ($type) SvIV((SV*)SvRV($arg)))
+
+OUTPUT
+T_AlignOutput
+ sv_setref_pv($arg, "Bio::Ext::Align::AlignOutput", (void*) $var);
+T_SequenceProfile
+ sv_setref_pv($arg, "Bio::Ext::Align::SequenceProfile", (void*) $var);
+T_ScoringMatrix
+ sv_setref_pv($arg, "Bio::Ext::Align::ScoringMatrix", (void*) $var);
View
49 Bio/Ext/HMM/HMM.pm
@@ -0,0 +1,49 @@
+
+package Bio::Ext::HMM;
+
+use vars qw($AUTOLOAD @ISA @EXPORT_OK $dl_debug);
+use Exporter;
+use Carp;
+use strict;
+
+use DynaLoader;
+
+
+@ISA = qw(Exporter DynaLoader);
+# Items to export into callers namespace by default. Note: do not export
+# names by default without a very good reason. Use EXPORT_OK instead.
+# Do not simply export all your public functions/methods/constants.
+
+sub AUTOLOAD {
+ # This AUTOLOAD is used to 'autoload' constants from the constant()
+ # XS function. If a constant is not found then control is passed
+ # to the AUTOLOAD in AutoLoader.
+
+ my $constname;
+ ($constname = $AUTOLOAD) =~ s/.*:://;
+ my $val = constant($constname, @_ ? $_[0] : 0);
+ if ($! != 0) {
+ if ($! =~ /Invalid/) {
+ $AutoLoader::AUTOLOAD = $AUTOLOAD;
+ goto &AutoLoader::AUTOLOAD;
+ }
+ else {
+ croak "Your vendor has not defined Test macro $constname";
+ }
+ }
+ eval "sub $AUTOLOAD { $val }";
+ goto &$AUTOLOAD;
+}
+
+BEGIN {
+ $dl_debug = 40;
+}
+
+bootstrap Bio::Ext::HMM; # hopefully has the correct things...
+
+# Preloaded methods go here.
+
+# Autoload methods go after __END__, and are processed by the autosplit program.
+
+1;
+__END__
View
207 Bio/Ext/HMM/HMM.xs
@@ -0,0 +1,207 @@
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "EXTERN.h"
+#include "perl.h"
+#include "XSUB.h"
+#ifdef __cplusplus
+}
+#endif
+
+#include "hmm.h"
+
+MODULE = Bio::Ext::HMM PACKAGE = Bio::Ext::HMM
+
+void
+HMM_statistical_training(class, hmm, obs, hs)
+ char * class
+ HMM * hmm
+ SV * obs
+ SV * hs
+ PPCODE:
+ AV * obs_av = (AV *) SvRV(obs);
+ AV * hs_av = (AV *) SvRV(hs);
+ int i;
+ int avlen = av_len(obs_av);
+ char ** obs_ar = (char **) malloc(avlen*sizeof(char *));
+ char ** hs_ar = (char **) malloc(avlen*sizeof(char *));
+ int * obs_len = (int *) malloc(avlen*sizeof(int));
+ if (obs_ar == NULL || hs_ar == NULL)
+ croak("Can't allocate memory for observation and/or state arrays!\n");
+ if (obs_len == NULL)
+ croak("Can't allocate memory for observation length array!\n");
+ for (i = 0; i < avlen; ++i) {
+ obs_ar[i] = (char *) SvPV(*av_fetch(obs_av, i, 0), PL_na);
+ obs_len[i] = strlen(obs_ar[i]);
+ obs_ar[i] = (char *) malloc((obs_len[i]+1)*sizeof(char));
+ if (obs_ar[i] == NULL)
+ croak("Can't allocate memory for observation array!\n");
+ strcpy(obs_ar[i], (char *) SvPV(*av_fetch(obs_av, i, 0), PL_na));
+ hs_ar[i] = (char *) malloc((obs_len[i]+1)*sizeof(char));
+ if (hs_ar[i] == NULL)
+ croak("Can't allocate memory for state array!\n");
+ strcpy(hs_ar[i], (char *) SvPV(*av_fetch(hs_av, i, 0), PL_na));
+ omap(hmm, obs_ar[i], obs_len[i]);
+ smap(hmm, hs_ar[i], obs_len[i]);
+ }
+ state_est(hmm, obs_ar, hs_ar, obs_len, avlen);
+ for (i = 0; i < avlen; ++i) {
+ free(obs_ar[i]);
+ free(hs_ar[i]);
+ }
+ free(obs_ar);
+ free(hs_ar);
+ free(obs_len);
+
+double
+HMM_likelihood(class, hmm, seq)
+ char * class
+ HMM * hmm
+ char * seq
+ CODE:
+ int T = strlen(seq);
+ char obs[T+1];
+ strcpy(obs, seq);
+ omap(hmm, obs, T);
+ RETVAL = Palpha(hmm, obs, T);
+ OUTPUT:
+ RETVAL
+
+void
+HMM_baum_welch_training(class, hmm, obs)
+ char * class
+ HMM * hmm
+ SV * obs
+ PPCODE:
+ AV * obs_av = (AV *) SvRV(obs);
+ int i;
+ int avlen = av_len(obs_av);
+ char ** obs_ar = (char **) malloc(avlen*sizeof(char *));
+ int * obs_len = (int *) malloc(avlen*sizeof(int));
+ if (obs_ar == NULL)
+ croak("Can't allocate memory for observation arrays!\n");
+ if (obs_len == NULL)
+ croak("Can't allocate memory for observation length array!\n");
+ for (i = 0; i < avlen; ++i) {
+ obs_ar[i] = (char *) SvPV(*av_fetch(obs_av, i, 0), PL_na);
+ obs_len[i] = strlen(obs_ar[i]);
+ obs_ar[i] = (char *) malloc((obs_len[i]+1)*sizeof(char));
+ if (obs_ar[i] == NULL)
+ croak("Can't allocate memory for observation array!\n");
+ strcpy(obs_ar[i], (char *) SvPV(*av_fetch(obs_av, i, 0), PL_na));
+ omap(hmm, obs_ar[i], obs_len[i]);
+ }
+ baum_welch(hmm, obs_ar, obs_len, avlen);
+ for (i = 0; i < avlen; ++i)
+ free(obs_ar[i]);
+ free(obs_ar);
+
+SV *
+HMM_viterbi(class, hmm, seq)
+ char * class
+ HMM * hmm
+ char * seq
+ PPCODE:
+ SV * sv;
+ int T = strlen(seq);
+ char * hss = (char *) malloc(T*sizeof(char));
+ char obs[T+1];
+ if (hss == NULL)
+ croak("Can't allocate memory for hidden state sequence!\n");
+ strcpy(obs, seq);
+ omap(hmm, obs, T);
+ viterbi(hmm, hss, obs, T);
+ sv = newSVpv(hss, strlen(hss));
+ free(hss);
+ PUSHs(sv_2mortal(sv));
+
+
+MODULE = Bio::Ext::HMM PACKAGE = Bio::Ext::HMM::HMM
+
+HMM *
+new(class, symbols, states)
+ char * class
+ char * symbols
+ char * states
+ PPCODE:
+ HMM * out;
+ out = HMM_new(symbols, states);
+ ST(0) = sv_newmortal();
+ sv_setref_pv(ST(0), class, (void *) out);
+ XSRETURN(1);
+
+double
+get_init_entry(class, hmm, state)
+ char * class
+ HMM * hmm
+ char * state
+ CODE:
+ RETVAL = HMM_get_init_entry(hmm, state);
+ OUTPUT:
+ RETVAL
+
+void
+set_init_entry(class, hmm, state, val)
+ char * class
+ HMM * hmm
+ char * state
+ double val
+ CODE:
+ HMM_set_init_entry(hmm, state, val);
+
+double
+get_a_entry(class, hmm, state1, state2)
+ char * class
+ HMM * hmm
+ char * state1
+ char * state2
+ CODE:
+ RETVAL = HMM_get_a_entry(hmm, state1, state2);
+ OUTPUT:
+ RETVAL
+
+void
+set_a_entry(class, hmm, state1, state2, val)
+ char * class
+ HMM * hmm
+ char * state1
+ char * state2
+ double val
+ CODE:
+ HMM_set_a_entry(hmm, state1, state2, val);
+
+double
+get_e_entry(class, hmm, state, symbol)
+ char * class
+ HMM * hmm
+ char * state
+ char * symbol
+ CODE:
+ RETVAL = HMM_get_e_entry(hmm, state, symbol);
+ OUTPUT:
+ RETVAL
+
+void
+set_e_entry(class, hmm, state, symbol, val)
+ char * class
+ HMM * hmm
+ char * state
+ char * symbol
+ double val
+ CODE:
+ HMM_set_e_entry(hmm, state, symbol, val);
+
+void
+DESTROY(obj)
+ HMM * obj
+ PPCODE:
+ int i;
+ free(obj->init);
+ for (i = 0; i < obj->N; ++i)
+ free(obj->a_mat[i]);
+ free(obj->a_mat);
+ for (i = 0; i < obj->N; ++i)
+ free(obj->e_mat[i]);
+ free(obj->e_mat);
+
View
13 Bio/Ext/HMM/Makefile.PL
@@ -0,0 +1,13 @@
+use ExtUtils::MakeMaker;
+# See lib/ExtUtils/MakeMaker.pm for details of how to influence
+# the contents of the Makefile that is written.
+WriteMakefile(
+ 'NAME' => 'Bio::Ext::HMM',
+ 'VERSION' => '0.1',
+ 'LIBS' => ['-lm'], # e.g., '-lm'
+ 'DEFINE' => '', # e.g., '-DHAVE_SOMETHING'
+ 'INC' => '', # e.g., '-I/usr/include/other'
+ 'OBJECT' => 'HMM.o hmmlib.o',
+ 'clean' => { 'FILES' => '*.o' }
+);
+
View
36 Bio/Ext/HMM/hmm.h
@@ -0,0 +1,36 @@
+#ifndef _HMM_H_
+#define _HMM_H_
+
+typedef struct HMM
+{
+ int M;
+ int N;
+ char * symbols;
+ unsigned char omap[256];
+ char * states;
+ unsigned char smap[256];
+ double * init;
+ double ** a_mat;
+ double ** e_mat;
+} HMM;
+
+#define UPPER_TOL 0.000001
+#define LOWER_TOL 1e-100
+
+HMM * init_HMM(char *, char *, double *, double **, double **);
+HMM * HMM_new(char *, char *);
+double HMM_get_init_entry(HMM *, char *);
+void HMM_set_init_entry(HMM *, char *, double);
+double HMM_get_a_entry(HMM *, char *, char *);
+void HMM_set_a_entry(HMM *, char *, char *, double);
+double HMM_get_e_entry(HMM *, char *, char *);
+void HMM_set_e_entry(HMM *, char *, char *, double);
+void omap(HMM *, char *, int);
+void smap(HMM *, char *, int);
+double Palpha(HMM *, char *, int);
+double Pbeta(HMM *, char *, int);
+void viterbi(HMM *, char *, char *, int);
+void state_est(HMM *, char **, char **, int *, int);
+void baum_welch(HMM *, char **, int *, int);
+
+#endif
View
779 Bio/Ext/HMM/hmmlib.c
@@ -0,0 +1,779 @@
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <sys/types.h>
+#include "hmm.h"
+
+static unsigned int hmm_seed;
+static void alphaT(HMM *, double *, char *, int);
+static void beta1(HMM *, double *, char *, int);
+static double * xi(HMM *);
+static void alpha_mat(HMM *, double **, char *, int);
+static void beta_mat(HMM *, double **, char *, int);
+static void HMM_fatal(char *);
+static double sumLogProbs(double *, int);
+static double sumLogProb(double, double);
+
+/*
+ sumLogProbs adds up the log probabilities stored in the array
+ logprobs of size sz by dividing every items with the largest
+ item in the array. This prevents underflow such that calculations
+ for long sequence are allowed. The function returns the log of the
+ sum of log probabilities.
+ log(a_0+a_1+...+a_n) = log(a_max)+log(a_0/a_max+a_1/a_max+...+a_n/a_max)
+ */
+static double
+sumLogProbs(double * logprobs, int sz)
+{
+ double max = 0.0;
+ double p = 0.0;
+ int i;
+ for (i = 0; i < sz; ++i)
+ if (i == 0 || logprobs[i] > max)
+ max = logprobs[i];
+ if (max == log(0))
+ return max;
+ for (i = 0; i < sz; ++i)
+ p += exp(logprobs[i] - max);
+ return max + log(p);
+}
+
+/*
+ sumLogProb is similar to sumLogProbs except that it is adding up
+ only two log probabilities p1 and p2. It returns the log of the sum
+ of p1 and p2.
+ */
+static double
+sumLogProb(double p1, double p2)
+{
+ if (p1 == log(0) && p2 == log(0))
+ return p1;
+ if (p1 > p2)
+ return p1 + log(1 + exp(p2 - p1));
+ else
+ return p2 + log(1 + exp(p1 - p2));
+}
+
+/*
+ HMM_fatal prints the string s to STDERR and then exit
+ */
+static void
+HMM_fatal(char * s)
+{
+ fprintf(stderr, s);
+ exit(-1);
+}
+
+/*
+ HMM_new initializes an HMM object with string of all possible
+ symbols and string of all possible states. The other parameters
+ like initial probabilities (HMM->init), state transition matrix
+ (HMM->a_mat) and emission matrix (HMM->e_mat) are initialized
+ randomly such that it is ready for training.
+ */
+HMM *
+HMM_new(char * symbols, char * states)
+{
+ HMM * hmm;
+ double sum, random;
+ int i, j;
+
+ hmm_seed = time(NULL);
+ hmm = (HMM *) malloc(sizeof(HMM));
+ if (hmm == NULL)
+ HMM_fatal("Can't allocate memory for HMM, die!\n");
+ hmm->symbols = symbols;
+ hmm->states = states;
+ hmm->M = strlen(symbols);
+ hmm->N = strlen(states);
+ for (i = 0; i < hmm->M; ++i)
+ hmm->omap[symbols[i]] = i;
+ for (i = 0; i < hmm->N; ++i)
+ hmm->smap[states[i]] = i;
+ hmm->init = (double *) malloc(hmm->N*sizeof(double));
+ if (hmm->init == NULL)
+ HMM_fatal("Can't allocate memory for init array of HMM, die!\n");
+/* initialize the initial state array */
+ sum = 0.0;
+ for (j = 0; j < hmm->N; ++j) {
+ srand(hmm_seed++);
+ random = (double) rand();
+ hmm->init[j] = random;
+ sum += random;
+ }
+ for (j = 0; j < hmm->N; ++j)
+ hmm->init[j] /= sum;
+ hmm->e_mat = (double **) malloc(hmm->N*sizeof(double *));
+ hmm->a_mat = (double **) malloc(hmm->N*sizeof(double *));
+ if (hmm->a_mat == NULL)
+ HMM_fatal("Can't allocate memory for transition matrix of HMM, die!\n");
+ for (i = 0; i < hmm->N; ++i) {
+ hmm->a_mat[i] = (double *) malloc(hmm->N*sizeof(double));
+ if (hmm->a_mat[i] == NULL)
+ HMM_fatal("Can't allocate memory for transition matrix of HMM, die!\n");
+
+/* randomize the transition matrix */
+ sum = 0.0;
+ for (j = 0; j < hmm->N; ++j) {
+ srand(hmm_seed++);
+ random = (double) rand();
+ hmm->a_mat[i][j] = random;
+ sum += random;
+ }
+ for (j = 0; j < hmm->N; ++j)
+ hmm->a_mat[i][j] /= sum;
+ }
+ hmm->e_mat = (double **) malloc(hmm->N*sizeof(double *));
+ if (hmm->e_mat == NULL)
+ HMM_fatal("Can't allocate memory for emission matrix of HMM, die!\n");
+ for (i = 0; i < hmm->N; ++i) {
+ hmm->e_mat[i] = (double *) malloc(hmm->M*sizeof(double));
+ if (hmm->e_mat[i] == NULL)
+ HMM_fatal("Can't allocate memory for emission matrix of HMM, die!\n");
+
+/* randomize the emission matrix */
+ sum = 0.0;
+ for (j = 0; j < hmm->M; ++j) {
+ srand(hmm_seed++);
+ random = (double) rand();
+ hmm->e_mat[i][j] = random;
+ sum += random;
+ }
+ for (j = 0; j < hmm->M; ++j)
+ hmm->e_mat[i][j] /= sum;
+ }
+ return hmm;
+}
+
+/*
+ init_HMM is similar to HMM_new except that it allows you to initialize
+ the HMM with your own initial probabilities (init), your own state
+ transition matrix (a_mat) and your own emission matrix (e_mat).
+ */
+HMM *
+init_HMM(char * symbols, char * states, double * init, double ** a_mat, double ** e_mat)
+{
+ HMM * hmm;
+ int i;
+
+ hmm = (HMM *) calloc(1, sizeof(HMM));
+ if (hmm == NULL) {
+ fprintf(stderr, "Can't allocate memory for HMM, die!\n");
+ exit(-1);
+ }
+ hmm->symbols = symbols;
+ hmm->states = states;
+ hmm->M = strlen(symbols);
+ hmm->N = strlen(states);
+ for (i = 0; i < hmm->M; ++i)
+ hmm->omap[symbols[i]] = i;
+ for (i = 0; i < hmm->N; ++i)
+ hmm->smap[states[i]] = i;
+ hmm->init = init;
+ hmm->a_mat = a_mat;
+ hmm->e_mat = e_mat;
+ return hmm;
+}
+
+/*
+ HMM_get_init_entry returns the value of a cell in the initial
+ probability array based on the state supplied.
+ HMM_get_init_entry is written such that XS can access the C array.
+ */
+double
+HMM_get_init_entry(HMM * hmm, char * state)
+{
+ return hmm->init[hmm->smap[state[0]]];
+}
+
+/*
+ HMM_set_init_entry sets the value of a cell in the initial
+ probability array based on the val supplied.
+ HMM_set_init_entry is written such that XS can modify the C array.
+ */
+void
+HMM_set_init_entry(HMM * hmm, char * state, double val)
+{
+ hmm->init[hmm->smap[state[0]]] = val;
+}
+
+/*
+ HMM_get_a_entry returns the value of a cell in the state transition
+ matrix based on the from-state (state1) and the to-state (state2)
+ supplied.
+ HMM_get_a_entry is written such that XS can access the C matrix.
+ */
+double
+HMM_get_a_entry(HMM * hmm, char * state1, char * state2)
+{
+ return hmm->a_mat[hmm->smap[state1[0]]][hmm->smap[state2[0]]];
+}
+
+/*
+ HMM_set_a_entry sets the value of a cell in the state transition
+ matrix to val based on the from-state (state1) and the to-state
+ (state2) supplied.
+ HMM_set_a_entry is written such that XS can modify the C matrix.
+ */
+void
+HMM_set_a_entry(HMM * hmm, char * state1, char * state2, double val)
+{
+ hmm->a_mat[hmm->smap[state1[0]]][hmm->smap[state2[0]]] = val;
+}
+
+/*
+ HMM_get_e_entry returns the value of a cell in the emission
+ matrix based on the from-state (state1) and the to-state (state2)
+ supplied.
+ HMM_get_e_entry is written such that XS can access the C matrix.
+ */
+double
+HMM_get_e_entry(HMM * hmm, char * state, char * symbol)
+{
+ return hmm->e_mat[hmm->smap[state[0]]][hmm->omap[symbol[0]]];
+}
+
+/*
+ HMM_set_e_entry sets the value of a cell in the emission matrix
+ to val based on the from-state (state1) and the to-state (state2)
+ supplied.
+ HMM_set_e_entry is written such that XS can modify the C matrix.
+ */
+void
+HMM_set_e_entry(HMM * hmm, char * state, char * symbol, double val)
+{
+ hmm->e_mat[hmm->smap[state[0]]][hmm->omap[symbol[0]]] = val;
+}
+
+/*
+ omap takes a symbol character and return its index in the state
+ transition matrix.
+ */
+void
+omap(HMM * hmm, char * c, int L)
+{
+ int i;
+
+ for (i = 0; i < L; ++i)
+ c[i] = hmm->omap[c[i]];
+}
+
+/*
+ smap takes a state character and return its index in the emission
+ matrix.
+ */
+void
+smap(HMM * hmm, char * c, int L)
+{
+ int i;
+
+ for (i = 0; i < L; ++i)
+ c[i] = hmm->smap[c[i]];
+}
+
+/*
+ alpha function for HMM. It computes the alpha function value
+ at the time T which is the length of the observation sequence
+ obs for every state. Note that alpha_vec needs to be malloc'd
+ before we supply it to alphaT.
+ Note that the return vector is in logarithmic form.
+ */
+static void
+alphaT(HMM * hmm, double * alpha_vec, char * obs, int T)
+{
+ int N = hmm->N;
+ double alpha_vec_prev[N];
+ double * init = hmm->init;
+ double ** a_mat = hmm->a_mat;
+ double ** e_mat = hmm->e_mat;
+ int i, j, k;
+
+ if (T <= 0) {
+ fprintf(stderr, "Nonpositive T, die!\n");
+ exit(-1);
+ }
+ for (i = 0; i < N; ++i)
+ alpha_vec[i] = log(init[i]) + log(e_mat[i][obs[0]]);
+ if (T == 1)
+ return;
+ if (T > 1) {
+ for (i = 1; i < T; ++i) {
+ for (j = 0; j < N; ++j) {
+ alpha_vec_prev[j] = alpha_vec[j];
+ alpha_vec[j] = log(0);
+ }
+
+ for (j = 0; j < N; ++j) {
+ for (k = 0; k < N; ++k)
+ alpha_vec[j] = sumLogProb(alpha_vec[j], alpha_vec_prev[k] + log(a_mat[k][j]));
+ alpha_vec[j] = alpha_vec[j] + log(e_mat[j][obs[i]]);
+ }
+ }
+ }
+ return;
+}
+
+/*
+ Palpha computes the probability of an observation sequence
+ using the alpha function.
+ */
+double
+Palpha(HMM * hmm, char * obs, int T)
+{
+ double P = 0.0;
+ int i;
+ double * alpha_vec;
+
+ alpha_vec = (double *) calloc(hmm->N, sizeof(double));
+ if (alpha_vec == NULL) {
+ fprintf(stderr, "Can't allocate memory for alpha vector, die!\n");
+ exit(-1);
+ }
+ alphaT(hmm, alpha_vec, obs, T);
+ P = sumLogProbs(alpha_vec, hmm->N);
+ free(alpha_vec);
+ return P;
+}
+
+
+/*
+ beta function for HMM. It computes the beta function value
+ at the time 1 for every state. Note that beta_vec should
+ be allocated memory thru malloc before we use it.
+ Note that the return vector is in logarithmic form.
+ */
+static void
+beta1(HMM * hmm, double * beta_vec, char * obs, int T)
+{
+ int N = hmm->N;
+ double beta_vec_next[N];
+ double * init = hmm->init;
+ double ** a_mat = hmm->a_mat;
+ double ** e_mat = hmm->e_mat;
+ int i, j, k;
+
+ if (T <= 0) {
+ fprintf(stderr, "Nonpositive T, die!\n");
+ exit(-1);
+ }
+ for (i = 0; i < N; ++i)
+ beta_vec[i] = 0;
+ if (T == 1)
+ return;
+ if (T > 1) {
+ for (i = T - 1; i >= 1; --i) {
+ for (j = 0; j < N; ++j) {
+ beta_vec_next[j] = beta_vec[j];
+ beta_vec[j] = log(0);
+ }
+
+ for (j = 0; j < N; ++j) {
+ for (k = 0; k < N; ++k)
+ beta_vec[j] = sumLogProb(beta_vec[j], beta_vec_next[k] + log(a_mat[j][k]) + log(e_mat[k][obs[i]]));
+ }
+ }
+ }
+ return;
+}
+
+/*
+ Pbeta computes the probability of an observation sequence
+ using the beta function.
+ */
+double
+Pbeta(HMM * hmm, char * obs, int T)
+{
+ double P = log(0);
+ int i;
+ double * beta_vec;
+
+ beta_vec = (double *) calloc(hmm->N, sizeof(double));
+ if (beta_vec == NULL) {
+ fprintf(stderr, "Can't allocate memory for beta vector, die!\n");
+ exit(-1);
+ }
+ beta1(hmm, beta_vec, obs, T);
+ for (i = 0; i < hmm->N; ++i) {
+ P = sumLogProb(P, log(hmm->init[i]) + log(hmm->e_mat[i][obs[0]]) + beta_vec[i]);
+ }
+ free(beta_vec);
+ return P;
+}
+
+/*
+ alpha_mat computes the values for the alpha function in the TxN space
+ where T is the length of the observation sequence and N is the number
+ of states. Note that the alpha matrix has to be malloc'd before we
+ supply it to alpha_mat. The computed values will be stored in
+ alpha.
+ */
+static void
+alpha_mat(HMM * hmm, double ** alpha, char * obs, int T)
+{
+ int i, j, t;
+ int N = hmm->N;
+ double ** a_mat = hmm->a_mat;
+ double ** e_mat = hmm->e_mat;
+ double * init = hmm->init;
+
+ for (i = 0; i < N; ++i)
+ alpha[i][0] = log(init[i]) + log(e_mat[i][obs[0]]);
+ for (t = 1; t < T; ++t) {
+ for (i = 0; i < N; ++i)
+ alpha[i][t] = log(0);
+ for (i = 0; i < N; ++i) {
+ for (j = 0; j < N; ++j)
+ alpha[i][t] = sumLogProb(alpha[i][t], alpha[j][t-1] + log(a_mat[j][i]));
+ alpha[i][t] += log(e_mat[i][obs[t]]);
+ }
+ }
+}
+
+/*
+ beta_mat computes the values for the alpha function in the TxN space
+ where T is the length of the observation sequence and N is the number
+ of states. Note that the beta matrix has to be malloc'd before we
+ supply it to beta_mat. The computed values will be stored in
+ beta.
+ */
+static void
+beta_mat(HMM * hmm, double ** beta, char * obs, int T)
+{
+ int i, j, t;
+ int N = hmm->N;
+ double ** a_mat = hmm->a_mat;
+ double ** e_mat = hmm->e_mat;
+ double * init = hmm->init;
+
+ for (i = 0; i < N; ++i)
+ beta[i][T-1] = 0;
+ for (t = T-2; t >= 0; --t) {
+ for (i = 0; i < N; ++i)
+ beta[i][t] = log(0);
+ for (i = 0; i < N; ++i)
+ for (j = 0; j < N; ++j)
+ beta[i][t] = sumLogProb(beta[i][t], beta[j][t+1] + log(a_mat[i][j]) + log(e_mat[j][obs[t+1]]));
+ }
+}
+
+/*
+ baum_welch takes an array of observation sequence strings (obs) and
+ the number of strings in the array (L) to estimate the parameters
+ of an HMM using a special case of Expectation Maximization called
+ Baum-Welch algorithm. baum_welch doesn't return anything but upon
+ its completion, it will set the init array, a_mat matrix, e_mat
+ matrix with parameters that maximize the chance of seeing the
+ observation sequences you supplied.
+ */
+void
+baum_welch(HMM * hmm, char ** obs, int * T, int L)
+{
+ int i, j, l, t;
+ int maxL = -1; /* length of longest training sequence */
+ double P[L];
+ double newP[L];
+ double S, newS, diff;
+ int N = hmm->N, M = hmm->M;
+ double ** alpha;
+ double ** beta;
+ double A[N][N];
+ double E[N][M];
+ double init[N];
+ double a_mat[N][N];
+ double e_mat[N][M];
+
+ for (l = 0; l < L; ++l) {
+ P[l] = Palpha(hmm, obs[l], T[l]);
+ if (maxL == -1)
+ maxL = T[l];
+ else if (T[l] > maxL)
+ maxL = T[l];
+ }
+
+ alpha = (double **) malloc(hmm->N*sizeof(double *));
+ if (alpha == NULL) {
+ fprintf(stderr, "Can't allocate memory for alpha matrix!\n");
+ exit(-1);
+ }
+ for (i = 0; i < N; ++i) {
+ alpha[i] = (double *) malloc(maxL*sizeof(double));
+ if (alpha[i] == NULL) {
+ fprintf(stderr, "Can't allocate memory for rows of alpha matrix!\n");
+ exit(-1);
+ }
+ }
+ beta = (double **) malloc(hmm->N*sizeof(double *));
+ if (beta == NULL) {
+ fprintf(stderr, "Can't allocate memory for beta matrix!\n");
+ exit(-1);
+ }
+ for (i = 0; i < N; ++i) {
+ beta[i] = (double *) malloc(maxL*sizeof(double));
+ if (beta[i] == NULL) {
+ fprintf(stderr, "Can't allocate memory for rows of beta matrix!\n");
+ exit(-1);
+ }
+ }
+
+ do { /* do...while loop */
+
+/* initialize the matrices */
+ for (i = 0; i < N; ++i)
+ init[i] = 0.0;
+ for (i = 0; i < N; ++i)
+ for (j = 0; j < N; ++j)
+ A[i][j] = 0.0;
+ for (i = 0; i < N; ++i)
+ for (j = 0; j < M; ++j)
+ E[i][j] = 0.0;
+
+/* use all training sequences to train */
+ for (l = 0; l < L; ++l) {
+ alpha_mat(hmm, alpha, obs[l], T[l]);
+ beta_mat(hmm, beta, obs[l], T[l]);
+
+/* estimate new initial state probability */
+ for (i = 0; i < N; ++i)
+ for (j = 0; j < N; ++j)
+ init[i] += exp(alpha[i][0] + log(hmm->a_mat[i][j]) + log(hmm->e_mat[j][obs[l][1]]) + beta[j][1] - P[l]);
+
+/* estimate A matrix */
+ for (i = 0; i < N; ++i)
+ for (j = 0; j < N; ++j)
+ for (t = 0; t < T[l]-1; ++t)
+ A[i][j] += exp(alpha[i][t] + log(hmm->a_mat[i][j]) + log(hmm->e_mat[j][obs[l][t+1]]) + beta[j][t+1] - P[l]);
+
+/* Estimate E matrix */
+ for (i = 0; i < N; ++i)
+ for (t = 0; t < T[l]; ++t)
+ E[i][obs[l][t]] += exp(alpha[i][t] + beta[i][t] - P[l]);
+ }
+
+/* Estimate init */
+ for (i = 0; i < N; ++i)
+ hmm->init[i] = init[i] / (double) L;
+
+/* Estimate a_mat */
+ for (i = 0; i < N; ++i)
+ for (j = 0; j < N; ++j)
+ a_mat[i][j] = hmm->a_mat[i][j];
+
+ for (i = 0; i < N; ++i)
+ for (j = 0; j < N; ++j) {
+ hmm->a_mat[i][j] = 0.0;
+ for (t = 0; t < N; ++t)
+ hmm->a_mat[i][j] += A[i][t];
+ hmm->a_mat[i][j] = A[i][j] / hmm->a_mat[i][j];
+ }
+
+/* Estimate e_mat */
+ for (i = 0; i < N; ++i)
+ for (j = 0; j < M; ++j)
+ e_mat[i][j] = hmm->e_mat[i][j];
+
+ for (i = 0; i < N; ++i)
+ for (j = 0; j < M; ++j) {
+ hmm->e_mat[i][j] = 0.0;
+ for (t = 0; t < M; ++t)