From fc4279451727d186a3e4fb55caf8f33fcc37a1af Mon Sep 17 00:00:00 2001 From: Tim Wintle Date: Tue, 27 Sep 2011 23:57:25 +0100 Subject: [PATCH 1/5] Pre-allocation of single char strings For dense tries this allows a very significant memory reduction --- Bio/trie.c | 136 +++++++++++++++++++++++++++++++++++++---------- Bio/triemodule.c | 1 - 2 files changed, 109 insertions(+), 28 deletions(-) diff --git a/Bio/trie.c b/Bio/trie.c index c7973978b4e..ffab97599b4 100644 --- a/Bio/trie.c +++ b/Bio/trie.c @@ -1,17 +1,101 @@ #include /* printf */ #include /* malloc */ -#include /* strcmp, strlen */ +#include /* strcmp, strncmp, strlen */ #include "trie.h" +const char _single_char_suffixes[512] = { + 0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0, + 8,0,9,0,10,0,11,0,12,0,13,0,14,0,15,0, + 16,0,17,0,18,0,19,0,20,0,21,0,22,0,23,0, + 24,0,25,0,26,0,27,0,28,0,29,0,30,0,31,0, + 32,0,33,0,34,0,35,0,36,0,37,0,38,0,39,0, + 40,0,41,0,42,0,43,0,44,0,45,0,46,0,47,0, + 48,0,49,0,50,0,51,0,52,0,53,0,54,0,55,0, + 56,0,57,0,58,0,59,0,60,0,61,0,62,0,63,0, + 64,0,65,0,66,0,67,0,68,0,69,0,70,0,71,0, + 72,0,73,0,74,0,75,0,76,0,77,0,78,0,79,0, + 80,0,81,0,82,0,83,0,84,0,85,0,86,0,87,0, + 88,0,89,0,90,0,91,0,92,0,93,0,94,0,95,0, + 96,0,97,0,98,0,99,0,100,0,101,0,102,0,103,0, + 104,0,105,0,106,0,107,0,108,0,109,0,110,0,111,0, + 112,0,113,0,114,0,115,0,116,0,117,0,118,0,119,0, + 120,0,121,0,122,0,123,0,124,0,125,0,126,0,127,0, + 128,0,129,0,130,0,131,0,132,0,133,0,134,0,135,0, + 136,0,137,0,138,0,139,0,140,0,141,0,142,0,143,0, + 144,0,145,0,146,0,147,0,148,0,149,0,150,0,151,0, + 152,0,153,0,154,0,155,0,156,0,157,0,158,0,159,0, + 160,0,161,0,162,0,163,0,164,0,165,0,166,0,167,0, + 168,0,169,0,170,0,171,0,172,0,173,0,174,0,175,0, + 176,0,177,0,178,0,179,0,180,0,181,0,182,0,183,0, + 184,0,185,0,186,0,187,0,188,0,189,0,190,0,191,0, + 192,0,193,0,194,0,195,0,196,0,197,0,198,0,199,0, + 200,0,201,0,202,0,203,0,204,0,205,0,206,0,207,0, + 208,0,209,0,210,0,211,0,212,0,213,0,214,0,215,0, + 216,0,217,0,218,0,219,0,220,0,221,0,222,0,223,0, + 224,0,225,0,226,0,227,0,228,0,229,0,230,0,231,0, + 232,0,233,0,234,0,235,0,236,0,237,0,238,0,239,0, + 240,0,241,0,242,0,243,0,244,0,245,0,246,0,247,0, + 248,0,249,0,250,0,251,0,252,0,253,0,254,0,255,0, +}; + +static char* duplicate(const char* s) { + int length = strlen(s); + if (length == 1) { + return (char *)_single_char_suffixes + (s[0] * 2); + } + // Don't use strdup, as it's not ANSI C. + char* t = malloc((strlen(s) + 1) * sizeof(char)); + if (!t) return NULL; + strcpy(t, s); + return t; +} + +static char* shared_suffix(const char* s, int length) { + if (length == 1) { + return (char *)(_single_char_suffixes + ((int)s[0] * 2)); + } + char *t = malloc(length + 1); + if (!t) return NULL; + strncpy(t, s, length); + t[length] = 0; + return t; +} + +static void free_suffix(char *s) { + if (s >= _single_char_suffixes || s < _single_char_suffixes + (256 * 2)) { + // single character suffix + return; + } else { + free(s); + } +} + +/** + * Old method without using lookup table + */ +/* static char* duplicate(const char* s) { - /* Don't use strdup, as it's not ANSI C. */ + // Don't use strdup, as it's not ANSI C. char* t = malloc((strlen(s)+1)*sizeof(char)); if (!t) return NULL; strcpy(t, s); return t; } +static char *shared_suffix(const char* s, int length) { + char *t= malloc(length + 1); + if (!t) return NULL; + strncpy(t, s, length); + t[length] = 0; + return t; +} + +static void free_suffix(char *s) { + free(s); +} +*/ + /* Transition holds information about the transitions leading from * one Trie to another. The trie structure here is different from @@ -130,7 +214,7 @@ int Trie_set(Trie* trie, const char *key, const void *value) { insert_memerror: if(new_transitions) free(new_transitions); if(newtrie) free(newtrie); - if(new_suffix) free(new_suffix); + if(new_suffix) free_suffix(new_suffix); return 1; } } @@ -156,10 +240,8 @@ int Trie_set(Trie* trie, const char *key, const void *value) { Trie* newtrie=NULL; char *new_suffix1=NULL, *new_suffix2=NULL; - if(!(new_suffix1 = malloc(chars_shared+1))) + if(!(new_suffix1 = shared_suffix(key, chars_shared))) goto split_memerror; - strncpy(new_suffix1, key, chars_shared); - new_suffix1[chars_shared] = 0; if(!(new_suffix2 = duplicate(suffix+chars_shared))) goto split_memerror; if(!(newtrie = Trie_new())) @@ -170,7 +252,7 @@ int Trie_set(Trie* trie, const char *key, const void *value) { newtrie->transitions[0].next = transition->next; newtrie->transitions[0].suffix = new_suffix2; - free(transition->suffix); + free_suffix(transition->suffix); transition->suffix = new_suffix1; transition->next = newtrie; @@ -178,8 +260,8 @@ int Trie_set(Trie* trie, const char *key, const void *value) { split_memerror: if(newtrie && newtrie->transitions) free(newtrie->transitions); if(newtrie) free(newtrie); - if(new_suffix2) free(new_suffix2); - if(new_suffix1) free(new_suffix1); + if(new_suffix2) free_suffix(new_suffix2); + if(new_suffix1) free_suffix(new_suffix1); return 1; } } @@ -196,7 +278,7 @@ void Trie_del(Trie* trie) { for(i=0; inum_transitions; i++) { Transition* transition = &trie->transitions[i]; if(transition->suffix) - free(transition->suffix); + free_suffix(transition->suffix); Trie_del(transition->next); } free(trie); @@ -215,22 +297,22 @@ void *Trie_get(const Trie* trie, const char *key) { first = 0; last = trie->num_transitions-1; while(first <= last) { - Transition* transition; - char *suffix; - int c; - mid = (first+last)/2; - transition = &trie->transitions[mid]; - suffix = transition->suffix; - /* If suffix is a substring of key, then get the value from - the next trie. - */ - c = strncmp(key, suffix, strlen(suffix)); - if(c < 0) - last = mid-1; - else if(c > 0) - first = mid+1; - else - return Trie_get(transition->next, key+strlen(suffix)); + Transition* transition; + char *suffix; + int c; + mid = (first+last)/2; + transition = &trie->transitions[mid]; + suffix = transition->suffix; + /* If suffix is a substring of key, then get the value from + the next trie. + */ + c = strncmp(key, suffix, strlen(suffix)); + if(c < 0) + last = mid-1; + else if(c > 0) + first = mid+1; + else + return Trie_get(transition->next, key+strlen(suffix)); } return NULL; } @@ -723,7 +805,7 @@ int _deserialize_transition(Transition* transition, _deserialize_transition_error: if(transition->suffix) { - free(transition->suffix); + free_suffix(transition->suffix); transition->suffix = NULL; } if(transition->next) { diff --git a/Bio/triemodule.c b/Bio/triemodule.c index ec0e8a22ade..c2bdee380df 100644 --- a/Bio/triemodule.c +++ b/Bio/triemodule.c @@ -655,6 +655,5 @@ DL_EXPORT(void) inittrie(void) { Trie_Type.ob_type = &PyType_Type; - (void) Py_InitModule3("trie", trie_methods, trie__doc__); } From 21908a8db8470c11ac2891bf8f2114d3447aca6e Mon Sep 17 00:00:00 2001 From: Tim Wintle Date: Wed, 28 Sep 2011 02:12:13 +0100 Subject: [PATCH 2/5] Removing old functions These were commented out but here for simplicity of testing alternative performance --- Bio/trie.c | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/Bio/trie.c b/Bio/trie.c index ffab97599b4..c1a32ef4dcb 100644 --- a/Bio/trie.c +++ b/Bio/trie.c @@ -71,31 +71,6 @@ static void free_suffix(char *s) { } } -/** - * Old method without using lookup table - */ -/* -static char* duplicate(const char* s) { - // Don't use strdup, as it's not ANSI C. - char* t = malloc((strlen(s)+1)*sizeof(char)); - if (!t) return NULL; - strcpy(t, s); - return t; -} - -static char *shared_suffix(const char* s, int length) { - char *t= malloc(length + 1); - if (!t) return NULL; - strncpy(t, s, length); - t[length] = 0; - return t; -} - -static void free_suffix(char *s) { - free(s); -} -*/ - /* Transition holds information about the transitions leading from * one Trie to another. The trie structure here is different from From f8e111c7eedfef34d1131355ad0645e58f4b3f03 Mon Sep 17 00:00:00 2001 From: Tim Wintle Date: Fri, 30 Sep 2011 10:07:46 +0100 Subject: [PATCH 3/5] Fixing memory leak --- Bio/trie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Bio/trie.c b/Bio/trie.c index c1a32ef4dcb..d4e5e574c23 100644 --- a/Bio/trie.c +++ b/Bio/trie.c @@ -63,7 +63,7 @@ static char* shared_suffix(const char* s, int length) { } static void free_suffix(char *s) { - if (s >= _single_char_suffixes || s < _single_char_suffixes + (256 * 2)) { + if (s >= _single_char_suffixes && s < _single_char_suffixes + (256 * 2)) { // single character suffix return; } else { From ad435a0693e46ffdd8dcd2687b009d6a717fe140 Mon Sep 17 00:00:00 2001 From: Tim Wintle Date: Mon, 3 Oct 2011 09:24:33 +0100 Subject: [PATCH 4/5] Issues with non-ascii codepoints --- Bio/trie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Bio/trie.c b/Bio/trie.c index d4e5e574c23..8b0caa78b1f 100644 --- a/Bio/trie.c +++ b/Bio/trie.c @@ -42,7 +42,7 @@ const char _single_char_suffixes[512] = { static char* duplicate(const char* s) { int length = strlen(s); if (length == 1) { - return (char *)_single_char_suffixes + (s[0] * 2); + return (char *)_single_char_suffixes + ((unsigned char)s[0] * 2); } // Don't use strdup, as it's not ANSI C. char* t = malloc((strlen(s) + 1) * sizeof(char)); From 4379e8663c9f0b7daa3caae3fa20c66b0cac765e Mon Sep 17 00:00:00 2001 From: Tim Wintle Date: Mon, 3 Oct 2011 09:25:35 +0100 Subject: [PATCH 5/5] Issues with non-ascii codepoints --- Bio/trie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Bio/trie.c b/Bio/trie.c index 8b0caa78b1f..92b5e55023c 100644 --- a/Bio/trie.c +++ b/Bio/trie.c @@ -53,7 +53,7 @@ static char* duplicate(const char* s) { static char* shared_suffix(const char* s, int length) { if (length == 1) { - return (char *)(_single_char_suffixes + ((int)s[0] * 2)); + return (char *)(_single_char_suffixes + ((unsigned char)s[0] * 2)); } char *t = malloc(length + 1); if (!t) return NULL;