Skip to content

Commit

Permalink
Switched from hashmap scoring matrix to array for speed
Browse files Browse the repository at this point in the history
  • Loading branch information
njbooher committed Jan 7, 2014
1 parent 2c42531 commit cbc2af3
Show file tree
Hide file tree
Showing 5 changed files with 389 additions and 221 deletions.
2 changes: 1 addition & 1 deletion bcutils
111 changes: 104 additions & 7 deletions cython_wrapper/talesf.pyx
@@ -1,36 +1,133 @@
from libc.stdlib cimport malloc, free, calloc
from libc.stdio cimport FILE, stdout, fopen, fclose
import re

# Callback type: takes a single void* and returns nothing. hashmap_delete and
# array_delete below accept one of these as their second argument.
ctypedef void (*valuefreefunc)(void *)

# C declarations taken from Hashmap.h. Only the signatures are visible here; the
# struct is declared opaque (`pass`), so its fields are never accessed from Cython.
cdef extern from "Hashmap.h":

ctypedef struct Hashmap:
pass

Hashmap *hashmap_new(int size)
int hashmap_add(Hashmap *hash, char *key, void *value)
# The Cython code in this file passes either NULL or free as the second argument.
void hashmap_delete(Hashmap *hash, valuefreefunc)
int hashmap_size(Hashmap *hash)
# get_best_score in this file treats a NULL result as "no entry for this key";
# presumably that is the library's contract -- confirm against Hashmap.h.
void *hashmap_get(Hashmap *hash, char *key)
# The char** returned here is released with free() by the caller in this file.
char **hashmap_keys(Hashmap *hash)

# C declarations taken from Array.h. Array is declared opaque (`pass`), so this
# module only ever handles it through the functions listed below.
cdef extern from "Array.h":

ctypedef struct Array:
pass

Array *array_new()
# Elements are stored as untyped void* values.
void array_add(Array *r, void *e)
void *array_get(Array *r, int index)
# The Cython code in this file calls this with NULL as the second argument.
void array_delete(Array *r, valuefreefunc)

# Helpers declared in bcutils.h. Their implementations are not visible here; the
# notes below describe only how this file uses them.
cdef extern from "bcutils.h":
# Returns a double*. This file stores the result as a hashmap value that is later
# released with free (through hashmap_delete).
double *double_array(double a, double c, double g, double t, double dummy)
# Called with an Array of RVD strings (char*) and a weight; returns a Hashmap.
Hashmap *get_diresidue_probabilities(Array *rvdseq, double w)
# Called with the Hashmap returned by get_diresidue_probabilities; the values of the
# result are read as double* rows elsewhere in this file.
Hashmap *convert_probabilities_to_scores(Hashmap *diresidue_probabilities)

cdef get_best_score(rvd_seq, Hashmap *rvdscores):
    """Return the sum, over every RVD in rvd_seq, of that RVD's lowest score.

    For each element of rvd_seq the matching double* row is looked up in
    rvdscores and the smallest of its first four entries is added to the
    running total (presumably one entry per nucleotide; confirm against
    bcutils). If any RVD has no row in rvdscores, -1.0 is returned at once.
    """
    cdef:
        int pos, base
        double total = 0.0
        double lowest = -1.0
        double *row

    for pos in range(len(rvd_seq)):
        row = <double*> hashmap_get(rvdscores, rvd_seq[pos])

        # Unknown RVD: give up and report the sentinel value.
        if row == NULL:
            return -1.0

        # Seed the minimum with slot 0, then scan the remaining three slots.
        lowest = row[0]
        for base in range(1, 4):
            if row[base] < lowest:
                lowest = row[base]

        total += lowest

    return total

# Entry point declared in talesf.h. It receives all of its options through a single
# Hashmap; ScoreTalesfTask in this file returns its int result to Python unchanged.
cdef extern from "talesf.h":
int run_talesf_task(Hashmap *kwargs)

# NOTE(review): this region appears to interleave the pre-commit and post-commit
# versions of ScoreTalesfTask: there are two `def` lines, two declarations of
# `weight`, and a use of `rvdstring`, which only the first signature defines.
# Confirm which lines are live before changing the code itself.
def ScoreTalesfTask(char *seqfilename, char* rvdstring, char *output_filepath, char *log_filepath, bint forwardonly, int c_upstream, double cutoff, int numprocs, char *organism_name):

cdef Hashmap *talesf_kwargs = hashmap_new(32)
# Builds the scoring tables for rvd_string, packs every option into a Hashmap, passes
# it to run_talesf_task and returns that call's int result.
def ScoreTalesfTask(char *seqfilename, rvd_string, char *output_filepath, char *log_filepath, bint forwardonly, int c_upstream, double cutoff, int numprocs, char *organism_name):

cdef:
int i, j
double weight = 0.9

Hashmap *talesf_kwargs = hashmap_new(32)

# Written into slot 4 of every score row further down.
int BIGGEST_RVD_SCORE_EVER = 100

Array *rvd_array = array_new()

char *rvd_str

# char* view of the Python string argument; presumably borrowed from the Python
# object rather than copied (standard Cython str -> char* coercion) -- confirm.
char *rvd_string_ptr = rvd_string

cdef double weight = 0.9
# RVDs in the input are separated by a space or an underscore.
split_rvd_string = re.split(' |_', rvd_string)

# Each char* stored in rvd_array comes from an element of split_rvd_string, so
# presumably that list has to stay alive for as long as rvd_array is used.
for rvd in split_rvd_string:
rvd_str = rvd
array_add(rvd_array, rvd_str)

cdef:

Hashmap *diresidue_probabilities = get_diresidue_probabilities(rvd_array, weight)
Hashmap *diresidue_scores = convert_probabilities_to_scores(diresidue_probabilities)

# The probabilities map is discarded right after conversion; NULL is passed as the
# value-free callback.
hashmap_delete(diresidue_probabilities, NULL)
# Extra "XX" entry built from four zeros plus BIGGEST_RVD_SCORE_EVER as the fifth
# argument (presumably these map to slots 0-4 of the row -- confirm in bcutils).
hashmap_add(diresidue_scores, "XX", double_array(0, 0, 0, 0, BIGGEST_RVD_SCORE_EVER))

cdef:
# One row pointer per entry of diresidue_scores. The rows themselves are the
# hashmap's values and are not copied.
double **scoring_matrix = <double**> calloc(hashmap_size(diresidue_scores), sizeof(double*))

# One row index per RVD of the input sequence.
unsigned int *rvd_seq = <unsigned int*> calloc(len(split_rvd_string), sizeof(unsigned int))

unsigned int rvd_seq_len = len(split_rvd_string)

# NOTE(review): get_best_score returns -1.0 when an RVD is unknown; that value is not
# checked here.
double best_score = get_best_score(split_rvd_string, diresidue_scores)

cdef char **diresidues = hashmap_keys(diresidue_scores)

# Maps each key of diresidue_scores to its row index in scoring_matrix.
rvd_to_int = {}

for i in range(hashmap_size(diresidue_scores)):
rvd_to_int[diresidues[i]] = i
scoring_matrix[i] = <double*> hashmap_get(diresidue_scores, diresidues[i])
scoring_matrix[i][4] = BIGGEST_RVD_SCORE_EVER

# Translate the RVD sequence into row indices.
# NOTE(review): an RVD missing from rvd_to_int raises KeyError here, before any of the
# free/delete calls at the end of the function run.
for i in range(rvd_seq_len):
rvd_seq[i] = rvd_to_int[split_rvd_string[i]]

# Every value is stored as a raw pointer. The scalar options are passed as addresses of
# this function's own variables, so the map is presumably only meaningful for the
# duration of this call.
hashmap_add(talesf_kwargs, "seq_filename", seqfilename)
# NOTE(review): "rvd_string" is added here with `rvdstring` and again three lines below
# with `rvd_string_ptr`; one of the two is probably a removed diff line.
hashmap_add(talesf_kwargs, "rvd_string", rvdstring)
hashmap_add(talesf_kwargs, "rvd_seq", rvd_seq)
hashmap_add(talesf_kwargs, "rvd_seq_len", &rvd_seq_len)
hashmap_add(talesf_kwargs, "rvd_string", rvd_string_ptr)
hashmap_add(talesf_kwargs, "best_score", &best_score)
hashmap_add(talesf_kwargs, "scoring_matrix", scoring_matrix)
hashmap_add(talesf_kwargs, "output_filepath", output_filepath)
hashmap_add(talesf_kwargs, "log_filepath", log_filepath)
hashmap_add(talesf_kwargs, "weight", &weight)
hashmap_add(talesf_kwargs, "cutoff", &cutoff)
hashmap_add(talesf_kwargs, "forward_only", &forwardonly)
hashmap_add(talesf_kwargs, "c_upstream", &c_upstream)
hashmap_add(talesf_kwargs, "num_procs", &numprocs)
hashmap_add(talesf_kwargs, "organism_name", organism_name)

# NOTE(review): "forward_only" was already added above with the same value.
hashmap_add(talesf_kwargs, "forward_only", &forwardonly)

cdef int task_result = run_talesf_task(talesf_kwargs)

# Only the array of row pointers is freed here; the rows are released together with
# diresidue_scores below, which is deleted with free as its value callback.
free(scoring_matrix)
free(diresidues)
# NULL callback: the elements were taken from Python strings, not allocated here.
array_delete(rvd_array, NULL)
hashmap_delete(diresidue_scores, free)
free(rvd_seq)

# NULL callback: none of the stored option values are freed through the map.
hashmap_delete(talesf_kwargs, NULL)

return task_result
89 changes: 86 additions & 3 deletions frontend.c
@@ -1,8 +1,14 @@
#include <getopt.h>
#include <stdio.h>
#include <omp.h>

#include "talesf.h"

#include <bcutils/Hashmap.h>
#include <bcutils/Array.h>
#include <bcutils/bcutils.h>

#define BIGGEST_RVD_SCORE_EVER 100

// Print usage statement
void print_usage(FILE *out_stream, char *prog_name)
Expand Down Expand Up @@ -93,6 +99,11 @@ int main(int argc, char **argv)
fprintf(stderr, "Error: unable to convert numprocs '%s' to an integer\n", optarg);
return 1;
}
if( num_procs > omp_get_num_procs())
{
fprintf(stderr, "Error: numprocs was %d but only %d are available\n", num_procs, omp_get_num_procs());
return 1;
}
break;

case 'o':
Expand Down Expand Up @@ -127,23 +138,95 @@ int main(int argc, char **argv)

seq_filepath = argv[optind];
rvd_string = argv[optind + 1];

Hashmap *talesf_kwargs = hashmap_new(32);


Array *rvd_array = rvd_string_to_array(rvd_string);

// Get RVD/bp matching scores

Hashmap *diresidue_probabilities = get_diresidue_probabilities(rvd_array, weight);
Hashmap *diresidue_scores = convert_probabilities_to_scores(diresidue_probabilities);
hashmap_delete(diresidue_probabilities, NULL);

// Convert hashmap to int map

hashmap_add(diresidue_scores, "XX", double_array(0, 0, 0, 0, BIGGEST_RVD_SCORE_EVER));

double **scoring_matrix = calloc(hashmap_size(diresidue_scores), sizeof(double*));

Hashmap *rvd_to_int = hashmap_new(hashmap_size(diresidue_scores));
unsigned int *rvd_ints = calloc(hashmap_size(diresidue_scores), sizeof(unsigned int));

char **diresidues = hashmap_keys(diresidue_scores);

for (unsigned int i = 0; i < hashmap_size(diresidue_scores); i++) {

rvd_ints[i] = i;
hashmap_add(rvd_to_int, diresidues[i], rvd_ints + i);

scoring_matrix[i] = hashmap_get(diresidue_scores, diresidues[i]);
scoring_matrix[i][4] = BIGGEST_RVD_SCORE_EVER;

}

unsigned int *rvd_seq = (unsigned int*) calloc(array_size(rvd_array), sizeof(unsigned int));

for (unsigned int i = 0; i < array_size(rvd_array); i++) {
rvd_seq[i] = *(unsigned int *)(hashmap_get(rvd_to_int, array_get(rvd_array, i)));
}

unsigned int rvd_seq_len = array_size(rvd_array);

double best_score = get_best_score(rvd_array, diresidue_scores);

hashmap_add(talesf_kwargs, "seq_filename", seq_filepath);
hashmap_add(talesf_kwargs, "rvd_seq", rvd_seq);
hashmap_add(talesf_kwargs, "rvd_seq_len", &rvd_seq_len);
hashmap_add(talesf_kwargs, "rvd_string", rvd_string);
hashmap_add(talesf_kwargs, "best_score", &best_score);
hashmap_add(talesf_kwargs, "scoring_matrix", scoring_matrix);
hashmap_add(talesf_kwargs, "output_filepath", out_filepath);
hashmap_add(talesf_kwargs, "log_filepath", log_filepath);
hashmap_add(talesf_kwargs, "weight", &weight);
hashmap_add(talesf_kwargs, "cutoff", &cutoff);
hashmap_add(talesf_kwargs, "forward_only", &forward_only);
hashmap_add(talesf_kwargs, "c_upstream", &c_upstream);
hashmap_add(talesf_kwargs, "num_procs", &num_procs);
hashmap_add(talesf_kwargs, "organism_name", "");

hashmap_add(talesf_kwargs, "forward_only", &forward_only);

int task_result = run_talesf_task(talesf_kwargs);

hashmap_delete(talesf_kwargs, NULL);

if (rvd_seq) {
free(rvd_seq);
}

if (scoring_matrix) {
free(scoring_matrix);
}

if (rvd_to_int) {
hashmap_delete(rvd_to_int, NULL);
}

if (rvd_ints) {
free(rvd_ints);
}

if (diresidues) {
free(diresidues);
}

if (rvd_array) {
array_delete(rvd_array, free);
}

if (diresidue_scores) {
hashmap_delete(diresidue_scores, free);
}

return task_result;

Expand Down
4 changes: 2 additions & 2 deletions makefile
Expand Up @@ -2,10 +2,10 @@ LIB = libtalesf.so
PROG = talesf

default:
gcc -g -O3 -Wall -m64 -o $(LIB) talesf.c -lbcutils -lm -lz -fopenmp -fPIC -shared -rdynamic
gcc -fmax-errors=1 -std=gnu99 -g -O3 -Wall -m64 -o $(LIB) talesf.c -lbcutils -lm -lz -fopenmp -fPIC -shared -rdynamic

frontend:
gcc -g -O3 -Wall -m64 -I /usr/include/talesf -o $(PROG) frontend.c -lbcutils -ltalesf
gcc -fmax-errors=1 -std=gnu99 -g -O3 -Wall -m64 -I /usr/include/talesf -o $(PROG) frontend.c -lbcutils -ltalesf -fopenmp

clean:
rm -f *.o *~ $(LIB)
Expand Down

0 comments on commit cbc2af3

Please sign in to comment.