Skip to content
This repository
Newer
Older
100644 424 lines (393 sloc) 10.825 kb
62cd1a28 »
2011-01-13 Added the khmm library
1 #include <math.h>
2 #include <stdio.h>
3 #include <assert.h>
4 #include <string.h>
5 #include <stdlib.h>
6 #include "khmm.h"
7
8 // new/delete hmm_par_t
9
10 hmm_par_t *hmm_new_par(int m, int n)
11 {
12 hmm_par_t *hp;
13 int i;
14 assert(m > 0 && n > 0);
15 hp = (hmm_par_t*)calloc(1, sizeof(hmm_par_t));
16 hp->m = m; hp->n = n;
17 hp->a0 = (FLOAT*)calloc(n, sizeof(FLOAT));
18 hp->a = (FLOAT**)calloc2(n, n, sizeof(FLOAT));
19 hp->e = (FLOAT**)calloc2(m + 1, n, sizeof(FLOAT));
20 hp->ae = (FLOAT**)calloc2((m + 1) * n, n, sizeof(FLOAT));
21 for (i = 0; i != n; ++i) hp->e[m][i] = 1.0;
22 return hp;
23 }
24 void hmm_delete_par(hmm_par_t *hp)
25 {
26 int i;
27 if (hp == 0) return;
28 for (i = 0; i != hp->n; ++i) free(hp->a[i]);
29 for (i = 0; i <= hp->m; ++i) free(hp->e[i]);
30 for (i = 0; i < (hp->m + 1) * hp->n; ++i) free(hp->ae[i]);
31 free(hp->a); free(hp->e); free(hp->a0); free(hp->ae);
32 free(hp);
33 }
34
35 // new/delete hmm_data_t
36
37 hmm_data_t *hmm_new_data(int L, const char *seq, const hmm_par_t *hp)
38 {
39 hmm_data_t *hd;
40 hd = (hmm_data_t*)calloc(1, sizeof(hmm_data_t));
41 hd->L = L;
42 hd->seq = (char*)malloc(L + 1);
43 memcpy(hd->seq + 1, seq, L);
44 return hd;
45 }
46 void hmm_delete_data(hmm_data_t *hd)
47 {
48 int i;
49 if (hd == 0) return;
50 for (i = 0; i <= hd->L; ++i) {
51 if (hd->f) free(hd->f[i]);
52 if (hd->b) free(hd->b[i]);
53 }
54 free(hd->f); free(hd->b); free(hd->s); free(hd->v); free(hd->p); free(hd->seq);
55 free(hd);
56 }
57
58 // new/delete hmm_exp_t
59
60 hmm_exp_t *hmm_new_exp(const hmm_par_t *hp)
61 {
62 hmm_exp_t *he;
63 assert(hp);
64 he = (hmm_exp_t*)calloc(1, sizeof(hmm_exp_t));
65 he->m = hp->m; he->n = hp->n;
66 he->A0 = (FLOAT*)calloc(hp->n, sizeof(FLOAT));
67 he->A = (FLOAT**)calloc2(hp->n, hp->n, sizeof(FLOAT));
68 he->E = (FLOAT**)calloc2(hp->m + 1, hp->n, sizeof(FLOAT));
69 return he;
70 }
71 void hmm_delete_exp(hmm_exp_t *he)
72 {
73 int i;
74 if (he == 0) return;
75 for (i = 0; i != he->n; ++i) free(he->A[i]);
76 for (i = 0; i <= he->m; ++i) free(he->E[i]);
77 free(he->A); free(he->E); free(he->A0);
78 free(he);
79 }
80
81 // Viterbi algorithm
82
83 FLOAT hmm_Viterbi(const hmm_par_t *hp, hmm_data_t *hd)
84 {
85 FLOAT **la, **le, *preV, *curV, max;
86 int **Vmax, max_l; // backtrace matrix
87 int k, l, b, u;
88
89 if (hd->v) free(hd->v);
90 hd->v = (int*)calloc(hd->L+1, sizeof(int));
91 la = (FLOAT**)calloc2(hp->n, hp->n, sizeof(FLOAT));
92 le = (FLOAT**)calloc2(hp->m + 1, hp->n, sizeof(FLOAT));
93 Vmax = (int**)calloc2(hd->L+1, hp->n, sizeof(int));
94 preV = (FLOAT*)malloc(sizeof(FLOAT) * hp->n);
95 curV = (FLOAT*)malloc(sizeof(FLOAT) * hp->n);
96 for (k = 0; k != hp->n; ++k)
97 for (l = 0; l != hp->n; ++l)
98 la[k][l] = log(hp->a[l][k]); // this is not a bug
99 for (b = 0; b != hp->m; ++b)
100 for (k = 0; k != hp->n; ++k)
101 le[b][k] = log(hp->e[b][k]);
102 for (k = 0; k != hp->n; ++k) le[hp->m][k] = 0.0;
103 // V_k(1)
104 for (k = 0; k != hp->n; ++k) {
105 preV[k] = le[(int)hd->seq[1]][k] + log(hp->a0[k]);
106 Vmax[1][k] = 0;
107 }
108 // all the rest
109 for (u = 2; u <= hd->L; ++u) {
110 FLOAT *tmp, *leu = le[(int)hd->seq[u]];
111 for (k = 0; k != hp->n; ++k) {
112 FLOAT *laa = la[k];
113 for (l = 0, max = -HMM_INF, max_l = -1; l != hp->n; ++l) {
114 if (max < preV[l] + laa[l]) {
115 max = preV[l] + laa[l];
116 max_l = l;
117 }
118 }
119 assert(max_l >= 0); // cannot be zero
120 curV[k] = leu[k] + max;
121 Vmax[u][k] = max_l;
122 }
123 tmp = curV; curV = preV; preV = tmp; // swap
124 }
125 // backtrace
126 for (k = 0, max_l = -1, max = -HMM_INF; k != hp->n; ++k) {
127 if (max < preV[k]) {
128 max = preV[k]; max_l = k;
129 }
130 }
131 assert(max_l >= 0); // cannot be zero
132 hd->v[hd->L] = max_l;
133 for (u = hd->L; u >= 1; --u)
134 hd->v[u-1] = Vmax[u][hd->v[u]];
135 for (k = 0; k != hp->n; ++k) free(la[k]);
136 for (b = 0; b < hp->m; ++b) free(le[b]);
137 for (u = 0; u <= hd->L; ++u) free(Vmax[u]);
138 free(la); free(le); free(Vmax); free(preV); free(curV);
139 hd->status |= HMM_VITERBI;
140 return max;
141 }
142
143 // forward algorithm
144
145 void hmm_forward(const hmm_par_t *hp, hmm_data_t *hd)
146 {
147 FLOAT sum, tmp, **at;
148 int u, k, l;
149 int n, m, L;
150 assert(hp && hd);
151 // allocate memory for hd->f and hd->s
152 n = hp->n; m = hp->m; L = hd->L;
153 if (hd->s) free(hd->s);
154 if (hd->f) {
155 for (k = 0; k <= hd->L; ++k) free(hd->f[k]);
156 free(hd->f);
157 }
158 hd->f = (FLOAT**)calloc2(hd->L+1, hp->n, sizeof(FLOAT));
159 hd->s = (FLOAT*)calloc(hd->L+1, sizeof(FLOAT));
160 hd->status &= ~(unsigned)HMM_FORWARD;
161 // at[][] array helps to improve the cache efficiency
162 at = (FLOAT**)calloc2(n, n, sizeof(FLOAT));
163 // transpose a[][]
164 for (k = 0; k != n; ++k)
165 for (l = 0; l != n; ++l)
166 at[k][l] = hp->a[l][k];
167 // f[0], but it should never be used
168 hd->s[0] = 1.0;
169 for (k = 0; k != n; ++k) hd->f[0][k] = 0.0;
170 // f[1]
171 for (k = 0, sum = 0.0; k != n; ++k)
172 sum += (hd->f[1][k] = hp->a0[k] * hp->e[(int)hd->seq[1]][k]);
173 for (k = 0; k != n; ++k) hd->f[1][k] /= sum;
174 hd->s[1] = sum;
175 // f[2..hmmL], the core loop
176 for (u = 2; u <= L; ++u) {
177 FLOAT *fu = hd->f[u], *fu1 = hd->f[u-1], *eu = hp->e[(int)hd->seq[u]];
178 for (k = 0, sum = 0.0; k != n; ++k) {
179 FLOAT *aa = at[k];
180 for (l = 0, tmp = 0.0; l != n; ++l) tmp += fu1[l] * aa[l];
181 sum += (fu[k] = eu[k] * tmp);
182 }
183 for (k = 0; k != n; ++k) fu[k] /= sum;
184 hd->s[u] = sum;
185 }
186 // free at array
187 for (k = 0; k != hp->n; ++k) free(at[k]);
188 free(at);
189 hd->status |= HMM_FORWARD;
190 }
191
192 // precalculate hp->ae
193
194 void hmm_pre_backward(hmm_par_t *hp)
195 {
196 int m, n, b, k, l;
197 assert(hp);
198 m = hp->m; n = hp->n;
199 for (b = 0; b <= m; ++b) {
200 for (k = 0; k != n; ++k) {
201 FLOAT *p = hp->ae[b * hp->n + k];
202 for (l = 0; l != n; ++l)
203 p[l] = hp->e[b][l] * hp->a[k][l];
204 }
205 }
206 }
207
208 // backward algorithm
209
210 void hmm_backward(const hmm_par_t *hp, hmm_data_t *hd)
211 {
212 FLOAT tmp;
213 int k, l, u;
214 int m, n, L;
215 assert(hp && hd);
216 assert(hd->status & HMM_FORWARD);
217 // allocate memory for hd->b
218 m = hp->m; n = hp->n; L = hd->L;
219 if (hd->b) {
220 for (k = 0; k <= hd->L; ++k) free(hd->b[k]);
221 free(hd->b);
222 }
223 hd->status &= ~(unsigned)HMM_BACKWARD;
224 hd->b = (FLOAT**)calloc2(L+1, hp->n, sizeof(FLOAT));
225 // b[L]
226 for (k = 0; k != hp->n; ++k) hd->b[L][k] = 1.0 / hd->s[L];
227 // b[1..L-1], the core loop
228 for (u = L-1; u >= 1; --u) {
229 FLOAT *bu1 = hd->b[u+1], **p = hp->ae + (int)hd->seq[u+1] * n;
230 for (k = 0; k != n; ++k) {
231 FLOAT *q = p[k];
232 for (l = 0, tmp = 0.0; l != n; ++l) tmp += q[l] * bu1[l];
233 hd->b[u][k] = tmp / hd->s[u];
234 }
235 }
236 hd->status |= HMM_BACKWARD;
237 for (l = 0, tmp = 0.0; l != n; ++l)
238 tmp += hp->a0[l] * hd->b[1][l] * hp->e[(int)hd->seq[1]][l];
239 if (tmp > 1.0 + 1e-6 || tmp < 1.0 - 1e-6) // in theory, tmp should always equal to 1
240 fprintf(stderr, "++ Underflow may have happened (%lg).\n", tmp);
241 }
242
243 // log-likelihood of the observation
244
245 FLOAT hmm_lk(const hmm_data_t *hd)
246 {
247 FLOAT sum = 0.0, prod = 1.0;
248 int u, L;
249 L = hd->L;
250 assert(hd->status & HMM_FORWARD);
251 for (u = 1; u <= L; ++u) {
252 prod *= hd->s[u];
253 if (prod < HMM_TINY || prod >= 1.0/HMM_TINY) { // reset
254 sum += log(prod);
255 prod = 1.0;
256 }
257 }
258 sum += log(prod);
259 return sum;
260 }
261
262 // posterior decoding
263
264 void hmm_post_decode(const hmm_par_t *hp, hmm_data_t *hd)
265 {
266 int u, k;
267 assert(hd->status && HMM_BACKWARD);
268 if (hd->p) free(hd->p);
269 hd->p = (int*)calloc(hd->L + 1, sizeof(int));
270 for (u = 1; u <= hd->L; ++u) {
271 FLOAT prob, max, *fu = hd->f[u], *bu = hd->b[u], su = hd->s[u];
272 int max_k;
273 for (k = 0, max = -1.0, max_k = -1; k != hp->n; ++k) {
274 if (max < (prob = fu[k] * bu[k] * su)) {
275 max = prob; max_k = k;
276 }
277 }
278 assert(max_k >= 0);
279 hd->p[u] = max_k;
280 }
281 hd->status |= HMM_POSTDEC;
282 }
283
284 // posterior probability of states
285
286 FLOAT hmm_post_state(const hmm_par_t *hp, const hmm_data_t *hd, int u, FLOAT *prob)
287 {
288 FLOAT sum = 0.0, ss = hd->s[u], *fu = hd->f[u], *bu = hd->b[u];
289 int k;
290 for (k = 0; k != hp->n; ++k)
291 sum += (prob[k] = fu[k] * bu[k] * ss);
292 return sum; // in theory, this should always equal to 1.0
293 }
294
295 // expected counts
296
297 hmm_exp_t *hmm_expect(const hmm_par_t *hp, const hmm_data_t *hd)
298 {
299 int k, l, u, b, m, n;
300 hmm_exp_t *he;
301 assert(hd->status & HMM_BACKWARD);
302 he = hmm_new_exp(hp);
303 // initialization
304 m = hp->m; n = hp->n;
305 for (k = 0; k != n; ++k)
306 for (l = 0; l != n; ++l) he->A[k][l] = HMM_TINY;
307 for (b = 0; b <= m; ++b)
308 for (l = 0; l != n; ++l) he->E[b][l] = HMM_TINY;
309 // calculate A_{kl} and E_k(b), k,l\in[0,n)
310 for (u = 1; u < hd->L; ++u) {
311 FLOAT *fu = hd->f[u], *bu = hd->b[u], *bu1 = hd->b[u+1], ss = hd->s[u];
312 FLOAT *Ec = he->E[(int)hd->seq[u]], **p = hp->ae + (int)hd->seq[u+1] * n;
313 for (k = 0; k != n; ++k) {
314 FLOAT *q = p[k], *AA = he->A[k], fuk = fu[k];
315 for (l = 0; l != n; ++l) // this is cache-efficient
316 AA[l] += fuk * q[l] * bu1[l];
317 Ec[k] += fuk * bu[k] * ss;
318 }
319 }
320 // calculate A0_l
321 for (l = 0; l != n; ++l)
322 he->A0[l] += hp->a0[l] * hp->e[(int)hd->seq[1]][l] * hd->b[1][l];
323 return he;
324 }
325
326 FLOAT hmm_Q0(const hmm_par_t *hp, hmm_exp_t *he)
327 {
328 int k, l, b;
329 FLOAT sum = 0.0;
330 for (k = 0; k != hp->n; ++k) {
331 FLOAT tmp;
332 for (b = 0, tmp = 0.0; b != hp->m; ++b) tmp += he->E[b][k];
333 for (b = 0; b != hp->m; ++b)
334 sum += he->E[b][k] * log(he->E[b][k] / tmp);
335 }
336 for (k = 0; k != hp->n; ++k) {
337 FLOAT tmp, *A = he->A[k];
338 for (l = 0, tmp = 0.0; l != hp->n; ++l) tmp += A[l];
339 for (l = 0; l != hp->n; ++l) sum += A[l] * log(A[l] / tmp);
340 }
341 return (he->Q0 = sum);
342 }
343
344 // add he0 to he1
345
346 void hmm_add_expect(const hmm_exp_t *he0, hmm_exp_t *he1)
347 {
348 int b, k, l;
349 assert(he0->m == he1->m && he0->n == he1->n);
350 for (k = 0; k != he1->n; ++k) {
351 he1->A0[k] += he0->A0[k];
352 for (l = 0; l != he1->n; ++l)
353 he1->A[k][l] += he0->A[k][l];
354 }
355 for (b = 0; b != he1->m; ++b) {
356 for (l = 0; l != he1->n; ++l)
357 he1->E[b][l] += he0->E[b][l];
358 }
359 }
360
361 // the EM-Q function
362
363 FLOAT hmm_Q(const hmm_par_t *hp, const hmm_exp_t *he)
364 {
365 FLOAT sum = 0.0;
366 int bb, k, l;
367 for (bb = 0; bb != he->m; ++bb) {
368 FLOAT *eb = hp->e[bb], *Eb = he->E[bb];
369 for (k = 0; k != hp->n; ++k) {
370 if (eb[k] <= 0.0) return -HMM_INF;
371 sum += Eb[k] * log(eb[k]);
372 }
373 }
374 for (k = 0; k != he->n; ++k) {
375 FLOAT *Ak = he->A[k], *ak = hp->a[k];
376 for (l = 0; l != he->n; ++l) {
377 if (ak[l] <= 0.0) return -HMM_INF;
378 sum += Ak[l] * log(ak[l]);
379 }
380 }
381 return (sum -= he->Q0);
382 }
383
384 // simulate sequence
385
386 char *hmm_simulate(const hmm_par_t *hp, int L)
387 {
388 int i, k, l, b;
389 FLOAT x, y, **et;
390 char *seq;
391 seq = (char*)calloc(L+1, 1);
392 // calculate the transpose of hp->e[][]
393 et = (FLOAT**)calloc2(hp->n, hp->m, sizeof(FLOAT));
394 for (k = 0; k != hp->n; ++k)
395 for (b = 0; b != hp->m; ++b)
396 et[k][b] = hp->e[b][k];
397 // the initial state, drawn from a0[]
398 x = drand48();
399 for (k = 0, y = 0.0; k != hp->n; ++k) {
400 y += hp->a0[k];
401 if (y >= x) break;
402 }
403 // main loop
404 for (i = 0; i != L; ++i) {
405 FLOAT *el, *ak = hp->a[k];
406 x = drand48();
407 for (l = 0, y = 0.0; l != hp->n; ++l) {
408 y += ak[l];
409 if (y >= x) break;
410 }
411 el = et[l];
412 x = drand48();
413 for (b = 0, y = 0.0; b != hp->m; ++b) {
414 y += el[b];
415 if (y >= x) break;
416 }
417 seq[i] = b;
418 k = l;
419 }
420 for (k = 0; k != hp->n; ++k) free(et[k]);
421 free(et);
422 return seq;
423 }
Something went wrong with that request. Please try again.