-
Notifications
You must be signed in to change notification settings - Fork 67
/
LandauVishkin.h
110 lines (83 loc) · 3.69 KB
/
LandauVishkin.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#pragma once
#include "Compat.h"
#include "FixedSizeMap.h"
#include "BigAlloc.h"
const int MAX_K = 31;
// Computes the edit distance between two strings without returning the edits themselves.
class LandauVishkin {
public:
LandauVishkin(int cacheSize = 0);
~LandauVishkin();
// Compute the edit distance between two strings, if it is <= k, or return -1 otherwise.
// For LandauVishkin instances with a cache, the cacheKey should be a unique identifier for
// the text and pattern combination (e.g. (readID << 33) | isRC << 32 | genomeLocation).
int computeEditDistance(
const char* text,
int textLen,
const char* pattern,
const char *qualityString,
int patternLen,
int k,
double *matchProbability,
_uint64 cacheKey = 0);
// Version that does not requre match probability and quality string
inline int computeEditDistance(
const char* text,
int textLen,
const char* pattern,
int patternLen,
int k)
{
return computeEditDistance(text, textLen, pattern, NULL, patternLen, k, NULL);
}
// Clear the cache of distances computed.
void clearCache();
void *operator new(size_t size) {return BigAlloc(size);}
void operator delete(void *ptr) {BigDealloc(ptr);}
static void setProbabilities(double *i_indelProbabilities, double *i_phredToProbability, double mutationProbability);
static void initializeProbabilitiesToPhredPlus33();
private:
// TODO: For long reads, we should include a version that only has L be 2 x (2*MAX_K+1) cells
int L[MAX_K+1][2 * MAX_K + 1];
// Action we did to get to each position: 'D' = deletion, 'I' = insertion, 'X' = substitution. This is needed to compute match probability.
char A[MAX_K+1][2 * MAX_K + 1];
// Arrays for backtracing the actions required to match two strings
char backtraceAction[MAX_K+1];
int backtraceMatched[MAX_K+1];
int backtraceD[MAX_K+1];
struct LVResult {
int k;
int result;
double matchProbability;
LVResult() { k = -1; result = -1; }
LVResult(int k_, int result_, double matchProbability_) { k = k_; result = result_; matchProbability = matchProbability_;}
inline bool isValid() { return k != -1; }
};
FixedSizeMap<_uint64, LVResult> *cache;
static double *indelProbabilities; // Maps indels by length to probability of occurance.
static double *phredToProbability; // Maps ASCII phred character to probability of error, including
static double *perfectMatchProbability; // Probability that a read of this length has no mutations
};
// Computes the edit distance between two strings and returns a CIGAR string for the edits.
enum CigarFormat
{
COMPACT_CIGAR_STRING = 0,
EXPANDED_CIGAR_STRING = 1,
COMPACT_CIGAR_BINARY = 2,
};
class LandauVishkinWithCigar {
public:
LandauVishkinWithCigar();
// Compute the edit distance between two strings and write the CIGAR string in cigarBuf.
// Returns -1 if the edit distance exceeds k or -2 if we run out of space in cigarBuf.
int computeEditDistance(const char* text, int textLen, const char* pattern, int patternLen, int k,
char* cigarBuf, int cigarBufLen, bool useM, CigarFormat format = COMPACT_CIGAR_STRING);
private:
int L[MAX_K+1][2 * MAX_K + 1];
// Action we did to get to each position: 'D' = deletion, 'I' = insertion, 'X' = substitution.
char A[MAX_K+1][2 * MAX_K + 1];
// Arrays for backtracing the actions required to match two strings
char backtraceAction[MAX_K+1];
int backtraceMatched[MAX_K+1];
int backtraceD[MAX_K+1];
};