-
Notifications
You must be signed in to change notification settings - Fork 0
/
KMP.cs
179 lines (147 loc) · 7.05 KB
/
KMP.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
using System.Collections.Generic;
using System.Text;
namespace PsUtils {
public class KMP {
#region Helpers
/// <summary>
/// Creates a lookup for partial matches, also known as a failure function.
/// </summary>
/// <param name="pattern">The pattern for which to create the table.</param>
/// <returns>Array of integers representing the failure function for each byte in <paramref name="pattern"/>.</returns>
public static int[] NewFailureFunction(byte[] pattern) {
int[] table = new int[pattern.Length];
table[0] = -1;
if (pattern.Length == 1)
return table;
table[1] = 0;
int pos = 2;
int nextChar = 0;
while (pos < pattern.Length) {
if (pattern[pos - 1] == pattern[nextChar]) {
table[pos] = nextChar + 1;
++nextChar;
++pos;
} else if (nextChar > 0) {
nextChar = table[nextChar];
} else {
table[pos] = 0;
++pos;
}
}
return table;
}
#endregion
#region First occurence
/// <summary>
/// Uses the Knuth-Morris-Pratt algorithm to find the first instance of pattern in input.
/// </summary>
/// <param name="input">Byte array to search.</param>
/// <param name="pattern">Byte array to find in <paramref name="input"/>.</param>
/// <returns>The index of the start of the match in <paramref name="input"/>. -1 if nothing found.</returns>
public static int Find(byte[] input, byte[] pattern) {
int[] table = NewFailureFunction(pattern);
int posMatch = 0;
int posChar = 0;
while (posMatch + posChar < input.Length) {
if (pattern[posChar] == input[posMatch + posChar]) {
if (posChar == pattern.Length - 1) {
return posMatch; // Found first instance
} else {
++posChar;
}
} else {
if (table[posChar] > -1) {
posMatch += posChar - table[posChar];
posChar = table[posChar];
} else {
++posMatch;
posChar = 0;
}
}
}
return -1;
}
/// <summary>
/// Uses the Knuth-Morris-Pratt algorithm to find the first instance of pattern in input. Uses a default encoding of <see cref="Encoding.UTF8"/> to convert the strings to byte arrays.
/// </summary>
/// <param name="input">String to search.</param>
/// <param name="pattern">String to find in <paramref name="input"/>.</param>
/// <returns>The index of the start of the match in <paramref name="input"/>. -1 if nothing found.</returns>
public static int Find(string input, string pattern) {
return Find(input, pattern, Encoding.UTF8);
}
/// <summary>
/// Uses the Knuth-Morris-Pratt algorithm to find the first instance of pattern in input.
/// </summary>
/// <param name="input">String to search.</param>
/// <param name="pattern">String to find in <paramref name="input"/>.</param>
/// <param name="encoding">The encoding to use to convert the input and pattern into a byte array.</param>
/// <returns>The index of the start of the match in <paramref name="input"/>. -1 if nothing found.</returns>
public static int Find(string input, string pattern, Encoding encoding) {
var div = encoding.GetByteCount(pattern) / pattern.Length;
var found = Find(encoding.GetBytes(input), encoding.GetBytes(pattern));
return found / div;
}
#endregion
#region All occurences
/// <summary>
/// Uses the Knuth-Morris-Pratt algorithm to find all occurences of pattern in input.
/// </summary>
/// <remarks>Does not find overlapping patterns.</remarks>
/// <param name="input">Byte array to search.</param>
/// <param name="pattern">Byte array to find in <paramref name="input"/>.</param>
/// <returns>Array of integers with each integer corresponding to the start of a found <paramref name="pattern"/>.</returns>
public static int[] FindAll(byte[] input, byte[] pattern) {
List<int> found = new List<int>();
int[] table = NewFailureFunction(pattern);
int posMatch = 0;
int posChar = 0;
while (posMatch + posChar < input.Length) {
if (pattern[posChar] == input[posMatch + posChar]) {
if (posChar == pattern.Length - 1) {
found.Add(posMatch);
posMatch += posChar + 1; // Skip ahead to end of match.
posChar = 0;
} else {
++posChar;
}
} else {
if (table[posChar] > -1) {
posMatch += posChar - table[posChar];
posChar = table[posChar];
} else {
++posMatch;
posChar = 0;
}
}
}
return found.ToArray();
}
/// <summary>
/// Uses the Knuth-Morris-Pratt algorithm to find all occurences of pattern in input. Uses a default encoding of <see cref="Encoding.UTF8"/> to convert the strings to byte arrays.
/// </summary>
/// <param name="input">String to search.</param>
/// <param name="pattern">String to find in <paramref name="input"/>.</param>
/// <returns>Array of integers with each integer corresponding to the start of a found <paramref name="pattern"/>.</returns>
public static int[] FindAll(string input, string pattern) {
return FindAll(input, pattern, Encoding.UTF8);
}
/// <summary>
/// Uses the Knuth-Morris-Pratt algorithm to find all occurences of pattern in input.
/// </summary>
/// <param name="input">String to search.</param>
/// <param name="pattern">String to find in <paramref name="input"/>.</param>
/// <param name="encoding">The encoding to use to convert the input and pattern into a byte array.</param>
/// <returns>Array of integers with each integer corresponding to the start of a found <paramref name="pattern"/>.</returns>
public static int[] FindAll(string input, string pattern, Encoding encoding) {
var div = encoding.GetByteCount(pattern) / pattern.Length;
var found = FindAll(encoding.GetBytes(input), encoding.GetBytes(pattern));
if (div > 1) {
for (int i = 0; i < found.Length; i++)
found[i] /= div;
}
return found;
}
#endregion
}
}