Skip to content

Commit

Permalink
Added black-box with partial scan
Browse files Browse the repository at this point in the history
  • Loading branch information
anuragkh committed Jul 23, 2015
1 parent 4a1cbf3 commit e18c1a2
Show file tree
Hide file tree
Showing 4 changed files with 234 additions and 27 deletions.
2 changes: 2 additions & 0 deletions include/succinct/SuccinctCore.hpp
Expand Up @@ -86,6 +86,8 @@ class SuccinctCore : public SuccinctBase {
// Get index of value v in C
uint64_t lookupC(uint64_t val);

char charAt(uint64_t i);

// Serialize succinct data structures
virtual size_t serialize();

Expand Down
251 changes: 226 additions & 25 deletions include/succinct/regex/RegEx.hpp
Expand Up @@ -10,11 +10,12 @@
#include "succinct/regex/executor/RegExExecutorBwd.hpp"
#include "succinct/regex/executor/RegExExecutorFwd.hpp"

#define BB_PARTIAL_SCAN
class SRegEx {
private:
typedef std::pair<size_t, size_t> OffLen;
typedef std::set<OffLen> RRes;
typedef RRes::iterator RResIt;
typedef std::pair<size_t, size_t> OffsetLength;
typedef std::set<OffsetLength> RegExResults;
typedef RegExResults::iterator RegExResultsIterator;

void explain_subexp(RegEx *re) {
switch(re->getType()) {
Expand Down Expand Up @@ -87,7 +88,7 @@ class SRegEx {
}
}

void parse() {
void get_subexpressions() {
// TODO: Right now this assumes we don't have nested ".*" operators
// It would be nice to allow .* anywhere.
std::string delimiter = ".*";
Expand All @@ -97,28 +98,26 @@ class SRegEx {

while ((pos = exp.find(delimiter)) != std::string::npos) {
subexp = exp.substr(0, pos);
RegExParser parser((char *)subexp.c_str());
subexps.push_back(parser.parse());
subexps.push_back(subexp);
exp.erase(0, pos + delimiter.length());
}

subexp = exp;
RegExParser parser((char *)subexp.c_str());
subexps.push_back(parser.parse());
subexps.push_back(subexp);
}

public:
// Builds a regex query over the given SuccinctCore.
//   exp    - the full regular expression text; a copy is stored in this->exp.
//   s_core - pointer stored as-is and later handed to the executors.
//   opt    - stored flag read by subquery(): when true the Bwd/Fwd executors
//            are used, when false the black-box executor path is used.
// After storing the arguments, the expression is split into its
// ".*"-separated sub-expressions.
SRegEx(std::string exp, SuccinctCore *s_core, bool opt = true) {
this->exp = exp;
this->s_core = s_core;
this->opt = opt;
parse();
get_subexpressions();
}

void execute() {
std::vector<RRes> subresults;
std::vector<RegExResults> subresults;
for(auto subexp: subexps) {
RRes subresult;
RegExResults subresult;
subquery(subresult, subexp);
subresults.push_back(subresult);
}
Expand All @@ -133,27 +132,30 @@ class SRegEx {

// Appends one subcount() value per entry of subexps to counts, in subexps
// order. Values already in counts are kept (push_back only).
void count(std::vector<size_t> &counts) {
for(auto subexp: subexps) {
counts.push_back(subcount(subexp));
// subexps holds sub-expression strings, so each one is parsed into a RegEx
// before it is handed to subcount().
RegExParser p((char *)subexp.c_str());
counts.push_back(subcount(p.parse()));
}
}

// Combines two (offset, length) match sets into matches that span both.
// Every match in left is paired with every match in right whose offset is at
// or after the end (offset + length) of the left match. Each pair yields one
// result that starts at the left offset and ends where the right match ends.
// The combined set is written back into right; left is not modified.
// NOTE(review): presumably called to join the results of consecutive
// ".*"-separated sub-expressions - the call site is not visible here.
void wildcard(RRes &left, RRes &right) {
RRes wildcard_res;
RResIt left_it, right_it;
void wildcard(RegExResults &left, RegExResults &right) {
RegExResults wildcard_res;
RegExResultsIterator left_it, right_it;
for(left_it = left.begin(); left_it != left.end(); left_it++) {
// Smallest possible pair that starts at the end of the left match; the
// ordered set's lower_bound then skips every right match starting earlier.
OffLen search_candidate(left_it->first + left_it->second, 0);
RResIt first_entry = right.lower_bound(search_candidate);
OffsetLength search_candidate(left_it->first + left_it->second, 0);
RegExResultsIterator first_entry = right.lower_bound(search_candidate);
for(right_it = first_entry; right_it != right.end(); right_it++) {
size_t offset = left_it->first;
// Distance from the left offset to the right offset, plus the right length.
size_t length = right_it->first - left_it->first + right_it->second;
wildcard_res.insert(OffLen(offset, length));
wildcard_res.insert(OffsetLength(offset, length));
}
}
right = wildcard_res;
}

void subquery(RRes &result, RegEx *r) {
void subquery(RegExResults &result, std::string sub_expression) {
if(opt) {
RegExParser p((char *)sub_expression.c_str());
RegEx *r = p.parse();
if(is_suffixed(r) || !is_prefixed(r)) {
RegExExecutorBwd executor(s_core, r);
executor.execute();
Expand All @@ -164,9 +166,207 @@ class SRegEx {
executor.getResults(result);
}
} else {
RegExExecutorBlackBox executor(s_core, r);
#ifdef BB_PARTIAL_SCAN
std::vector<std::string> sub_sub_expressions;
std::string sub_sub_expression = "";
size_t i = 0;
while (i < sub_expression.length()) {
if (sub_expression[i] == '[') {
if (sub_sub_expression != "") {
sub_sub_expressions.push_back(sub_sub_expression);
sub_sub_expression = "";
}
std::string range = "";
for (; sub_expression[i] != ']'; i++) {
if (sub_expression[i] == '-') {
for (char c = sub_expression[i - 1] + 1; c < sub_expression[i + 1];
c++) {
range += c;
}
i++;
}
range += sub_expression[i];
}
range += sub_expression[i++];
if (sub_expression[i] == '+' || sub_expression[i] == '*') {
range += sub_expression[i++];
}
sub_sub_expressions.push_back(range);
} else if (sub_expression[i] == '.') {
if (sub_sub_expression != "") {
sub_sub_expressions.push_back(sub_sub_expression);
sub_sub_expression = "";
}
sub_sub_expressions.push_back(".");
i++;
} else {
sub_sub_expression += sub_expression[i];
i++;
}
}

if (sub_sub_expression != "") {
sub_sub_expressions.push_back(sub_sub_expression);
}

// Sequentially go through the list of sub-sub-expressions
std::string last_token = "";
int32_t last_token_id = -1;
RegExResults last_results;
for (size_t i = 0; i < sub_sub_expressions.size(); i++) {
std::string ssexp = sub_sub_expressions[i];
if (ssexp[0] == '[' || ssexp[0] == '.') {
if(last_token_id == -1) {
continue;
}
RegExResults range_results;
if (ssexp == ".") {
for (RegExResultsIterator it = last_results.begin();
it != last_results.end(); it++) {
range_results.insert(OffsetLength(it->first, it->second + 1));
}
} else if(ssexp[ssexp.length() - 1] == '+') {
std::string range = ssexp.substr(1, ssexp.length() - 3);
for (RegExResultsIterator it = last_results.begin();
it != last_results.end(); it++) {
size_t start_pos = it->first + it->second - 1;
char c;
size_t len = 1;
while(true) {
c = s_core->charAt(start_pos + len);
if (range.find(c) != std::string::npos) {
range_results.insert(OffsetLength(it->first, it->second + len));
} else {
break;
}
len++;
}
}
} else if(ssexp[ssexp.length() - 1] == '+') {
std::string range = ssexp.substr(1, ssexp.length() - 3);
range_results.insert(last_results.begin(), last_results.end());
for (RegExResultsIterator it = last_results.begin();
it != last_results.end(); it++) {
size_t start_pos = it->first + it->second - 1;
char c;
size_t len = 1;
while(true) {
c = s_core->charAt(start_pos + len);
if (range.find(c) != std::string::npos) {
range_results.insert(OffsetLength(it->first, it->second + len));
} else {
break;
}
len++;
}
}
} else {
std::string range = ssexp.substr(1, ssexp.length() - 2);
for (RegExResultsIterator it = last_results.begin();
it != last_results.end(); it++) {
size_t cur_pos = it->first + it->second;
char c = s_core->charAt(cur_pos);
if (range.find(c) != std::string::npos) {
range_results.insert(OffsetLength(it->first, it->second + 1));
}
}
}
last_results = range_results;
} else {

bool backtrack = false;
if (last_token_id == -1) {
backtrack = true;
}

last_token_id = i;
last_token = ssexp;

RegExResults cur_results;
RegExParser p((char *) ssexp.c_str());
RegEx *r = p.parse();
RegExExecutorBlackBox executor(s_core, r);
executor.execute();
executor.getResults(cur_results);

if (backtrack) {
last_results = cur_results;
for (int32_t j = i - 1; j >= 0; j--) {
ssexp = sub_sub_expressions[j];
if (ssexp[ssexp.length() - 1] == '*')
continue;

RegExResults range_results;
if (ssexp == ".") {
for (RegExResultsIterator it = last_results.begin();
it != last_results.end(); it++) {
range_results.insert(
OffsetLength(it->first - 1, it->second + 1));
}
} else if(ssexp[ssexp.length() - 1] == '+') {
std::string range = ssexp.substr(1, ssexp.length() - 2);
for (RegExResultsIterator it = last_results.begin(); it != last_results.end(); it++) {
size_t cur_pos = it->first - 1;
char c = s_core->charAt(cur_pos);
if (range.find(c) != std::string::npos) {
range_results.insert(
OffsetLength(it->first - 1, it->second + 1));
}
}
} else if(ssexp[ssexp.length() - 1] == '*') {
std::string range = ssexp.substr(1, ssexp.length() - 2);
range_results.insert(last_results.begin(), last_results.end());
for (RegExResultsIterator it = last_results.begin(); it != last_results.end(); it++) {
size_t cur_pos = it->first - 1;
char c = s_core->charAt(cur_pos);
if (range.find(c) != std::string::npos) {
range_results.insert(
OffsetLength(it->first - 1, it->second + 1));
}
}
} else {
std::string range = ssexp.substr(1, ssexp.length() - 2);
for (RegExResultsIterator it = last_results.begin();
it != last_results.end(); it++) {
size_t cur_pos = it->first - 1;
char c = s_core->charAt(cur_pos);
if (range.find(c) != std::string::npos) {
range_results.insert(
OffsetLength(it->first - 1, it->second + 1));
}
}
}
last_results = range_results;
}
} else {
RegExResults concat_results;
RegExResultsIterator left_it, right_it;
for (left_it = last_results.begin(), right_it = cur_results.begin();
left_it != last_results.end() && right_it != cur_results.end();
left_it++) {
while (right_it != cur_results.end()
&& right_it->first < left_it->first + left_it->second)
right_it++;
if (right_it == cur_results.end())
break;

if (right_it->first == left_it->first + left_it->second) {
concat_results.insert(
OffsetLength(left_it->first,
left_it->second + right_it->second));
}
}
last_results = concat_results;
}
}
}
result = last_results;
#else
RegExParser p((char *)sub_expression.c_str());
RegExExecutorBlackBox executor(s_core, p.parse());
executor.execute();
executor.getResults(result);
#endif
}
}

Expand All @@ -183,7 +383,8 @@ class SRegEx {
// Writes a description of every sub-expression to stderr via
// explain_subexp(), with a "***" marker before the first one and after each.
void explain() {
fprintf(stderr, "***");
for(auto subexp: subexps) {
explain_subexp(subexp);
// subexps holds sub-expression strings, so each one is parsed into a RegEx
// before it is explained.
RegExParser p((char *)subexp.c_str());
explain_subexp(p.parse());
fprintf(stderr, "***");
}
}
Expand All @@ -192,7 +393,7 @@ class SRegEx {
if(limit <= 0)
limit = r_results.size();
limit = MIN(limit, r_results.size());
RResIt it;
RegExResultsIterator it;
size_t i;
fprintf(stderr, "Showing %zu of %zu results.\n", limit, r_results.size());
fprintf(stderr, "{");
Expand All @@ -202,17 +403,17 @@ class SRegEx {
fprintf(stderr, "...}\n");
}

// Copies the stored result set r_results into results, replacing whatever
// results held before.
void get_results(RRes &results) {
void get_results(RegExResults &results) {
results = r_results;
}

private:
std::string exp;
std::vector<RegEx *> subexps;
std::vector<std::string> subexps;
SuccinctCore *s_core;
bool opt;

RRes r_results;
RegExResults r_results;
};


Expand Down
2 changes: 1 addition & 1 deletion include/succinct/regex/executor/RegExExecutor.hpp
Expand Up @@ -130,7 +130,7 @@ class RegExExecutorBlackBox: public RegExExecutor {

RegExResultIterator left_it, right_it;
for (left_it = left.begin(), right_it = right.begin(); left_it != left.end() && right_it != right.end(); left_it++) {
while (right_it != right.end() && right_it->first <= left_it->first) right_it++;
while (right_it != right.end() && right_it->first < left_it->first + left_it->second) right_it++;
if (right_it == right.end()) break;

if (right_it->first == left_it->first + left_it->second)
Expand Down
6 changes: 5 additions & 1 deletion src/succinct/SuccinctCore.cpp
Expand Up @@ -348,6 +348,10 @@ uint64_t SuccinctCore::lookupC(uint64_t i) {
return get_rank1(&Cinv_idx, i);
}

// Returns the alphabet entry selected by lookupC() applied to lookupISA(i).
// NOTE(review): presumably this is the character of the original input at
// text offset i (that is how SRegEx uses it) - confirm against lookupISA().
char SuccinctCore::charAt(uint64_t i) {
  const auto isa_value = lookupISA(i);
  const auto alphabet_index = lookupC(isa_value);
  return alphabet[alphabet_index];
}

size_t SuccinctCore::serialize() {
size_t out_size = 0;
typedef std::map<char, std::pair<uint64_t, uint32_t> >::iterator iterator_t;
Expand Down Expand Up @@ -655,7 +659,7 @@ int SuccinctCore::compare(std::string p, int64_t i) {

int SuccinctCore::compare(std::string p, int64_t i, size_t offset) {

long j = 0;
uint64_t j = 0;

// Skip first offset chars
while(offset) {
Expand Down

0 comments on commit e18c1a2

Please sign in to comment.