Skip to content

Commit

Permalink
optimize: pike vm: now we quickly skip bytes that do not match any of…
Browse files Browse the repository at this point in the history
… the possible first byte values when we are in the initial state.
  • Loading branch information
agentzh committed Jun 18, 2013
1 parent 2d2622e commit da760e2
Show file tree
Hide file tree
Showing 8 changed files with 486 additions and 107 deletions.
2 changes: 2 additions & 0 deletions src/sre_cli.c
Expand Up @@ -556,6 +556,8 @@ process_string(sre_char *s, size_t len, sre_program_t *prog, sre_int_t *ovector,

printf("splitted pike ");

dd("===== splitted pike =====");

pctx = sre_vm_pike_create_ctx(pool, prog, ovector, ovecsize);
assert(pctx);

Expand Down
49 changes: 25 additions & 24 deletions src/sregex/sre_capture.c
Expand Up @@ -18,25 +18,35 @@


SRE_NOAPI sre_capture_t *
sre_capture_create(sre_pool_t *pool, size_t ovecsize, unsigned clear)
sre_capture_create(sre_pool_t *pool, size_t ovecsize, unsigned clear,
sre_capture_t **freecap)
{
sre_char *p;
sre_capture_t *cap;

p = sre_pnalloc(pool, sizeof(sre_capture_t) + ovecsize);
if (p == NULL) {
return NULL;
}
if (*freecap) {
dd("reusing cap %p", *freecap);
cap = *freecap;
*freecap = cap->next;
cap->next = NULL;
cap->ref = 1;

} else {
p = sre_pnalloc(pool, sizeof(sre_capture_t) + ovecsize);
if (p == NULL) {
return NULL;
}

cap = (sre_capture_t *) p;
cap = (sre_capture_t *) p;

cap->ovecsize = ovecsize;
cap->ref = 1;
cap->next = NULL;
cap->regex_id = 0;
cap->ovecsize = ovecsize;
cap->ref = 1;
cap->next = NULL;
cap->regex_id = 0;

p += sizeof(sre_capture_t);
cap->vector = (sre_int_t *) p;
p += sizeof(sre_capture_t);
cap->vector = (sre_int_t *) p;
}

if (clear) {
(void) memset(cap->vector, -1, ovecsize);
Expand All @@ -55,18 +65,9 @@ sre_capture_update(sre_pool_t *pool, sre_capture_t *cap, sre_uint_t group,
dd("update cap %u to %d", group, pos);

if (cap->ref > 1) {
if (*freecap) {
dd("reusing cap %p", *freecap);
newcap = *freecap;
*freecap = newcap->next;
newcap->next = NULL;
newcap->ref = 1;

} else {
newcap = sre_capture_create(pool, cap->ovecsize, 0);
if (newcap == NULL) {
return NULL;
}
newcap = sre_capture_create(pool, cap->ovecsize, 0, freecap);
if (newcap == NULL) {
return NULL;
}

memcpy(newcap->vector, cap->vector, cap->ovecsize);
Expand Down
2 changes: 1 addition & 1 deletion src/sregex/sre_capture.h
Expand Up @@ -34,7 +34,7 @@ struct sre_capture_s {


SRE_NOAPI sre_capture_t *sre_capture_create(sre_pool_t *pool, size_t ovecsize,
unsigned clear);
unsigned clear, sre_capture_t **freecap);

SRE_NOAPI sre_capture_t *sre_capture_update(sre_pool_t *pool,
sre_capture_t *cap, sre_uint_t group, sre_int_t pos,
Expand Down
6 changes: 6 additions & 0 deletions src/sregex/sre_core.h
Expand Up @@ -11,6 +11,7 @@

#include <sregex/sregex.h>
#include <string.h>
#include <assert.h>


#ifndef SRE_USE_VALGRIND
Expand Down Expand Up @@ -59,4 +60,9 @@
#endif


/*
 * Assertion hook used by the library: unless sre_assert has already been
 * defined before this point, it expands to the standard assert() macro.
 */
#ifndef sre_assert
#define sre_assert assert
#endif


#endif /* _SRE_CORE_H_INCLUDED_ */
162 changes: 161 additions & 1 deletion src/sregex/sre_regex_compiler.c
@@ -1,6 +1,6 @@

/*
* Copyright 2012 Yichun "agentzh" Zhang
* Copyright 2012-2013 Yichun Zhang (agentzh)
* Copyright 2007-2009 Russ Cox. All Rights Reserved.
* Use of this source code is governed by a BSD-style
* license that can be found in the LICENSE file.
Expand All @@ -16,6 +16,11 @@
#include <sregex/sre_vm_bytecode.h>


static sre_int_t sre_program_get_leading_bytes(sre_pool_t *pool,
sre_program_t *prog, sre_chain_t **res);
static sre_int_t sre_program_get_leading_bytes_helper(sre_pool_t *pool,
sre_instruction_t *pc, sre_program_t *prog, sre_chain_t **res,
unsigned tag);
static sre_uint_t sre_program_len(sre_regex_t *r);
static sre_instruction_t *sre_regex_emit_bytecode(sre_pool_t *pool,
sre_instruction_t *pc, sre_regex_t *re);
Expand Down Expand Up @@ -70,17 +75,172 @@ sre_regex_compile(sre_pool_t *pool, sre_regex_t *re)
prog->lookahead_asserts = 0;
prog->dup_threads = 0;
prog->uniq_threads = 0;
prog->nullable = 0;
prog->leading_bytes = NULL;
prog->leading_byte = -1;

prog->ovecsize = 0;
for (i = 0; i < prog->nregexes; i++) {
prog->ovecsize += prog->multi_ncaps[i] + 1;
}
prog->ovecsize *= 2 * sizeof(sre_uint_t);

if (sre_program_get_leading_bytes(pool, prog, &prog->leading_bytes)
== SRE_ERROR)
{
return NULL;
}

if (prog->leading_bytes && prog->leading_bytes->next == NULL) {
pc = prog->leading_bytes->data;
if (pc->opcode == SRE_OPCODE_CHAR) {
prog->leading_byte = pc->v.ch;
}
}

dd("nullable: %u", prog->nullable);

#if (DDEBUG)
{
sre_chain_t *cl;

for (cl = prog->leading_bytes; cl; cl = cl->next) {
pc = cl->data;
fprintf(stderr, "[");
sre_dump_instruction(stderr, pc, prog->start);
fprintf(stderr, "]");
}
if (prog->leading_bytes) {
fprintf(stderr, "\n");
}
}
#endif

return prog;
}


/*
 * Collects into *res the chain of instructions that may consume the first
 * byte of a match, starting the walk at prog->start.
 *
 * A fresh visit mark (prog->tag + 1) is handed to the recursive helper and
 * stored back into prog->tag afterwards, whatever the helper returned.
 *
 * Returns:
 *   SRE_ERROR     the helper reported an error;
 *   SRE_DECLINED  the helper declined, or the program was flagged nullable;
 *                 *res is reset to NULL in this case;
 *   otherwise     the helper's own return value, with *res holding the chain.
 */
static sre_int_t
sre_program_get_leading_bytes(sre_pool_t *pool, sre_program_t *prog,
    sre_chain_t **res)
{
    sre_int_t         status;
    const unsigned    mark = prog->tag + 1;

    status = sre_program_get_leading_bytes_helper(pool, prog->start, prog,
                                                  res, mark);

    /* record the mark that has just been used on the instructions */
    prog->tag = mark;

    if (status == SRE_ERROR) {
        return SRE_ERROR;
    }

    if (status != SRE_DECLINED && !prog->nullable) {
        return status;
    }

    /* no usable leading-byte set: discard whatever was collected */
    *res = NULL;
    return SRE_DECLINED;
}


/*
 * Recursively walks the program from pc, following only instructions that
 * this function treats as non-consuming (SPLIT, JMP, SAVE, ASSERT), and
 * appends every other instruction it stops at to the chain *res.
 *
 * Instructions are stamped with "tag" as they are visited so that each one
 * is processed at most once per traversal.
 *
 * Returns:
 *   SRE_OK        the sub-walk finished (possibly adding nothing);
 *   SRE_DONE      a MATCH instruction was reached; prog->nullable is set;
 *   SRE_DECLINED  an ANY instruction was reached;
 *   SRE_ERROR     a chain node could not be allocated from the pool.
 *
 * A non-SRE_OK result from the first arm of a SPLIT is returned immediately
 * without exploring the second arm.
 */
static sre_int_t
sre_program_get_leading_bytes_helper(sre_pool_t *pool, sre_instruction_t *pc,
    sre_program_t *prog, sre_chain_t **res, unsigned tag)
{
    sre_int_t            rc;
    sre_chain_t         *cl, *ncl, **tail;
    sre_instruction_t   *bc;

    if (pc->tag == tag) {
        /* already visited during this traversal */
        return SRE_OK;
    }

    if (pc == prog->start + 1) {
        /* skip the dot (.) in the initial boilerplate ".*?" */
        return SRE_OK;
    }

    pc->tag = tag;

    switch (pc->opcode) {
    case SRE_OPCODE_SPLIT:
        rc = sre_program_get_leading_bytes_helper(pool, pc->x, prog, res,
                                                  tag);
        if (rc != SRE_OK) {
            return rc;
        }

        return sre_program_get_leading_bytes_helper(pool, pc->y, prog, res,
                                                    tag);

    case SRE_OPCODE_JMP:
        return sre_program_get_leading_bytes_helper(pool, pc->x, prog, res,
                                                    tag);

    case SRE_OPCODE_SAVE:
    case SRE_OPCODE_ASSERT:
        /* continue with the next instruction, unless we ran off the end */
        if (++pc == prog->start + prog->len) {
            return SRE_OK;
        }

        return sre_program_get_leading_bytes_helper(pool, pc, prog, res,
                                                    tag);

    case SRE_OPCODE_MATCH:
        prog->nullable = 1;
        return SRE_DONE;

    case SRE_OPCODE_ANY:
        return SRE_DECLINED;

    default:
        /* every remaining opcode (expected: CHAR, IN, NOTIN) is collected */
        break;
    }

    /*
     * Look for an identical CHAR entry before allocating, so duplicates cost
     * no pool memory; "tail" ends up pointing at the link to append to.
     */
    tail = res;

    for (cl = *res; cl; cl = cl->next) {
        bc = cl->data;

        if (pc->opcode == SRE_OPCODE_CHAR
            && bc->opcode == SRE_OPCODE_CHAR
            && bc->v.ch == pc->v.ch)
        {
            return SRE_OK;
        }

        tail = &cl->next;
    }

    ncl = sre_palloc(pool, sizeof(sre_chain_t));
    if (ncl == NULL) {
        return SRE_ERROR;
    }

    ncl->data = pc;
    ncl->next = NULL;
    *tail = ncl;

    return SRE_OK;
}


static sre_uint_t
sre_program_len(sre_regex_t *r)
{
Expand Down

0 comments on commit da760e2

Please sign in to comment.