In [11]:
import os
import re
import uuid

import redis
from nltk.corpus import stopwords

In [13]:
# English stop words from the NLTK corpus, held in a set so that tokenize()
# can subtract them and parse() can test membership cheaply.
STOP_WORDS = set(stopwords.words('english'))
# Display how many stop words were loaded.
len(STOP_WORDS)

179

In [21]:
# Matches runs of two or more lowercase ASCII letters (a-z). tokenize()
# lowercases its input before matching, so uppercase text is covered too;
# digits, apostrophes and other characters act as separators.
WORDS_RE = re.compile('[a-z]{2,}')

In [27]:
def tokenize(content):
    """Return the set of indexable words found in ``content``.

    The text is lowercased and scanned with ``WORDS_RE``. Each match has
    surrounding apostrophes stripped, anything shorter than two characters
    is discarded, and finally every word in ``STOP_WORDS`` is removed.
    """
    stripped = (
        found.group().strip("'")
        for found in WORDS_RE.finditer(content.lower())
    )
    kept = {term for term in stripped if len(term) >= 2}
    return kept - STOP_WORDS

In [28]:
def index_document(conn, docid, content):
    """Add ``docid`` to the ``idx:<word>`` set of every word in ``content``.

    Words come from ``tokenize(content)``. All ``SADD`` calls are queued on
    one transactional pipeline and sent together.

    Returns the number of replies from the pipeline, i.e. the number of
    distinct indexed words.
    """
    terms = tokenize(content)

    batch = conn.pipeline(True)
    for term in terms:
        batch.sadd(f'idx:{term}', docid)
    replies = batch.execute()
    return len(replies)

In [29]:
# The Redis password is read from the REDIS_PASSWORD environment variable so
# that the credential is not stored in the notebook. When the variable is
# unset, None is passed and no password is sent to the server.
r = redis.Redis(password=os.environ.get('REDIS_PASSWORD'), decode_responses=True)
# Smoke test: index a two-word document under id 1.
index_document(r, 1, 'hello world')

2

In [30]:
def _set_common(conn, method, names, ttl=30, execute=True):
    id = str(uuid.uuid4())
    pipeline = conn.pipeline(True) if execute else conn
    names = [f'idx:{name}' for name in names]
    getattr(pipeline, method)(f'idx:{id}', *names)
    pipeline.expire(f'idx:{id}', ttl)
    if execute:
        pipeline.execute()
    return id

In [31]:
def intersect(conn, items, ttl=30, _execute=True):
    """Store the intersection of the ``idx:``-prefixed sets named in ``items``.

    Delegates to ``_set_common`` with the ``sinterstore`` method and returns
    the id of the temporary result, which expires after ``ttl`` seconds.
    """
    return _set_common(conn, 'sinterstore', items, ttl=ttl, execute=_execute)

In [32]:
def union(conn, items, ttl=30, _execute=True):
    """Store the union of the ``idx:``-prefixed sets named in ``items``.

    Delegates to ``_set_common`` with the ``sunionstore`` method and returns
    the id of the temporary result, which expires after ``ttl`` seconds.
    """
    return _set_common(conn, 'sunionstore', items, ttl=ttl, execute=_execute)

In [33]:
def difference(conn, items, ttl=30, _execute=True):
    """Store the difference of the ``idx:``-prefixed sets named in ``items``.

    Delegates to ``_set_common`` with the ``sdiffstore`` method and returns
    the id of the temporary result, which expires after ``ttl`` seconds.
    """
    return _set_common(conn, 'sdiffstore', items, ttl=ttl, execute=_execute)

In [34]:
# A query token: an optional '+' or '-' marker followed by two or more
# lowercase letters or apostrophes.
# NOTE(review): unlike WORDS_RE this pattern keeps apostrophes inside a word,
# so query terms and indexed terms may differ for such words - confirm intent.
QUERY_RE = re.compile("[+-]?[a-z']{2,}")


def parse(query):
    """Split a search query into required word groups and excluded words.

    Token markers:
        * no marker  - starts a new group of required words;
        * ``+word``  - joins the current group (an alternative to the
          words already in it);
        * ``-word``  - the word is excluded.

    Tokens shorter than two characters after stripping apostrophes, and
    tokens found in ``STOP_WORDS``, are skipped.

    Returns:
        ``(groups, excluded)`` where ``groups`` is a list of lists of words
        and ``excluded`` is a list of words.
    """
    excluded = set()
    groups = []
    synonyms = set()

    for token in QUERY_RE.finditer(query.lower()):
        text = token.group()
        # Every match holds at least two characters, so text[0] exists.
        if text[0] in '+-':
            marker, text = text[0], text[1:]
        else:
            marker = None

        text = text.strip("'")
        if len(text) < 2 or text in STOP_WORDS:
            continue

        if marker == '-':
            excluded.add(text)
        else:
            # An unmarked word closes the group being built, if any.
            if marker is None and synonyms:
                groups.append(list(synonyms))
                synonyms = set()
            synonyms.add(text)

    if synonyms:
        groups.append(list(synonyms))
    return groups, list(excluded)

In [35]:
def search_and_sort(conn, query, id=None, ttl=300, sort='-updated', start=0, num=20):
    """Run (or reuse) a search and return one sorted page of its results.

    Args:
        conn: Redis connection.
        query: Query string handed to ``parse_and_search`` when no usable
            cached result exists.
        id: Id returned by an earlier call. If ``idx:<id>`` still exists its
            expiry is refreshed and the search is not repeated.
        ttl: Expiry in seconds for the result set.
        sort: Field of the ``kb:doc:*`` hashes to sort by; a leading ``-``
            requests descending order.
        start: Offset of the first result to return.
        num: Maximum number of results to return.

    Returns:
        ``(total, page, id)``: the cardinality of the result set, the
        requested slice of sorted members, and the id to pass back on the
        next call.
    """
    desc = sort.startswith('-')
    sort = sort.lstrip('-')
    by = f'kb:doc:*->{sort}'

    # Sort strings alphabetically, sort integers by numbers.
    alpha = sort not in ('updated', 'id', 'created')

    # Results are stored under 'idx:<id>', so that is the key to refresh.
    # A falsy reply means the cached result has expired.
    if id and not conn.expire(f'idx:{id}', ttl):
        id = None

    # No cached result (never supplied, or expired): run the search now.
    if not id:
        id = parse_and_search(conn, query, ttl=ttl)

    pipeline = conn.pipeline(True)
    pipeline.scard(f'idx:{id}')
    pipeline.sort(f'idx:{id}', by=by, alpha=alpha, desc=desc, start=start, num=num)

    results = pipeline.execute()
    return results[0], results[1], id

In [37]:
def search_and_zsort(conn, query, id=None, ttl=300, update=1, vote=0, start=0, num=20, desc=True):
    """Run (or reuse) a search and return one page ordered by a combined score.

    Args:
        conn: Redis connection.
        query: Query string handed to ``parse_and_search`` when no usable
            cached result exists.
        id: Id returned by an earlier call. If ``idx:<id>`` still exists its
            expiry is refreshed and the search is not repeated.
        ttl: Expiry in seconds for the result set.
        update: Weight given to the ``sort:update`` sorted set.
        vote: Weight given to the ``sort:votes`` sorted set.
        start: Offset of the first result to return.
        num: Maximum number of results to return.
        desc: When True, results are read in reverse order (``zrevrange``).

    Returns:
        ``(total, page, id)``: the cardinality of the scored result set, the
        requested slice of members, and the id to pass back on the next
        call.
    """
    # Results are stored under 'idx:<id>', so that is the key to refresh.
    # A falsy reply means the cached result has expired.
    if id and not conn.expire(f'idx:{id}', ttl):
        id = None

    if not id:
        id = parse_and_search(conn, query, ttl=ttl)
        # The plain search result gets weight 0 so it only filters members;
        # the score comes from the weighted 'sort:*' sets.
        scored_search = {
            id: 0,
            'sort:update': update,
            'sort:votes': vote
        }
        id = zintersect(conn, scored_search, ttl)

    pipeline = conn.pipeline(True)
    pipeline.zcard(f'idx:{id}')
    if desc:
        pipeline.zrevrange(f'idx:{id}', start, start + num - 1)
    else:
        pipeline.zrange(f'idx:{id}', start, start + num - 1)
    results = pipeline.execute()
    return results[0], results[1], id

In [38]:
def _zset_common(conn, method, scores, ttl=300, **kw):
    id = str(uuid.uuid4())
    execute = kw.pop('_execute', True)
    pipeline = conn.pipeline(True) if execute else conn
    for key in scores.key():
        scores[f'idx:{key}'] = scores.pop(key)
    getattr(pipeline, method)(f'idx:{id}', scores, **kw)
    pipeline.expire(f'idx:{id}', ttl)
    if execute:
        pipeline.execute()
    return id

In [39]:
def zintersect(conn, items, ttl=30, **kw):
    """Store the weighted intersection of the sorted sets named in ``items``.

    ``items`` is copied into a new dict and passed to ``_zset_common`` with
    the ``zinterstore`` method; the id of the temporary result is returned.
    """
    weights = dict(items)
    return _zset_common(conn, 'zinterstore', weights, ttl, **kw)

In [40]:
def zunion(conn, items, ttl=30, **kw):
    """Store the weighted union of the sorted sets named in ``items``.

    ``items`` is copied into a new dict and passed to ``_zset_common`` with
    the ``zunionstore`` method; the id of the temporary result is returned.
    """
    weights = dict(items)
    return _zset_common(conn, 'zunionstore', weights, ttl, **kw)

In [41]:
def string_to_score(string, ignore_case=False):
    """Turn the first six characters of ``string`` into an integer score.

    Each character contributes ``ord(ch) + 1`` as one base-257 digit, and
    strings shorter than six characters are padded with digit 0. The lowest
    bit of the result records whether the string was longer than six
    characters.

    Args:
        string: Text to score.
        ignore_case: When True the text is lowercased first.

    Returns:
        An int. Comparing scores matches comparing the six-character
        prefixes only while every character's code point is below 256,
        because larger values do not fit in one base-257 digit.
    """
    if ignore_case:
        string = string.lower()

    # map() returns an iterator in Python 3; a list is needed for len() and
    # append().
    pieces = [ord(char) for char in string[:6]]
    # Pad with -1 so that a missing character becomes digit 0 below.
    while len(pieces) < 6:
        pieces.append(-1)

    score = 0
    for piece in pieces:
        score = score * 257 + piece + 1
    return score * 2 + (len(string) > 6)