In [1]:
import Cython
%load_ext Cython

In [2]:
import numpy as np
import re
from tqdm import tqdm_notebook as tqdm

In [3]:
with open('../data/wiki_short.txt', 'r') as f:
    lines = f.read().lower().splitlines()

In [4]:
corpus = []
N = 3
for i, line in tqdm(enumerate(lines)):
    if i!=0 and i%N == 0:
        temp_line = ' '.join(lines[len(corpus)*N:(len(corpus)+1)*N])
        corpus.append(temp_line)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [5]:
%%cython --cplus
from cython.parallel import prange
from libcpp.vector cimport vector
from libcpp.string cimport string
from libc.stdlib cimport malloc, free
from libcpp.set cimport set

import numpy as np
cimport numpy as np

cdef extern from "regex.h" nogil:
    ctypedef struct regmatch_t:
        int rm_so
        int rm_eo
    ctypedef struct regex_t:
        pass
    int REG_EXTENDED
    int regcomp(regex_t* preg, const char* regex, int cflags)
    int regexec(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags)
    void regfree(regex_t* preg) 
    
cdef extern from "<string.h>" nogil:
    char *strncpy (char *pto, const char *pfrom, size_t size)

cdef class regex_cpp:
    cdef regex_t regex_obj
    
    def __cinit__(self, char* regex):
        regcomp(&self.regex_obj, regex, REG_EXTENDED)
        
    cdef vector[string] findall(self, char* stringbytes) nogil:
        cdef regmatch_t regmatch_obj[1]
        cdef vector[string] results
        cdef int regex_res = 0
        cdef int current_str_pos = 0
        cdef char *substring
        
        regex_res = regexec(&self.regex_obj, stringbytes, 1, regmatch_obj, 0)
        while regex_res == 0:
            substring = <char *> malloc((regmatch_obj[0].rm_eo-regmatch_obj[0].rm_so+1) * sizeof(char))
            substring[regmatch_obj[0].rm_eo-regmatch_obj[0].rm_so] = 0
            strncpy(substring, stringbytes+current_str_pos+regmatch_obj[0].rm_so, regmatch_obj[0].rm_eo-regmatch_obj[0].rm_so)
            
            results.push_back(<string> substring)
            
            current_str_pos += regmatch_obj[0].rm_eo
            regex_res = regexec(&self.regex_obj, stringbytes + current_str_pos, 1, regmatch_obj, 0)
            
        return results

cpdef set[string] vectorize(str patern, list STRINGS):
    cdef vector[string] results
    cdef char* PAT = <char *> malloc((len(patern)+1) * sizeof(char))
    PAT[len(patern)] = 0
    
    cdef string _str_patern = <string> patern.encode('utf-8')
    
    cdef vector[char*] cpp_words = [word.encode('utf-8') for word in STRINGS]
    
    reg = regex_cpp(PAT)
    
    cdef long i = 0
    cdef long length = cpp_words.size()
    
    cdef set[string] answers
    cdef long j = 0

    for i in prange(length, nogil=True, num_threads=4):
#     for i in range(length):
        results = reg.findall(cpp_words[i])
        for j in range(results.size()):
            answers.insert(results[j])
            
    
    return answers
    
    

In [6]:
N = 10
n = 0

In [7]:
%%time
ret = vectorize("\w\w+", corpus[n:N])

CPU times: user 66 µs, sys: 16 µs, total: 82 µs
Wall time: 70.1 µs


In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
model = CountVectorizer()

In [10]:
%%time
model.fit(corpus[n:N])

CPU times: user 1.2 ms, sys: 329 µs, total: 1.53 ms
Wall time: 1.38 ms


CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [11]:
len(model.get_feature_names())

263

In [12]:
len(ret)

0

In [13]:
set(ret.keys()).difference(set(model.get_feature_names()))

AttributeError: 'set' object has no attribute 'keys'