In [1]:
import Cython
%load_ext Cython

# Базовый пример использования Cython

```Cython``` код с типизацией и без нее

In [2]:
%%cython

def CythonFunc(n):
    """Count up to ``n`` one increment at a time, without C type declarations.

    This is the untyped variant compiled in the ``%%cython`` cell: the body
    is ordinary Python syntax.

    :param n: upper bound passed to ``range``.
    :return: number of loop iterations performed (0 for an empty range).
    """
    counter = 0
    for _step in range(n):
        counter = counter + 1
    return counter

def CythonTypedFunc(int n):
    """Count up to ``n`` one increment at a time using C-typed variables.

    :param n: upper bound passed to ``range``, declared as a C ``int``.
    :return: number of loop iterations performed, accumulated in a C ``long``.
    """
    cdef long counter = 0
    for _step in range(n):
        counter = counter + 1
    return counter

In [3]:
def func(n):
    """Count up to ``n`` one increment at a time in plain Python.

    :param n: upper bound passed to ``range``.
    :return: number of loop iterations performed (0 for an empty range).
    """
    counter = 0
    for _step in range(n):
        counter = counter + 1
    return counter

In [4]:
# NOTE(review): N is not referenced by any cell visible in this section.
N = 1000
# K bounds the benchmark loops: each timed cell calls its function with
# k = 0 .. K-1.
K = 1000

In [5]:
%%time
# Timed benchmark of the pure-Python `func`: one call per k in range(K),
# with every return value appended to `answ`.
answ = []
for k in range(K):
    answ.append(func(k))

CPU times: user 25.4 ms, sys: 606 µs, total: 26 ms
Wall time: 25.8 ms


In [6]:
%%time
# Timed benchmark of the untyped `CythonFunc`: one call per k in range(K),
# with every return value appended to `answ`.
answ = []
for k in range(K):
    answ.append(CythonFunc(k))

CPU times: user 13.2 ms, sys: 154 µs, total: 13.3 ms
Wall time: 13.5 ms


In [7]:
%%time
# Timed benchmark of the C-typed `CythonTypedFunc`: one call per k in
# range(K), with every return value appended to `answ`.
answ = []
for k in range(K):
    answ.append(CythonTypedFunc(k))

CPU times: user 321 µs, sys: 22 µs, total: 343 µs
Wall time: 388 µs


В данном тривиальном примере получаем следующее:

| Функция  | Время |
| ------------- | ------------- |
| Python Func  | 25.8 ms  |
| Cython Func  | 13.5 ms  |
| Cython Typed Func  | 0.388 ms |


# WordEmbedding

## ```sklearn```

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
# Load the corpus as a list of lower-cased lines. The encoding is passed
# explicitly so decoding does not depend on the platform's locale default.
# NOTE(review): assumes ./data/ru.tok is UTF-8 encoded — confirm.
with open('./data/ru.tok', 'r', encoding='utf-8') as f:
    corpus = f.read().lower().splitlines()

In [10]:
%%time
# Fit scikit-learn's CountVectorizer (default settings) on `corpus`; this
# timed cell is the reference for the hand-written vectorizer below.
model = CountVectorizer()
model.fit(corpus)

CPU times: user 1.55 s, sys: 51.4 ms, total: 1.6 s
Wall time: 1.64 s


In [11]:
# Vocabulary size of the fitted CountVectorizer. `vocabulary_` maps each term
# to its feature index, so its length is the number of features. It is used
# instead of `get_feature_names()`, which was removed in scikit-learn 1.2.
len(model.vocabulary_)

76777

## ```created```

In [12]:
import re

In [18]:
%%cython
import re

class Vectorizer:
    """Minimal bag-of-words count vectorizer.

    A token is a maximal run of regex word characters. Each distinct token
    receives a column index in order of first appearance during ``fit``.
    """

    # Compiled once and written as a raw string: the plain literals '\W' and
    # '\s+' contain invalid escape sequences and trigger a warning.
    _TOKEN_RE = re.compile(r'\w+')

    def __init__(self):
        # Number of distinct tokens seen so far; also the next free column index.
        self.NumOfWords = 0
        # Maps token -> column index.
        self.WordToInt = dict()

    def split_sent(self, sent):
        """Split ``sent`` into word tokens.

        Every non-word character acts as a separator. A string with no word
        characters (empty, whitespace or punctuation only) yields an empty
        list, so no empty-string token is ever produced.

        :param sent: input string.
        :return: list of token strings, in order of occurrence.
        """
        return self._TOKEN_RE.findall(sent)

    def fit(self, corpus):
        """Add every token of ``corpus`` that is not yet known to the vocabulary.

        Tokens already present keep their index, so repeated calls only
        append new tokens.

        :param corpus: iterable of strings.
        :return: None.
        """
        for sent in corpus:
            for word in self.split_sent(sent):
                if word not in self.WordToInt:
                    self.WordToInt[word] = self.NumOfWords
                    self.NumOfWords += 1

    def get_feature_names(self):
        """Return the known tokens as a list, in dictionary (insertion) order."""
        return list(self.WordToInt.keys())

    def transform(self, corpus):
        """Count known tokens in each string of ``corpus``.

        Tokens missing from the vocabulary are ignored.

        :param corpus: sized sequence of strings (``len`` is called on it).
        :return: list with one row per string; each row is a list of
            ``NumOfWords`` integer counts indexed by ``WordToInt``.
        """
        transformed = [[0] * self.NumOfWords for _ in range(len(corpus))]

        for row, sent in zip(transformed, corpus):
            for word in self.split_sent(sent):
                index = self.WordToInt.get(word)
                if index is not None:
                    row[index] += 1

        return transformed

    def fit_transform(self, corpus):
        """Run ``fit`` on ``corpus`` and return ``transform(corpus)``."""
        self.fit(corpus)
        return self.transform(corpus)

In [19]:
%%time
# Fit the hand-written Vectorizer on `corpus`; timed for comparison with the
# scikit-learn cell above. Note that this rebinds `model`.
model = Vectorizer()
model.fit(corpus)

CPU times: user 1.47 s, sys: 15.6 ms, total: 1.49 s
Wall time: 1.5 s


In [20]:
# Number of distinct tokens collected; assumes `model` is the Vectorizer
# instance fitted in the preceding cell.
len(model.get_feature_names())

76891