In [1]:
from sqlite3 import connect
import sys
from os import curdir, path

In [2]:
# Make the project's ``modules`` directory (one level above the working
# directory) importable from this notebook.
modules_dir = path.join(curdir, '..', 'modules')
# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if modules_dir not in sys.path:
    sys.path.append(modules_dir)

In [3]:
from db import read_pages

In [4]:
# Open the SQLite file located one directory above the working directory.
articles_db_path = path.join(curdir, '..', 'articles.sqlite')
db = connect(articles_db_path)

In [5]:
# Load every row of the ``articles`` table into memory.
# The table name is a constant, so the statement is written as a plain literal
# instead of being assembled with str.format (string-built SQL is an easy
# place for unsafe interpolation to creep in later).
pages = db.execute("SELECT * FROM articles").fetchall()

In [6]:
# Display the first fetched row to inspect the layout of a record.
pages[0]


('https://vc.ru/marketing/63026-12-sovetov-kak-zarabotat-bolshe-podpischikov-v-instagram',
 '12 советов, как заработать больше подписчиков в Instagram — Маркетинг на vc.ru',
 '1. Продумайте контент-стратегию!')

In [7]:
# Split the rows into two parallel lists: field 1 of every row goes to
# ``titles``, field 2 to ``descriptions``.
titles = [row[1] for row in pages]
descriptions = [row[2] for row in pages]

In [8]:
# Print the first title, then the first description, one per line.
for _texts in (titles, descriptions):
    print(_texts[0])

12 советов, как заработать больше подписчиков в Instagram — Маркетинг на vc.ru
1. Продумайте контент-стратегию!


In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csc_matrix

In [10]:
# Token-count vectorizer, created with scikit-learn's default settings.
cv = CountVectorizer()

In [11]:
# Fit the vocabulary on all titles followed by all descriptions (plain list
# concatenation, so the title rows come first) and store the resulting
# count matrix in CSC format.
corpus = titles + descriptions
data_matrix = csc_matrix(cv.fit_transform(corpus))

In [12]:
# Display the sparse matrix summary (shape, dtype, number of stored elements).
data_matrix

<240x1283 sparse matrix of type '<class 'numpy.int64'>'
	with 4149 stored elements in Compressed Sparse Column format>

In [13]:
# Vocabulary terms learned by the vectorizer, as a plain Python list.
# ``get_feature_names`` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back only on old installations.
if hasattr(cv, 'get_feature_names_out'):
    tokens = cv.get_feature_names_out().tolist()
else:
    tokens = cv.get_feature_names()

In [14]:
import numpy as np

In [15]:
# Small positive constant; 10e-8 written in normalized form (same float).
# Used as the default fill value of Embedding.sigma.
eps = 1e-7

In [16]:
class Embedding:
    """Weight matrix paired with a same-shaped ``sigma`` array.

    Attributes:
        data: the matrix handed to the constructor (stored by reference,
            not copied).
        sigma: float array with the shape of ``data`` whose entries all
            start at ``epsilon``.
    """

    def __init__(self, matrix, epsilon=eps):
        self.data = matrix
        # One-step equivalent of allocating zeros and filling with epsilon.
        self.sigma = np.full(matrix.shape, epsilon, dtype=float)

    @staticmethod
    def create(emb_size, vocab_size, rng = 0.001):
        """Return an Embedding of shape (emb_size, vocab_size).

        The weights are standard-normal draws from numpy's global random
        state, scaled by ``rng``.
        """
        scaled_noise = rng * np.random.randn(emb_size, vocab_size)
        return Embedding(scaled_noise)


In [17]:
# 256-row embedding matrix with one column per vocabulary token.
emb = Embedding.create(emb_size=256, vocab_size=len(tokens))

In [18]:
# Print the shape of ``sigma`` first, then the shape of ``data``.
for _attr in ('sigma', 'data'):
    print(getattr(emb, _attr).shape)

(256, 1283)
(256, 1283)


In [19]:
def backward_hinge(u, v, v_hat, gamma):
    """Gradients of the hinge loss ``gamma - u.v + u.v_hat`` while it is active.

    Args:
        u, v, v_hat: operands accepted by ``np.dot``.
        gamma: margin term of the loss.

    Returns:
        The tuple ``(v_hat - v, -u, u)`` -- the partial derivatives of the
        loss with respect to ``u``, ``v`` and ``v_hat`` -- when the loss is
        strictly positive; otherwise ``None``.
    """
    positive_score = np.dot(u, v)
    negative_score = np.dot(u, v_hat)
    loss = gamma - positive_score + negative_score
    # Inactive hinge: nothing to propagate.
    if not loss > 0:
        return None
    return (v_hat - v, -u, u)

In [20]:
# def update(embedding, idx, max_indicies, delta, nu):
#     p = emb.data[:, idx][:, max_indicies]
#     print(p)
#     g = emp.sigma[:, idx][:, max_indicies]
#     emb.sigma[:, idx][:, max_indicies] += delta**2
#     emp.data[:, idx][:, max_indicies] -= delta * nu / (np.sqrt(g) + eps)
def update(w, h, g_t, mask, lr=0.001, epsilon=1e-7):
    """Apply one Adagrad-style step and return the new weights and accumulator.

    Args:
        w: current weights.
        h: running sum of squared gradients.
        g_t: gradient for this step.
        mask: boolean selector; weights are replaced only where it is true.
        lr: learning rate.
        epsilon: constant added to the denominator to avoid division by zero.

    Returns:
        ``(w_new, h_new)``. The accumulator is updated for every entry; the
        mask is applied to the weights only.
    """
    accumulated = h + g_t * g_t
    step = lr * g_t / (np.sqrt(accumulated) + epsilon)
    # Keep the old weight wherever the mask is false.
    return np.where(mask, w - step, w), accumulated

In [21]:
def agg(w, x):
    """Weight ``w`` by ``x``, reduce with a maximum, and flag matching entries.

    Args:
        w: 2-D weight array.
        x: sparse matrix; its transposed dense form must broadcast against
            ``w``.

    Returns:
        ``(res, mask)``: ``res`` is the maximum of the element-wise product
        along axis 0; ``mask`` is true where an entry of ``w`` equals the
        value of ``res`` in the same column.

    NOTE(review): when ``w`` has shape (emb_size, vocab_size), as built by
    ``Embedding.create``, axis 0 is the embedding dimension, so the maximum
    is taken per token rather than across tokens -- confirm this is intended.
    """
    dense_row = x.todense().transpose()
    weighted = np.multiply(w, dense_row)
    res = weighted.max(axis=0)
    # Repeat the reduced row once per row of w so it can be compared to w.
    stacked = np.vstack([res] * w.shape[0])
    mask = stacked == w
    return res, mask # mask show values to update


In [22]:
def recall_at_k(emb, data, k = 10):
    """Share of rows whose own description ranks in the top ``k`` for their title.

    Every title embedding is scored against all description embeddings with
    a dot product; a row counts as a hit when its own description is among
    the ``k`` highest scores (ties resolved in favour of the earlier row).

    Args:
        emb: callable mapping a text field to an embedding vector; the result
            is flattened to 1-D.
        data: sequence of rows; field 1 is used as the title and field 2 as
            the description.
        k: number of best-scoring descriptions kept per title.

    Returns:
        Recall@k as a float in [0, 1]; 0.0 when ``data`` is empty.
    """
    n_test = len(data)
    if n_test == 0:
        # Nothing to evaluate; avoids a division by zero below.
        return 0.0

    # One row per description: shape (n_test, embedding_dim).
    descr_emb = np.stack(
        [np.asarray(emb(row[2]), dtype=float).ravel() for row in data]
    )

    hits = 0
    for i, row in enumerate(data):
        title_emb = np.asarray(emb(row[1]), dtype=float).ravel()
        scores = descr_emb @ title_emb
        # Indices of the k largest scores; the stable sort keeps ties ordered.
        top = np.argsort(-scores, kind="stable")[:k]
        hits += int(i in top)

    return hits / n_test

In [None]:
def train2(emb: Embedding, idx, nu, gamma=1.0):
    """Run one hinge-loss training step on a triple and update ``emb`` in place.

    For each entry of ``idx`` the selected columns of ``emb.data`` are
    max-pooled row by row into one vector. The three pooled vectors go
    through ``backward_hinge``; when the loss is active, each gradient is
    applied with ``update`` to the column that supplied the maximum of each
    row, and ``emb.sigma`` is advanced for the same entries.

    Args:
        emb: Embedding whose ``data`` and ``sigma`` arrays are modified in
            place.
        idx: triple ``(u, v, v_hat)``; each entry is a column index or a 1-D
            collection of column indices into ``emb.data``.
            NOTE(review): this layout is reconstructed from the commented-out
            ``update`` draft (``emb.data[:, idx][:, max_indicies]``) --
            confirm against what the caller actually passes.
        nu: learning rate forwarded to ``update``.
        gamma: hinge margin forwarded to ``backward_hinge``.

    Returns:
        None. The step is skipped when any entry of ``idx`` is empty or when
        the hinge loss is not positive.
    """
    column_sets = [np.atleast_1d(np.asarray(entry, dtype=int)) for entry in idx]
    if any(cols.size == 0 for cols in column_sets):
        # An empty column set cannot be pooled.
        return None

    rows = np.arange(emb.data.shape[0])
    pooled = []
    winners = []
    for cols in column_sets:
        block = emb.data[:, cols]
        pooled.append(block.max(axis=1))
        # Column of emb.data that produced the maximum of each row.
        winners.append(cols[block.argmax(axis=1)])

    delta_s = backward_hinge(*pooled, gamma)
    if delta_s is None:
        return None

    for cols, delta in zip(winners, delta_s):
        w_new, h_new = update(
            emb.data[rows, cols], emb.sigma[rows, cols], delta, True, lr=nu
        )
        emb.data[rows, cols] = w_new
        emb.sigma[rows, cols] = h_new
    return None


In [None]:
def train1(emb, data, nu):
    """Run one pass over ``data``, calling ``train2`` once per row.

    Rows are visited in a random order. Each row is paired with the row that
    precedes it in that order (the order rotated by one position), and the
    last field of that other row is used as the third element of the triple.

    Args:
        emb: embedding object forwarded to ``train2``.
        data: indexable collection of rows; the last two fields of a row are
            taken as the ``(u, v)`` pair, so both 2-field rows and longer
            rows whose pair comes last are accepted.
        nu: learning rate forwarded to ``train2``.

    Returns:
        None.
    """
    first = np.random.permutation(len(data))
    second = np.roll(first, 1)
    for f, s in zip(first, second):
        u, v = data[f][-2:]
        # NOTE(review): as in the original expression, the set difference is
        # computed and then discarded. If the intent was to strip the shared
        # elements from one randomly chosen side, assign the result to
        # ``v`` / ``u`` respectively.
        np.setdiff1d(v, u) if np.random.randint(2) else np.setdiff1d(u, v)
        train2(emb, (u, v, data[s][-1]), nu)

In [None]:
def train(emb, train_data, test_data, n_epochs, nu):
    """Train for ``n_epochs`` passes, printing recall on ``test_data`` after each.

    Args:
        emb: embedding object forwarded to ``train1`` and ``recall_at_k``.
        train_data: rows forwarded to ``train1``.
        test_data: rows forwarded to ``recall_at_k``.
        n_epochs: number of passes over ``train_data``.
        nu: learning rate forwarded to ``train1``.

    NOTE(review): ``recall_at_k`` invokes ``emb(...)`` as a callable, while
    the ``Embedding`` class in this notebook defines no ``__call__`` --
    confirm which object is meant to be passed here.
    """
    report = "Epoch {} ({}): recall = {}\n"
    for epoch in range(n_epochs):
        # train1 has no return statement, so this value is printed as None.
        epoch_result = train1(emb, train_data, nu)
        epoch_recall = recall_at_k(emb, test_data)
        print(report.format(epoch, epoch_result, epoch_recall))

0
