10 => (loss = :mrl, opt = :rmsprop, agg = :avg, constr = :nneg)

### 1. Read data

In [0]:
import os

def remove_blank_lines(filepath, pattern, separator=None, expected_fields=4):
    """Return the header plus every well-formed data line of a raw CSV file.

    The first line of the file is always kept as the header. A later line is
    kept only when it starts with ``pattern`` and splits into exactly
    ``expected_fields`` fields on ``separator``; blank or broken lines are
    dropped.

    Parameters
    ----------
    filepath : str
        Path of the UTF-8 encoded file to read.
    pattern : str
        Prefix every kept data line must start with.
    separator : str, optional
        Field separator. When omitted, the module-level ``csv_separator``
        is used, which keeps existing two-argument calls working.
    expected_fields : int, optional
        Number of fields a valid line must have (default 4).

    Returns
    -------
    list of str
        The header line followed by the kept lines, newlines included.
    """
    if separator is None:
        # Resolved at call time so the global may be defined after this function.
        separator = csv_separator

    with open(filepath, "r", encoding="utf-8") as file:
        header = file.readline()
        kept = [
            line
            for line in file
            if line.startswith(pattern)
            and len(line.split(separator)) == expected_fields
        ]
    return [header] + kept

def preprocess_raw_files(folder_path, new_folder_path):
    """Write a filtered copy of every file in ``folder_path``.

    Each entry listed in ``folder_path`` is passed through
    ``remove_blank_lines`` with its own file name as the line prefix, and the
    surviving lines are written to a file of the same name inside
    ``new_folder_path``. The target folder is created when it does not exist.
    """
    if not os.path.exists(new_folder_path):
        os.makedirs(new_folder_path)

    for entry in os.listdir(folder_path):
        source_path = os.path.join(folder_path, entry)
        target_path = os.path.join(new_folder_path, entry)
        kept_lines = remove_blank_lines(source_path, entry)
        with open(target_path, "w", encoding="utf-8") as out:
            out.writelines(kept_lines)
        
# Folder that receives the filtered copies of the raw files.
data_folder = "PrepData"
# Field separator used by remove_blank_lines when counting the fields of a line.
csv_separator = ';'
# Runs when the cell executes: filters every file found in "Data" into data_folder.
preprocess_raw_files("Data", data_folder)

# Upper bound on the rows read_data loads from each file.
rows_to_read_from_each_topic = 2000 # use small values for debug
# Passed to train_test_split as test_size further down.
test_size = 1000 # use small values for debug

In [0]:
import pandas as pd
import glob

def read_data(folder_path):
    """Load a ``{title: description}`` mapping from every file in a folder.

    Each file is parsed as CSV using the module-level ``csv_separator``, with
    at most ``rows_to_read_from_each_topic`` rows read and the third column
    (``index_col=2``) used as the key. The value is the row's
    ``"og:description"`` field.

    Files are visited in sorted order so that, when the same key occurs more
    than once, the entry that wins is the same on every run (the last one
    seen).

    Parameters
    ----------
    folder_path : str
        Folder whose direct children are the CSV files to read.

    Returns
    -------
    dict
        Mapping from the index column value to the description.

    Raises
    ------
    KeyError
        If a file has no ``"og:description"`` column.
    """
    data = {}
    for file in sorted(glob.glob(folder_path + "/*")):
        frame = pd.read_csv(
            file,
            delimiter=csv_separator,
            nrows=rows_to_read_from_each_topic,
            index_col=2,
        )
        # Pair each index label with its description directly instead of
        # transposing the whole frame into a dict of dicts.
        data.update(zip(frame.index, frame["og:description"]))
    return data

### 2. Tokenize & build a dictionary of the words that occur more than once

In [0]:
data = read_data(data_folder)

import random
# Peek at one random (title, description) pair. random.sample needs a
# sequence (a dict view raises TypeError on Python 3.11+), hence the list().
print(random.sample(list(data.items()), 1))

In [0]:
# One document per entry: the title, a newline, then its description.
squashed_data = [
    headline + '\n' + description
    for headline, description in data.items()
]

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

# Placeholder; words_dict is rebuilt from the filtered vocabulary in a later cell.
words_dict = {}
# NOTE(review): no later visible cell reads this module-level idx (the one in
# get_words_occurring_more_than is a comprehension-local name) — confirm it is unused.
idx = 0
# Default-configured bag-of-words vectorizer, fitted inside get_words_occurring_more_than.
vectorizer = CountVectorizer()

def get_words_occurring_more_than(k):
    """Return ``{word: total_count}`` for words counted more than ``k`` times.

    Fits the module-level ``vectorizer`` on ``squashed_data``, sums the
    resulting count matrix over all documents, and keeps the vocabulary
    entries whose total is strictly greater than ``k``.
    """
    counts = vectorizer.fit_transform(squashed_data)
    totals = counts.sum(axis=0)  # one row holding the per-word totals
    frequent = {}
    for word, column in vectorizer.vocabulary_.items():
        total = totals[0, column]
        if total > k:
            frequent[word] = total
    return frequent

# Keep only the words whose total count over squashed_data is greater than 1.
words = get_words_occurring_more_than(1)

In [0]:
# Map every kept word to its position in the iteration order of `words`.
words_dict = {word: position for position, word in enumerate(words)}

### 4. Define function to represent text as a sequence of indices in `words_dict`

In [0]:
import re
def get_indices(text):
    """Yield the ``words_dict`` index of every known word in ``text``.

    ``words_dict`` is built from the vocabulary of a default
    ``CountVectorizer``, which lowercases its input and extracts tokens of
    two or more word characters. The text is tokenized the same way here so
    that capitalised words and the parts of hyphenated words are found
    instead of being silently skipped.

    Indices are yielded in order of appearance; a word that occurs several
    times is yielded several times. Words missing from ``words_dict`` are
    ignored.
    """
    # Same token pattern and lowercasing as CountVectorizer's defaults.
    for word in re.findall(r'\b\w\w+\b', text.lower()):
        if word in words_dict:
            yield words_dict[word]

### 5. Initialize random vector for every word

In [0]:
import numpy as np
from sklearn.preprocessing import normalize

# Number of rows of A, i.e. the length of each word's vector.
embedding_size = 32

# One column per entry of words_dict, drawn from a standard normal distribution.
# NOTE(review): no random seed is set in the visible cells, so A differs between runs.
A = np.random.normal(0, 1, size=(embedding_size, len(words_dict)))
# axis=0 makes sklearn's normalize rescale each column (one word vector) independently.
A = normalize(A, axis = 0)

### 6. Split data into train and test sets

In [0]:
from sklearn.model_selection import train_test_split
# Split the (title, description) pairs; test_size is the value set in the first cell.
train, test = train_test_split(list(data.items()), test_size=test_size)
# Pick one training pair at random; the next two cells display it.
title, desc = random.choice(train)

In [0]:
# Display the title of the randomly chosen training pair.
title

In [0]:
# Display the description of the randomly chosen training pair.
desc

### 7. Define function for representing text as the embedding vectors of its known words

In [0]:
def get_vectors(text):
    """Return the columns of ``A`` selected by the word indices of ``text``.

    The result has the same number of rows as ``A`` and one column per index
    yielded by ``get_indices(text)``, in the same order.
    """
    column_ids = list(get_indices(text))
    return A[:, column_ids]

### 8. Define loss function

In [0]:
def loss(u, v, v_hat, γ=1.0):
    """Return ``γ - u·v + u·v_hat`` for the three vectors.

    ``u`` is scored against ``v`` and against ``v_hat`` with a dot product;
    the value is not clipped at zero here.
    """
    positive_score = np.dot(u, v)
    negative_score = np.dot(u, v_hat)
    return γ - positive_score + negative_score

def calculate_gradients(u, v, v_hat):
    """Return the gradients of ``loss`` when the loss is positive.

    Returns
    -------
    tuple or None
        ``(v_hat - v, -u, u)`` — the derivatives of ``loss`` with respect to
        ``u``, ``v`` and ``v_hat`` — when ``loss(u, v, v_hat) > 0``;
        ``None`` otherwise.
    """
    if not loss(u, v, v_hat) > 0:
        return None

    grad_u = np.subtract(v_hat, v)
    grad_v = np.multiply(u, -1)
    grad_v_hat = np.array(u)
    return (grad_u, grad_v, grad_v_hat)

### 9. Define functions for quality testing

In [0]:
def get_aggregation(text):
    """Return the mean of the word vectors of ``text``.

    The vectors come from ``get_vectors(text)`` (one column per known word)
    and are averaged across columns, giving a 1-D array with one entry per
    row of the embedding matrix.

    NOTE(review): when ``text`` has no known word the mean is taken over zero
    columns, which numpy reports with a warning and fills with NaN.
    """
    vectors = get_vectors(text)
    return np.mean(vectors, axis=1)

In [0]:
def top_k(vec, k):
    """Return, for every column, the row indices of its ``k`` largest values.

    ``recall`` passes a matrix of dot-product scores in which a larger value
    means a better match, so the *largest* entries of each column are the
    ones to select, and the selection has to run down the rows (``axis=0``)
    for the ``[:k, :]`` slice to be meaningful.

    Parameters
    ----------
    vec : array-like, 2-D
        Score matrix.
    k : int
        Number of rows to select per column. Values larger than the number
        of rows are clamped to it.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(min(k, n_rows), n_cols)``. The indices
        within a column are not sorted by score.
    """
    scores = np.asarray(vec)
    k = min(k, scores.shape[0])
    if k <= 0:
        return np.empty((0,) + scores.shape[1:], dtype=np.intp)
    # Partitioning the negated scores puts the k largest originals first.
    return np.argpartition(-scores, k - 1, axis=0)[:k, :]

def recall(test):
    """Return the fraction of test pairs counted as hits.

    Every title and every description in ``test`` is reduced to a vector with
    ``get_aggregation``; the matrix of title-by-description dot products is
    handed to ``top_k`` with ``k=10``. The transposed result is walked row by
    row, and a hit is counted whenever the row's own position appears among
    the row's entries.
    """
    title_vectors = [list(get_aggregation(title)) for title, _ in test]
    desc_vectors = [list(get_aggregation(desc)) for _, desc in test]
    dist = np.matmul(title_vectors, np.transpose(desc_vectors))

    candidates = np.transpose(top_k(dist, 10))
    hits = sum(1 for position, top in enumerate(candidates) if position in top)
    return hits / len(test)

### 10. Train

In [0]:
def remove_negative(A):
    """Return a copy of ``A`` shifted so that no entry is negative.

    When the smallest entry is negative it is subtracted from every entry,
    which moves the minimum to exactly zero. (Adding the minimum, as the code
    did before, pushes a negative matrix further below zero.) An input that
    is already non-negative, or empty, is returned unchanged as a copy.

    Parameters
    ----------
    A : array-like
        Values to shift.

    Returns
    -------
    numpy.ndarray
        New array of the same shape; the input is not modified.
    """
    values = np.asarray(A)
    if values.size == 0:
        return values.copy()

    lowest = np.amin(values)
    if lowest >= 0:
        return values.copy()
    return values - lowest

In [0]:
def optimization_step(A, e_g, grads, eta, eps, decay=0.9):
    """Apply one RMSProp-style update to ``A`` for each gradient in ``grads``.

    For every gradient ``g`` the running average of squared gradients is
    updated as ``e_g = decay * e_g + (1 - decay) * g**2`` and ``A`` is moved
    by ``eta * g / sqrt(e_g + eps)``.

    Both ``A`` and ``e_g`` are modified in place. Writing into ``e_g`` rather
    than rebinding the local name is what lets the running average carry over
    from one call to the next; previously the caller's array never changed.

    Parameters
    ----------
    A : numpy.ndarray
        Matrix to update; must match the shape of the transposed step.
    e_g : numpy.ndarray
        Running average of squared gradients, laid out as the transpose of ``A``.
    grads : iterable of array-like
        Gradients to apply one after another.
    eta : float
        Learning rate.
    eps : float
        Constant added under the square root for numerical stability.
    decay : float, optional
        Weight given to the previous running average (default 0.9).

    NOTE(review): a 1-D gradient is broadcast against every row of ``e_g``,
    so the same step is applied to every column of ``A`` — confirm this is
    the intended update.
    """
    for g in grads:
        g = np.asarray(g)
        # Slice assignment mutates the caller's array instead of rebinding.
        e_g[...] = decay * e_g + (1 - decay) * g ** 2
        delta = g * eta / np.sqrt(e_g + eps)
        A -= np.transpose(delta)

In [0]:
# Number of passes of the outer training loop.
epochs = 100
# State array handed to optimization_step; laid out as the transpose of A
# (one row per word, one column per embedding dimension).
e_g = np.zeros((len(words_dict), embedding_size))
# Small constant added under the square root in optimization_step.
eps = 1e-8
# Step-size multiplier passed to optimization_step.
eta = 1
# Number of train_sample calls per epoch.
iters_per_epoch = 100000

def train_sample(iter_num, A):
    """Run one training step on a randomly drawn pair of training samples.

    Two distinct entries are drawn from ``train``: the first supplies the
    title and its matching description, the second only a non-matching
    description. The three texts are aggregated into vectors, gradients are
    computed with ``calculate_gradients`` and, when there are any, applied
    through ``optimization_step`` using the module-level ``e_g``, ``eta``
    and ``eps``.

    Parameters
    ----------
    iter_num : int
        Iteration counter; every 1000th call also runs ``remove_negative``.
    A : numpy.ndarray
        Embedding matrix handed to ``optimization_step``.
    """
    # NOTE(review): seed() without an argument reseeds from a system source
    # on every call, which overrides any seed set earlier — confirm intended.
    random.seed()
    (title, desc), (_, other_desc) = random.sample(train, 2)

    title_vals, desc_vals, other_desc_vals = (
        list(get_aggregation(text)) for text in (title, desc, other_desc)
    )

    grads = calculate_gradients(title_vals, desc_vals, other_desc_vals)
    if grads:
        optimization_step(A, e_g, grads, eta, eps)

    if iter_num % 1000 == 0:
        # NOTE(review): this rebinds only the local name A; the array owned
        # by the caller is not changed by this statement.
        A = remove_negative(A)

# Recall measured on the test set after each epoch, in epoch order.
recalls = []
for epoch in range(epochs):
    for i in range(iters_per_epoch):
        # One training step; i only controls the every-1000th-call branch in train_sample.
        train_sample(i, A)
    r = recall(test)
    recalls.append(r)
    
    print("Epoch {0} done. Recall: {1}".format(epoch + 1, r))