In [None]:
import numpy as np
import scipy
import scipy.linalg as linalg
from scipy.sparse import csc_matrix, bmat, load_npz, csr_matrix
from scipy.sparse.linalg import svds, eigsh
import matplotlib.pyplot as plt
from tqdm import tqdm
import random
import implicit
from pathlib import Path
import re

## Load Data

In [None]:
# Load the four sparse matrices in one pass. From how they are combined in
# prepare_train_data, `s` is the square user-by-user block and each `a_*` is a
# user-by-group block (train / test / validation split).
s, a_train, a_test, a_val = (
    load_npz(npz_path)
    for npz_path in (
        "data/yt_s.npz",
        "data/yt_a_train.npz",
        "data/yt_a_test.npz",
        "data/yt_a_val.npz",
    )
)
# Sizes used throughout the notebook: rows of `s` and columns of `a_train`.
n_users, n_groups = s.shape[0], a_train.shape[1]

In [None]:
def prepare_train_data(alpha, s, a_train, a_val):
    """Assemble the combined block matrices used for training and validation.

    Each result has the layout ``[[alpha * s, a], [a.T, 0]]`` where ``a`` is
    ``a_train`` or ``a_val`` respectively, cast to ``float64``.

    Returns
    -------
    tuple
        ``(c_train, c_val)`` as sparse float64 matrices.
    """
    def _combine(a):
        # `None` leaves the bottom-right block empty (all zeros).
        blocks = [[alpha * s, a], [a.transpose(), None]]
        return bmat(blocks).astype(np.float64)

    return _combine(a_train), _combine(a_val)

In [None]:
# Display the two sizes derived from the loaded matrices as the cell output.
n_users, n_groups

In [None]:
# Display the repr of the validation user-group matrix.
a_val

In [None]:
# Display the repr of the test user-group matrix.
a_test

In [None]:
# Display the repr of the training user-group matrix.
a_train

## SVD

In [None]:
def svd_get_recs(i, model, train_labels, n_groups):
    """Rank groups for user ``i`` from a truncated SVD reconstruction.

    Parameters
    ----------
    i : int
        Row index of the user in the factorised matrix.
    model : tuple
        ``(u, sig, vt)`` factors, in the order returned by ``svds``.
    train_labels : array-like, length ``n_groups``
        Non-zero where the user already has the group in the training data.
    n_groups : int
        Number of trailing columns of the reconstruction that are groups.

    Returns
    -------
    numpy.ndarray
        Group indices ordered from highest to lowest score. Groups already
        present in training are always placed last.
    """
    u, sig, vt = model
    # Scale the user's factor row by the singular values and reconstruct only
    # the group columns; no dense diag(sig) or full-width row is needed.
    score = (u[i, :] * sig) @ vt[:, -n_groups:]
    # Mask seen groups with -inf rather than 0: reconstructed scores can be
    # negative, and a 0 would outrank unseen groups with negative scores.
    seen = np.asarray(train_labels) != 0
    score = np.where(seen, -np.inf, score)
    return np.flip(np.argsort(score))

In [None]:
"vrati listu (za k = 1:n) precisiona i recalla na testu za jednog usera"
def evaluate_model_user(i, n, model, c_train, c_val, n_groups, model_type):
    """Precision, sensitivity and specificity at k = 1..n for a single user.

    Parameters
    ----------
    i : int
        Row index of the user for whom recommendations are generated.
    n : int
        Largest number of recommendations to evaluate.
    model : object
        Whatever the selected ``model_type`` branch needs to produce scores.
    c_train, c_val : sparse matrix
        Training and validation matrices; the last ``n_groups`` columns of
        row ``i`` are read as the user's group labels.
    n_groups : int
        Number of group columns.
    model_type : str
        One of ``"SVD"``, ``"ALS"``, ``"random_katz"``, ``"perfect_model"``,
        ``"katz"``.

    Returns
    -------
    list of tuple
        ``n`` tuples ``(precision, sensitivity, specificity)``, one per cutoff.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported names.

    Notes
    -----
    Sensitivity divides by the number of validation positives, so callers are
    expected to skip users without any (``evaluate_model`` does).
    """
    true_labels = c_val.getrow(i).toarray().flatten()[-n_groups:]
    train_labels = c_train.getrow(i).toarray().flatten()[-n_groups:]

    # New models are added here: each branch must produce `score_index`, the
    # group indices sorted from most to least recommended.
    if model_type == "SVD":
        score_index = svd_get_recs(i, model, train_labels, n_groups)
    elif model_type == "ALS":
        score_index = als_get_recs(i, n, model, c_train, n_groups)
    elif model_type == "random_katz":
        score_index = rand_katz_get_recs(i, n, model, train_labels, n_groups)
    elif model_type == "perfect_model":
        score_index = np.flip(np.argsort(true_labels))
    elif model_type == "katz":
        score_index = katz_get_recs(i, model, train_labels, n_groups)
    else:
        # An explicit error survives `python -O`, unlike `assert False`.
        raise ValueError(f"unknown model_type: {model_type!r}")

    positives = np.sum(true_labels)
    negatives = n_groups - positives
    user_i_stats = []
    for predictions in range(1, n + 1):
        recommendations = score_index[:predictions]
        true_positives = np.sum(true_labels[recommendations] == 1)
        # Every recommended group that is not a hit is a false positive.
        true_negatives = negatives - (predictions - true_positives)
        precision = true_positives / predictions
        sensitivity = true_positives / positives
        specificity = true_negatives / negatives
        user_i_stats.append((precision, sensitivity, specificity))
    return user_i_stats

In [None]:
def evaluate_model(model, c_train, c_val, n_users, n_groups, model_type, n=50):
    """Average per-user precision/sensitivity/specificity curves over users.

    Only users with at least one positive validation label are evaluated.

    Parameters
    ----------
    model : object
        Passed through to ``evaluate_model_user``.
    c_train, c_val : sparse matrix
        Training and validation matrices, indexed by user row.
    n_users : int
        Number of leading rows to iterate over.
    n_groups : int
        Number of trailing columns that are group labels.
    model_type : str
        Model selector understood by ``evaluate_model_user``.
    n : int, optional
        Largest recommendation cutoff to evaluate (default 50, the value that
        was previously hard-coded).

    Returns
    -------
    tuple of numpy.ndarray
        ``(precision, sensitivity, specificity)``, each of length ``n``,
        averaged over the evaluated users.

    Raises
    ------
    ValueError
        If no user has a positive validation label.
    """
    # Rows are extracted once per user. The inputs may arrive in COO form
    # (presumably what `bmat` returns by default), where every getrow() is a
    # full pass over the matrix, so convert to CSR once up front.
    c_train = csr_matrix(c_train)
    c_val = csr_matrix(c_val)

    # One entry per evaluated user: a list of n (pr, se, sp) tuples.
    stats = []
    for i in tqdm(range(n_users)):
        if np.sum(c_val.getrow(i).toarray().flatten()[-n_groups:]) != 0:
            stats.append(evaluate_model_user(i, n, model, c_train, c_val, n_groups, model_type))

    if not stats:
        raise ValueError("no user has a positive validation label; nothing to evaluate")

    # (users, n, 3) -> (n, 3): mean over users; columns are (pr, se, sp).
    pss = np.mean(np.array(stats), axis=0)
    return pss[:, 0], pss[:, 1], pss[:, 2]

In [None]:
def get_score(precision, sensitivity, specificity):
    """Area under the (1 - specificity, sensitivity) curve, by trapezoid rule.

    Parameters
    ----------
    precision : array-like
        Unused; kept so the three outputs of ``evaluate_model`` can be passed
        straight through.
    sensitivity, specificity : array-like
        Curves of equal length, one value per recommendation cutoff.

    Returns
    -------
    float
        Absolute value of the integral, so the direction in which the x-axis
        is traversed does not matter.
    """
    false_positive_rate = 1 - np.asarray(specificity, dtype=float)
    # NumPy 2.0 renamed `trapz` to `trapezoid` and deprecated the old name;
    # prefer the new one and fall back on older NumPy versions.
    integrate = getattr(np, "trapezoid", None) or np.trapz
    area = integrate(y=sensitivity, x=false_positive_rate)
    return abs(area)

In [None]:
def svd_model(alpha, svd_rank, s, a_train, a_val, n_users, n_groups):
    """Fit a rank-``svd_rank`` truncated SVD and score it on validation data.

    The combined training matrix is built with weight ``alpha`` on ``s``,
    factorised with ``svds`` and evaluated with ``model_type="SVD"``.

    Returns
    -------
    dict
        The hyperparameters, the scalar ``"score"`` and the three curves.
    """
    train_matrix, val_matrix = prepare_train_data(alpha, s, a_train, a_val)
    factors = svds(train_matrix, k=svd_rank)
    pr, se, sp = evaluate_model(
        factors, train_matrix, val_matrix, n_users, n_groups, model_type="SVD"
    )
    return {
        "alpha": alpha,
        "svd_rank": svd_rank,
        "score": get_score(pr, se, sp),
        "precision": pr,
        "sensitivity": se,
        "specificity": sp,
    }

In [None]:
def validate_svd_model(alphas, svd_ranks, s, a_train, a_val, n_users, n_groups):
    """Evaluate ``svd_model`` on every (alpha, svd_rank) combination.

    Returns
    -------
    list of dict
        One result per combination, alphas in the outer loop.
    """
    return [
        svd_model(alpha, svd_rank, s, a_train, a_val, n_users, n_groups)
        for alpha in alphas
        for svd_rank in svd_ranks
    ]

In [None]:
# Hyperparameter grid for the SVD model (a single point here).
# NOTE: `alphas` is a global that the ALS and Katz sections reassign later, so
# re-running cells out of order changes which values are used.
alphas = [2.5]
svd_ranks = [700]

In [None]:
# Fit and evaluate the SVD model for every grid point (expensive cell).
validation_scores_svd = validate_svd_model(alphas, svd_ranks, s, a_train, a_val, n_users, n_groups)

In [None]:
# Sort in place, highest score first, so index 0 is the best configuration.
validation_scores_svd.sort(key = lambda x : x["score"], reverse = True)

In [None]:
# Hyperparameters of the best-scoring SVD configuration.
validation_scores_svd[0]["alpha"], validation_scores_svd[0]["svd_rank"]

In [None]:
# Full result dict (hyperparameters, score and curves) of the best SVD run.
validation_scores_svd[0]

In [None]:
# One line per evaluated configuration: alpha, rank, score.
for entry in validation_scores_svd:
    print(*(entry[key] for key in ("alpha", "svd_rank", "score")))

## EVALUACIJA GENERALNO

In [None]:
"treba u evaluate_model_user za svaki model dodati granu u if-u u funkciji evaluate_model_user u kojoj se napravi score_index"
"score_index je lista/np.array koji sadrži indekse grupa sortirane po scoreu koji model daje, dakle sortirana lista grupa za recommendat"
"ideja je da se dotad sve sto ti treba za evaluirati model prenosi u varijabli model, a onda unutar tog ifa se pozove neka funkcija koja evaluira"
"za validaciju i kreiranje modela predlazem da se rade posebne funkcije za svaki jer nije bas zgodno napravit generalno, mogu biti po uzoru na ove"
#precision, sensitivity, specificity = evaluate_model(model, c_train, c_val, n_users, n_groups, model_type = "SVD")
#score = get_score(precision, sensitivity, specificity)
"dole primjer za als - 3 modificirane funkcije i onda se samo pozove - nije bas savrseno al mislim da ce bit skroz ok za nasih par modela"

## ALS

In [None]:
def als_get_recs(i, n, model, c_train, n_groups):
    """Rank groups for user ``i`` from an ALS model's top-``n`` recommendations.

    Parameters
    ----------
    i : int
        Row index of the user in ``c_train``.
    n : int
        Number of recommendations requested from the model.
    model : object
        Fitted model exposing ``recommend(...)``. Its result is iterated as
        ``(item_id, score)`` pairs.
    c_train : sparse matrix
        Square combined training matrix; the first ``shape[0] - n_groups``
        indices are users, the rest are groups.
    n_groups : int
        Number of group columns.

    Returns
    -------
    numpy.ndarray
        All group indices, recommended ones first (highest score first).
    """
    # Derive the user/group offset from the arguments instead of relying on a
    # notebook-level `n_users` global.
    n_user_rows = c_train.shape[0] - n_groups
    recs = model.recommend(
        i,
        user_items=csr_matrix(c_train),
        N=n,
        filter_already_liked_items=True,
        # Exclude the user nodes so that only groups can be recommended.
        filter_items=list(range(n_user_rows)),
    )
    # Groups the model did not return start at -inf so they can never outrank
    # a returned group, even one with a negative score.
    score = np.full(n_groups, -np.inf)
    for rec in recs:
        score[rec[0] - n_user_rows] = rec[1]
    return np.flip(np.argsort(score))

In [None]:
def als_model(alpha, n_factors, s, a_train, a_val, n_users, n_groups):
    """Fit an implicit ALS model and score it on validation data.

    The combined training matrix is built with weight ``alpha`` on ``s``; the
    model uses ``n_factors`` latent factors and a fixed regularization of 2.

    Returns
    -------
    dict
        The hyperparameters, the scalar ``"score"`` and the three curves.
    """
    train_matrix, val_matrix = prepare_train_data(alpha, s, a_train, a_val)
    als = implicit.als.AlternatingLeastSquares(factors=n_factors, regularization=2)
    als.fit(train_matrix)
    pr, se, sp = evaluate_model(
        als, train_matrix, val_matrix, n_users, n_groups, model_type="ALS"
    )
    return {
        "alpha": alpha,
        "n_factors": n_factors,
        "score": get_score(pr, se, sp),
        "precision": pr,
        "sensitivity": se,
        "specificity": sp,
    }

In [None]:
def validate_als_model(alphas, n_factors, s, a_train, a_val, n_users, n_groups):
    """Evaluate ``als_model`` on every (alpha, factor count) combination.

    Returns
    -------
    list of dict
        One result per combination, alphas in the outer loop.
    """
    return [
        als_model(alpha, nf, s, a_train, a_val, n_users, n_groups)
        for alpha in alphas
        for nf in n_factors
    ]

In [None]:
# Hyperparameter grid for the ALS model (a single point here).
# NOTE: this rebinds the global `alphas` that the SVD section set earlier.
alphas = [7]
n_factors = [600]

In [None]:
# Fit and evaluate the ALS model for every grid point (expensive cell).
validation_scores_als = validate_als_model(alphas, n_factors, s, a_train, a_val, n_users, n_groups)

In [None]:
# Sort in place, highest score first, so index 0 is the best configuration.
validation_scores_als.sort(key = lambda x : x["score"], reverse = True)

In [None]:
# Full result dict (hyperparameters, score and curves) of the best ALS run.
validation_scores_als[0]

In [None]:
# Hyperparameters of the best-scoring ALS configuration.
validation_scores_als[0]["alpha"], validation_scores_als[0]["n_factors"]

In [None]:
# One line per evaluated configuration: alpha, factor count, score.
for entry in validation_scores_als:
    print(*(entry[key] for key in ("alpha", "n_factors", "score")))

## Katz

In [None]:
def katz(A, S, k, rank, beta, alfa):
    """Low-rank truncated Katz-style scores between users and groups.

    The result is returned in factored form ``(q, core, Vt)``; the dense score
    matrix is ``q @ core @ Vt`` and a single user's row is
    ``q[i, :] @ core @ Vt``.

    Parameters
    ----------
    A : sparse matrix
        User-by-group matrix.
    S : sparse matrix
        Symmetric user-by-user matrix (it is passed to ``eigsh``).
    k : int
        Maximum path length to accumulate; must be at least 1.
    rank : int
        Number of eigenvectors of ``S`` and of singular vectors of ``A`` used
        to build the shared user basis.
    beta : float
        Damping factor; the term for path length ``l`` is weighted ``beta**l``.
    alfa : float
        Weight applied to every step through ``S``. Must be non-zero when
        ``k > 2``.

    Returns
    -------
    tuple
        ``(q, core, Vt)`` for every ``k``.

    Raises
    ------
    ValueError
        If ``k < 1``.

    Notes
    -----
    Changes relative to the previous implementation:

    * ``k == 2`` no longer fails on an undefined name.
    * ``k == 1`` returns the factored triple like every other ``k`` instead of
      a dense matrix (``katz_get_recs`` unpacks three factors).
    * ``beta`` is no longer squared in place on every step (which produced the
      weights beta^2, beta^4, beta^8, ... and applied the last of them to the
      length-1 term as well).
    * The recursion keeps two running sums instead of a factor list whose
      length grew with every step; the update is linear, so the accumulated
      value is the same.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    S = S.astype(float)
    A = A.astype(float)

    # Shared orthonormal user basis: leading eigenvectors of S together with
    # the leading left singular vectors of A.
    _, vecs = eigsh(S, k=rank)
    U, _, _ = svds(A, k=rank)
    q, _ = linalg.qr(np.hstack([vecs, U]), mode="economic")

    # Project S onto the basis, and A onto the basis and an orthonormal basis
    # V of the column space of A^T q.
    Ds = q.transpose() @ S @ q
    V, _ = linalg.qr(A.transpose() @ q, mode="economic")
    Vt = V.transpose()
    Da = q.transpose() @ A @ V

    # Length-1 term.
    core = beta * Da
    if k == 1:
        return q, core, Vt

    # `via_s` sums the current-length terms whose leading factor is Ds (flag 1
    # in the original code); `via_a` sums the remaining ones (flag 0).
    step_s = alfa * Ds
    via_s = step_s @ Da
    via_a = np.zeros_like(via_s)
    core = core + beta**2 * via_s

    if k > 2:
        # Loop-invariant operator that swaps the leading Ds step of a term for
        # a Da Da^T step; computed (and Ds inverted) only once.
        step_a = (1 / alfa) * Da @ Da.transpose() @ np.linalg.inv(Ds)

    for length in range(3, k + 1):
        # Every term is extended by a Ds step; terms that start with Ds also
        # spawn a Da Da^T variant. Both right-hand sides use the old sums.
        via_s, via_a = step_s @ (via_s + via_a), step_a @ via_s
        core = core + beta**length * (via_s + via_a)

    return q, core, Vt
        
    

In [None]:
def katz_get_recs(i, model, train_labels, n_groups):
    """Rank groups for user ``i`` from the factored Katz model.

    Parameters
    ----------
    i : int
        Row index of the user.
    model : tuple
        ``(q, final_sum, vt)`` as returned by ``katz``.
    train_labels : array-like
        Non-zero where the user already has the group in the training data.
    n_groups : int
        Unused; kept for a signature parallel to the other ``*_get_recs``.

    Returns
    -------
    numpy.ndarray
        Group indices ordered from highest to lowest score. Groups already
        present in training are always placed last.
    """
    q, final_sum, vt = model
    score = q[i, :] @ final_sum @ vt
    # Mask seen groups with -inf rather than 0 so they cannot outrank unseen
    # groups whose low-rank score happens to be negative.
    seen = np.asarray(train_labels) != 0
    score = np.where(seen, -np.inf, score)
    return np.flip(np.argsort(score))
    

In [None]:
def katz_model(alpha,beta, katz_rank, s, a_train, a_val, n_users, n_groups, k):
    """Build the low-rank Katz model and score it on validation data.

    Unlike the SVD and ALS helpers, this one passes the raw ``a_train`` and
    ``a_val`` to the evaluation rather than combined block matrices. ``k`` is
    the maximum path length and is not stored in the returned dict.

    Returns
    -------
    dict
        The hyperparameters, the scalar ``"score"`` and the three curves.
    """
    factors = katz(a_train, s, k, katz_rank, beta, alpha)
    pr, se, sp = evaluate_model(
        factors, a_train, a_val, n_users, n_groups, model_type="katz"
    )
    return {
        "beta": beta,
        "alpha": alpha,
        "katz_rank": katz_rank,
        "score": get_score(pr, se, sp),
        "precision": pr,
        "sensitivity": se,
        "specificity": sp,
    }

In [None]:
def validate_katz_model(alphas, betas, katz_ranks, s, a_train, a_val, n_users, n_groups, k):
    """Evaluate ``katz_model`` on every (alpha, beta, rank) combination.

    Returns
    -------
    list of dict
        One result per combination; alphas outermost, ranks innermost.
    """
    return [
        katz_model(alpha, beta, katz_rank, s, a_train, a_val, n_users, n_groups, k)
        for alpha in alphas
        for beta in betas
        for katz_rank in katz_ranks
    ]

In [None]:
# Hyperparameter grid for the Katz model (a single point here).
# NOTE: this rebinds the global `alphas` used by the SVD and ALS sections.
alphas = [1]
betas = [0.1]
katz_ranks = [600]

In [None]:
# Build and evaluate the Katz model for every grid point; the trailing 7 is
# the maximum path length `k`.
validation_scores_katz = validate_katz_model(alphas,betas, katz_ranks, s, a_train, a_val, n_users, n_groups, 7)

In [None]:
# Sort in place, highest score first, so index 0 is the best configuration.
validation_scores_katz.sort(key = lambda x : x["score"], reverse = True)

In [None]:
# Full result dict (hyperparameters, score and curves) of the best Katz run.
validation_scores_katz[0]

In [None]:
# Hyperparameters of the best-scoring Katz configuration.
validation_scores_katz[0]["alpha"], validation_scores_katz[0]["beta"], validation_scores_katz[0]["katz_rank"]

In [None]:
# One line per evaluated configuration: alpha, beta, rank, score.
for entry in validation_scores_katz:
    print(*(entry[key] for key in ("alpha", "beta", "katz_rank", "score")))

## Random Katz

In [None]:
def rand_katz_get_recs(i, n, model, train_labels, n_groups):
    """Rank groups for user ``i`` from precomputed random-Katz scores.

    Parameters
    ----------
    i : int
        Zero-based user index; the model is keyed by ``i + 1``.
    n : int
        Unused; kept for a signature parallel to ``als_get_recs``.
    model : dict
        Maps one-based user id to an array of ``n_groups`` scores, as built by
        ``read_results_from_bin``.
    train_labels : array-like
        Non-zero where the user already has the group in the training data.
    n_groups : int
        Number of groups; used for the fallback when the user has no scores.

    Returns
    -------
    numpy.ndarray
        Group indices ordered from highest to lowest score. Groups already
        present in training are always placed last.
    """
    # Users without any record in the result file get all-zero scores instead
    # of raising KeyError.
    recs = model.get(i + 1)
    if recs is None:
        recs = np.zeros(n_groups)
    # Mask seen groups with -inf rather than 0, so they do not tie with unseen
    # groups whose score is 0.
    seen = np.asarray(train_labels) != 0
    score = np.where(seen, -np.inf, recs)
    return np.flip(np.argsort(score))

In [None]:
import struct

def read_results_from_bin(bin_file_path, n_groups):
    """Read ``(user, group, score)`` records from a binary result file.

    Each record is a native-layout ``uint32, uint32, double`` triple. Group
    ids are treated as one-based and shifted to zero-based array positions.

    Returns
    -------
    dict
        Maps each user id found in the file to a length-``n_groups`` array of
        scores; groups without a record stay at 0.
    """
    record = struct.Struct("IId")
    scores_by_user = {}
    with open(bin_file_path, "rb") as handle:
        # Read fixed-size records until read() returns an empty bytes object.
        for chunk in iter(lambda: handle.read(record.size), b""):
            user, group, score = record.unpack_from(chunk)
            if user not in scores_by_user:
                scores_by_user[user] = np.zeros(n_groups)
            scores_by_user[user][group - 1] = score
    return scores_by_user

In [None]:
def random_katz_model(file, s, a_train, a_val, n_users, n_groups):
    """Load precomputed random-Katz scores from ``file`` and evaluate them.

    The scores were computed outside this notebook (in C++ code, according to
    the original comment); this function only reads and evaluates them.

    Parameters
    ----------
    file : str or pathlib.Path
        Result file whose name contains exactly two integers, read in order
        as the path length and the number of iterations.
    s, a_train, a_val : sparse matrix
        Inputs for ``prepare_train_data`` (called with ``alpha = 1``).
    n_users, n_groups : int
        Number of users and of groups.

    Returns
    -------
    dict
        ``iterations``, ``path_len``, the scalar ``score`` and the three curves.

    Raises
    ------
    ValueError
        If the file name does not contain exactly two integers.
    """
    # Parse the metadata from the file name only (digits in directory names
    # must not interfere), and do it before the expensive evaluation so a
    # badly named file fails fast.
    numbers = re.findall(r'\d+', Path(file).name)
    if len(numbers) != 2:
        raise ValueError(
            f"expected exactly two integers (path length, iterations) in file name, got {Path(file).name!r}"
        )
    path_len, iterations = (int(number) for number in numbers)

    model = read_results_from_bin(str(file), n_groups)
    c_train, c_val = prepare_train_data(1, s, a_train, a_val)
    precision, sensitivity, specificity = evaluate_model(model, c_train, c_val, n_users, n_groups, model_type = "random_katz")
    score = get_score(precision, sensitivity, specificity)
    return {"iterations": iterations, "path_len": path_len, "score": score, "precision": precision, "sensitivity": sensitivity, "specificity": specificity}

In [None]:
def validate_random_katz_model(s, a_train, a_val, n_users, n_groups,
                               results_dir="data/random_katz/results/",
                               path_len=5, iterations=10000):
    """Evaluate the random-Katz result files that match the requested setting.

    Parameters
    ----------
    s, a_train, a_val : sparse matrix
        Passed through to ``random_katz_model``.
    n_users, n_groups : int
        Number of users and of groups.
    results_dir : str or pathlib.Path, optional
        Directory holding the result files.
    path_len, iterations : int, optional
        Only files whose name encodes exactly these two integers, in this
        order, are evaluated. The defaults are the setting the original
        comment reports as the best one.

    Returns
    -------
    list of dict
        One ``random_katz_model`` result per matching file, in sorted
        file-name order.
    """
    validation_scores = []
    # Sorted for a deterministic order; iterdir() yields entries arbitrarily.
    for file in sorted(Path(results_dir).iterdir()):
        # Names that do not encode exactly the requested pair (including stray
        # files without two integers) are skipped instead of raising.
        numbers = [int(number) for number in re.findall(r'\d+', file.name)]
        if numbers == [path_len, iterations]:
            validation_scores.append(random_katz_model(file, s, a_train, a_val, n_users, n_groups))
    return validation_scores

In [None]:
%%time

# Load and evaluate the precomputed random-Katz results; the cell is timed.
rand_katz_validation_scores = validate_random_katz_model(s, a_train, a_val, n_users, n_groups)

In [None]:
# Sort in place, highest score first, like the SVD/ALS/Katz result lists: the
# comparison plot takes index 0 as the best entry, which an ascending sort
# would make the worst one.
rand_katz_validation_scores.sort(key=lambda entry: entry["score"], reverse=True)

In [None]:
# Overlay one (1 - specificity, sensitivity) curve per loaded result.
for entry in rand_katz_validation_scores:
    plt.plot(1 - entry["specificity"], entry["sensitivity"])

## Vizualizacija

In [None]:
import seaborn as sns

In [None]:
# Apply seaborn's default theme, then switch to the "darkgrid" style for the
# figures drawn afterwards.
sns.set_theme()
sns.set_style("darkgrid")

In [None]:
plt.figure(figsize=(15, 10))
plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0))
# Draw entry 0 of each model's result list; the drawing order must match the
# order of the legend labels below.
for ranked_scores in (
    validation_scores_als,
    validation_scores_svd,
    validation_scores_katz,
    rand_katz_validation_scores,
):
    top = ranked_scores[0]
    sns.lineplot(x=1 - top["specificity"], y=top["sensitivity"])
plt.legend(("ALS", "SVD", "KATZ", "RAND_KATZ"))
plt.xlabel("1 - specificity")
plt.ylabel("sensitivity")
plt.title("ROC CURVES FOR ALL MODELS")