## UPOZORENJA
- ćelija za učitavanje podataka treba malo više vremena (get_user_data), pozvati jednom i nakon toga zakomentirati

## TODO-ovi
- smisliti kako ćemo odabrati bolji dio podataka

In [None]:
import numpy as np
import scipy
from scipy.sparse import csc_matrix, bmat, save_npz
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt
from tqdm import tqdm
import random
import copy

## Data preparation
- reading user data from given files (we are considering only users that are in some group here)
- random sample of filtered users
- normalizing user data (renaming users to elements of $\{0, \dots, n-1\}$)

In [None]:
def normalize_users(users):
    """Relabel users and groups with consecutive integer ids.

    Every user id that occurs as a key of ``users`` or inside some ``"links"``
    list is assigned an index in ``0 .. n-1``; every group id that occurs in
    some ``"groups"`` list is assigned an index in ``0 .. m-1``. Which id gets
    which index follows set iteration order and is therefore arbitrary.

    Args:
        users: dict mapping a user id to a dict with the keys ``"links"``
            (list of user ids) and ``"groups"`` (list of group ids).

    Returns:
        A new dict keyed by the new user indices whose values are
        ``{"links": [...], "groups": [...]}`` expressed in the new indices.
        Only the keys of ``users`` get an entry; ``users`` is not modified.
    """
    usrs = list(
        set(users.keys()).union(
            set([link for key in users for link in users[key]["links"]])
        )
    )
    groups = list(set([group for key in users for group in users[key]["groups"]]))

    # Lookup tables instead of list.index(): index() is a linear scan, which
    # makes relabelling quadratic in the number of users.
    user_index = {user: position for position, user in enumerate(usrs)}
    group_index = {group: position for position, group in enumerate(groups)}

    return {
        user_index[user]: {
            "links": [user_index[link] for link in data["links"]],
            "groups": [group_index[group] for group in data["groups"]],
        }
        for user, data in users.items()
    }

In [None]:
def filter_users(user_number, all_users):
    """Draw a random sample of well-connected users and relabel them.

    Users with fewer than two links or fewer than two groups are discarded,
    ``user_number`` of the remaining users are sampled without replacement,
    links that point outside the sample are dropped, and the result is passed
    through ``normalize_users``.

    Args:
        user_number: number of users to sample.
        all_users: dict mapping a user id to a dict with ``"links"`` and
            ``"groups"`` lists. It is not modified.

    Returns:
        The value returned by ``normalize_users`` for the sampled sub-network.

    Raises:
        ValueError: raised by ``random.sample`` when ``user_number`` is larger
            than the number of users that pass the filter.
    """
    # add any additional filters here if needed (ex. at least 2 groups and 3 links)

    # filters like this will work only if set for groups because in the next step we are
    # filtering links that aren't in our subgraph
    print("Number of users in full dataset : " + str(len(all_users)))
    eligible = [
        user
        for user in all_users
        if len(all_users[user]["links"]) >= 2 and len(all_users[user]["groups"]) >= 2
    ]
    print("Number of users in filtered dataset : " + str(len(eligible)))

    # random.sample requires a sequence; passing a dict view raises TypeError
    # on Python 3.11+, so sample from the list of eligible ids.
    keys = random.sample(eligible, user_number)
    sampled = set(keys)

    # Build fresh lists for the sampled users only (the caller's data stays
    # untouched without deep-copying the whole dataset) and eliminate any
    # links that lead outside the sampled sub-network.
    users = {
        key: {
            "links": [link for link in all_users[key]["links"] if link in sampled],
            "groups": list(all_users[key]["groups"]),
        }
        for key in keys
    }

    return normalize_users(users)

In [None]:
def get_user_data(links_path, groups_path):
    """Read the link and group-membership files into lookup dictionaries.

    Both inputs are parsed with ``np.loadtxt`` using its default float dtype,
    so user and group ids come out as floats. Column 0 / column 1 of the
    groups file are read as user / group, and column 0 / column 1 of the links
    file as the two endpoints of an undirected link.

    Only users that appear in the groups file are kept; a link is recorded
    only when both endpoints are such users, and each undirected link is
    recorded once (in both users' ``"links"`` lists).

    Args:
        links_path: input for ``np.loadtxt`` holding the user-user links.
        groups_path: input for ``np.loadtxt`` holding user-group memberships.

    Returns:
        A two-element list ``[users, groups_info]`` where ``users`` maps a
        user id to ``{"links": [...], "groups": [...]}`` and ``groups_info``
        maps a group id to ``{"users": [...]}``.
    """
    # ndmin=2 keeps a one-row file two-dimensional; without it loadtxt returns
    # a 1-D array and iterating it would yield scalars instead of rows.
    links = np.loadtxt(links_path, ndmin=2)
    groups = np.loadtxt(groups_path, ndmin=2)
    users = {}  # key is user, value is dict containing all of his links and all of his groups
    groups_info = {}  # key is group, value is dict containing all of its users

    for edge in groups:
        user, group = edge[0], edge[1]
        users.setdefault(user, {"links": [], "groups": []})["groups"].append(group)
        groups_info.setdefault(group, {"users": []})["users"].append(user)

    # Undirected links already recorded, stored with the smaller id first.
    # A set lookup replaces scanning both endpoints' link lists per input row.
    seen_links = set()
    for link in links:
        first, second = link[0], link[1]
        if first not in users or second not in users:
            continue
        link_key = (first, second) if first <= second else (second, first)
        if link_key in seen_links:
            continue
        seen_links.add(link_key)
        users[first]["links"].append(second)
        users[second]["links"].append(first)

    return [users, groups_info]

In [None]:
def user_connections(user, groups_users_list):
    """Return every user directly linked to ``user`` or sharing a group with it.

    ``groups_users_list`` is indexed as ``[users, groups]``: element 0 maps a
    user id to its ``"links"`` and ``"groups"`` lists, element 1 maps a group
    id to its ``"users"`` list. Because group member lists are included as
    they are, ``user`` itself is part of the result whenever it is listed in
    one of its own groups. The result is a list without duplicates, in no
    particular order.
    """
    users_by_id = groups_users_list[0]
    groups_by_id = groups_users_list[1]
    profile = users_by_id[user]

    # direct links first, then the members of every group the user belongs to
    candidates = list(profile["links"])
    candidates += [
        member
        for group in profile["groups"]
        for member in groups_by_id[group]["users"]
    ]
    return list(set(candidates))
    

In [None]:
def tree(final_network, last_added, k, groups_users_list):
    """Grow ``final_network`` by up to ``k`` levels of neighbourhood expansion.

    On each level the connections (see ``user_connections``) of every user
    added on the previous level are collected, and those not yet in
    ``final_network`` are appended to it and become the next level.

    Args:
        final_network: list of users collected so far; extended in place.
        last_added: users added on the previous level (the seed users on the
            first call). This list itself is not modified.
        k: number of levels to expand; values <= 0 expand nothing.
        groups_users_list: ``[users, groups]`` structure passed through to
            ``user_connections``.

    Returns:
        ``final_network`` (the same list object that was passed in).
    """
    frontier = last_added
    # A loop instead of recursion: no recursion-depth limit, and a negative k
    # simply performs no expansion.
    while k > 0 and frontier:
        reachable = []
        for member in frontier:
            reachable.extend(user_connections(member, groups_users_list))
        # keep only users that are new to the network
        frontier = list(set(reachable) - set(final_network))
        final_network.extend(frontier)
        k -= 1
    return final_network

In [None]:
def get_adjacency_matrix(users, key, row_num, col_num):
    """Build a sparse 0/1 adjacency matrix from the ``users`` dictionary.

    Rows are indexed by user and columns by the elements of ``users[user][key]``
    (for example other users for ``"links"`` or groups for ``"groups"``), so
    both must already be integer indices inside the requested shape. An
    element listed more than once for the same user is summed, i.e. its entry
    becomes the number of occurrences.

    Args:
        users: dict mapping a row index to a dict that contains ``key``.
        key: which list of each user's dict supplies the column indices.
        row_num: number of rows of the matrix.
        col_num: number of columns of the matrix.

    Returns:
        A ``scipy.sparse.csc_matrix`` of shape ``(row_num, col_num)`` with
        float entries.
    """
    rows = []
    columns = []
    # Collect coordinates in Python lists; np.append reallocates the whole
    # array on every call, which is quadratic in the number of entries.
    for user, data in users.items():
        elements = data[key]
        rows.extend([user] * len(elements))
        columns.extend(elements)

    values = np.ones(len(rows))
    row_index = np.asarray(rows, dtype=np.intp)
    column_index = np.asarray(columns, dtype=np.intp)
    return csc_matrix((values, (row_index, column_index)), shape=(row_num, col_num))

In [None]:
# Load the raw link and group-membership files; the result is the
# [users, groups_info] list returned by get_user_data.
# This takes some time, so comment it out after the first run.
# (%time is an IPython magic that reports how long the statement took.)
%time all_users_and_groups = get_user_data("data/raw/release-youtube-links.txt", "data/raw/release-youtube-groupmemberships.txt")

In [None]:
# Sanity check: links must be symmetric, i.e. every user has to appear in the
# link list of each of its neighbours. The number of violations must be zero;
# panic if it is not.
if sum(
    1
    for user in all_users_and_groups[0]
    for neigh in all_users_and_groups[0][user]["links"]
    if user not in all_users_and_groups[0][neigh]["links"]
) != 0:
    for i in range(10):
        print("PANIC!")

In [None]:
# Start the expansion from user 1.0 (ids are floats because the raw files are
# read with np.loadtxt) and grow the network by max_tree_level levels.
# tree() extends final_network in place and returns it.
final_network = [1.0]
last_added = [1.0]
max_tree_level = 2
# Keep the full records of the users reached by the expansion. The records are
# taken over unchanged, so their link lists may still name users outside this
# sub-network.
all_users = {user: all_users_and_groups[0][user] for user in tree(final_network, last_added, max_tree_level, all_users_and_groups)}

In [None]:
# Same check as the one on all_users_and_groups: this count MUST be zero.
# It seems that the tree function does not maintain the graph the way it
# should, so the check is disabled by wrapping it in a string literal.
"""
if sum(user not in all_users[neigh]["links"] for user in all_users
   for neigh in all_users[user]["links"]) != 0:
    for i in range(10): print("PANIC!")
"""

In [None]:
def print_basic_stats(dataset, dataset_name):
    """Print ``dataset_name`` followed by one line of summary statistics.

    The second line reports the median, mean, maximum and minimum of
    ``dataset`` as computed by NumPy.
    """
    print(dataset_name)
    middle = np.median(dataset)
    mean = np.mean(dataset)
    largest = np.max(dataset)
    smallest = np.min(dataset)
    print("median:", middle, "average:", mean, "max:", largest, "minimum", smallest)

def _plot_count_distribution(counts, label):
    """Show a boxplot and a histogram of ``counts`` and print its basic stats."""
    fig, axes = plt.subplots(1, 2, constrained_layout=True, squeeze=True)
    axes[0].boxplot(counts)
    axes[0].set_title(label + " boxplot")
    axes[1].hist(counts)
    axes[1].set_title(label + " histogram")
    print_basic_stats(counts, label + " count")


def get_network_stats(users):
    """Print and plot degree statistics of a user network.

    Three distributions are summarised, each with a boxplot, a histogram and
    a printed line of basic statistics: links per user, groups per user, and
    users per group.

    Args:
        users: dict mapping a user id to a dict with ``"links"`` and
            ``"groups"`` lists.
    """
    user_link_count = np.array([len(data["links"]) for data in users.values()])
    user_group_count = np.array([len(data["groups"]) for data in users.values()])

    # One entry per (user, group) membership; counting how often each group
    # occurs gives the number of users per group in a single pass instead of
    # searching the group array once per membership.
    memberships = np.array([group for data in users.values() for group in data["groups"]])
    groups, group_user_count = np.unique(memberships, return_counts=True)
    # float counts, so the printed statistics keep their floating-point form
    group_user_count = group_user_count.astype(float)
    print("There are", len(users), "users and", len(groups), "groups.")

    _plot_count_distribution(user_link_count, "User link")
    _plot_count_distribution(user_group_count, "User group")
    _plot_count_distribution(group_user_count, "Group user")

    plt.show()


In [None]:
# number of users to sample into the working dataset (passed to filter_users)
# WARNING: small number of users will usually result in a small number of links between users
k = 15000

In [None]:
# quickly gets a filtered subset of users: k randomly sampled users of
# all_users, relabelled with consecutive integer ids by filter_users
filtered_users = filter_users(k, all_users)

In [None]:
# print and plot link/group statistics of the sampled network
get_network_stats(filtered_users)


In [None]:
# number of users in the sampled network
len(filtered_users)

In [None]:
# S matrix, matrix of links between users
# (square: one row and one column per user in filtered_users)
s = get_adjacency_matrix(filtered_users, "links", len(filtered_users), len(filtered_users))
s

In [None]:
# A matrix, affiliation matrix, links between users and groups
# how many groups are there? -> the column count is the number of distinct
# group ids that occur in filtered_users
a = get_adjacency_matrix(filtered_users, "groups", len(filtered_users), len(set([group for user in filtered_users for group in filtered_users[user]["groups"]])))
a

## Split data into training and testing

In [None]:
# Hold out round(30%) of every user's group memberships as the test set:
# the held-out entries are zeroed in a_train and end up in a_test.
a_train = a.toarray()
for row in a_train:
    # row is a view into a_train, so zeroing its entries edits a_train itself
    ones = np.transpose(row.nonzero())
    if len(ones) == 0:
        # nothing to hold out for a user without memberships
        continue
    # Sample WITHOUT replacement: randint can draw the same position several
    # times, which silently holds out fewer memberships than intended.
    indices = np.random.choice(len(ones), size=round(len(ones) * 0.3), replace=False)
    row[ones[indices]] = 0
a_test = csc_matrix(a - a_train)

In [None]:
# Hold out round(30%) of every user's remaining training memberships as the
# validation set: they are zeroed in a_train and end up in a_val.
for row in a_train:
    # row is a view into a_train, so zeroing its entries edits a_train itself
    ones = np.transpose(row.nonzero())
    if len(ones) == 0:
        # nothing left to hold out for this user
        continue
    # Sample WITHOUT replacement: randint can draw the same position several
    # times, which silently holds out fewer memberships than intended.
    indices = np.random.choice(len(ones), size=round(len(ones) * 0.3), replace=False)
    row[ones[indices]] = 0
a_val = csc_matrix(a - a_train - a_test)

In [None]:
# Persist the link matrix, the full affiliation matrix and its
# train/validation/test parts in scipy's .npz sparse format.
save_npz("data/yt_s.npz", s)
save_npz("data/yt_a.npz", a)
# a_train is a dense array at this point, so convert it back to sparse first
save_npz("data/yt_a_train.npz", csc_matrix(a_train))
save_npz("data/yt_a_val.npz", a_val)
save_npz("data/yt_a_test.npz", a_test)

In [None]:
def print_matrix_to_file(matrix, output):
    """Write the positions of all entries equal to 1 to a text file.

    One line ``"<row> <column>"`` with 1-based indices is written per entry,
    in row-major order. Entries with any other value are skipped.

    Args:
        matrix: dense two-dimensional array (or nested sequence) to export.
        output: path of the file to write; an existing file is overwritten.
    """
    with open(output, "w") as handle:
        # Locate the ones in one vectorised step instead of visiting every
        # element in a Python loop; argwhere returns positions in row-major order.
        positions = np.argwhere(np.asarray(matrix) == 1)
        handle.writelines(
            str(i + 1) + " " + str(j + 1) + "\n" for i, j in positions.tolist()
        )

In [None]:
# takes a terribly long time
# Export every matrix as a text file of 1-based "row column" pairs
# (see print_matrix_to_file); sparse matrices are densified first.
data_path = [(a_train, "data/random_katz/yt_a_train.txt"), (a_val.toarray(), "data/random_katz/yt_a_val.txt"), 
             (a_test.toarray(), "data/random_katz/yt_a_test.txt"), (a.toarray(), "data/random_katz/yt_a.txt"),
            (s.toarray(), "data/random_katz/yt_s.txt")]

for data, path in tqdm(data_path):
    print_matrix_to_file(data, path)