In [None]:
import numpy as np
import scipy
from scipy.sparse import csc_matrix, bmat, save_npz, load_npz
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt
from tqdm import tqdm
import random
import copy

## Data preparation
- reading user data from given files (we are considering only users that are in some group here)
- random sample of filtered users
- normalizing user data (renaming users to elements of $\{0, ..., n-1\}$)

In [None]:
def normalize_users(users):
    """Rename users and groups to consecutive integer ids.

    Args:
        users: dict mapping a user id to ``{"links": [...], "groups": [...]}``.

    Returns:
        A new dict with the same structure in which every user id (keys and
        link targets) is replaced by its position in the list of all distinct
        user ids, and every group id by its position in the list of all
        distinct group ids. A link target that is not itself a key of
        ``users`` still receives an id, but gets no entry of its own.
    """
    usrs = list(set(users.keys()).union(set(link for key in users for link in users[key]["links"])))
    groups = list(set(group for key in users for group in users[key]["groups"]))

    # Position lookups through dicts instead of list.index(): same mapping,
    # but constant time per lookup instead of a linear scan of the id list.
    user_index = {user: position for position, user in enumerate(usrs)}
    group_index = {group: position for position, group in enumerate(groups)}

    normalized_users = {}
    for user, data in users.items():
        normalized_users[user_index[user]] = {
            "links": [user_index[link] for link in data["links"]],
            "groups": [group_index[group] for group in data["groups"]],
        }
    return normalized_users

In [None]:
def get_user_data(links_path, groups_path):
    """Load the friendship links and group memberships from two text files.

    Both files are read with ``np.loadtxt``; in the groups file the first
    column is used as the user id and the second as the group id, in the links
    file the first two columns are the two linked users.

    Args:
        links_path: path of the user-user links file.
        groups_path: path of the user-group memberships file.

    Returns:
        ``[users, groups_info]`` where ``users`` maps a user id to
        ``{"links": [...], "groups": [...]}`` and ``groups_info`` maps a group
        id to ``{"users": [...]}``. Only users that appear in the groups file
        are kept, and a link is stored (once, in both directions) only when
        both of its endpoints are such users.
    """
    # ndmin=2 keeps a one-line file as a single row; without it loadtxt returns
    # a flat 1-D array and iterating it yields scalars instead of rows.
    links = np.loadtxt(links_path, ndmin=2)
    groups = np.loadtxt(groups_path, ndmin=2)
    users = {}  # key is user, value is dict containing all of his links and all of his groups
    groups_info = {}  # key is group, value is dict containing all of its users

    for edge in groups:
        user, group = edge[0], edge[1]
        if user not in users:
            users[user] = {"links": [], "groups": []}
        users[user]["groups"].append(group)
        if group not in groups_info:
            groups_info[group] = {"users": []}
        groups_info[group]["users"].append(user)

    # Links are always stored symmetrically, so remembering each unordered pair
    # in a set is equivalent to scanning both link lists, at O(1) per check.
    seen_pairs = set()
    for link in links:
        first, second = link[0], link[1]
        if first not in users or second not in users:
            continue
        pair = (first, second) if first <= second else (second, first)
        if pair in seen_pairs:
            continue
        seen_pairs.add(pair)
        users[first]["links"].append(second)
        users[second]["links"].append(first)

    return [users, groups_info]

In [None]:
def user_friends(user, groups_users_list):
    """Return a fresh list with all direct friends (links) of ``user``.

    ``groups_users_list`` is the ``[users, groups_info]`` pair; only its first
    element is read here.
    """
    return list(groups_users_list[0][user]["links"])

In [None]:
def user_group_friends(user, groups_users_list):
    """Return every member of every group that ``user`` belongs to.

    Members are listed group by group, so a user sharing several groups shows
    up several times, and ``user`` is included as a member of their own groups.
    """
    return [
        member
        for group in groups_users_list[0][user]["groups"]
        for member in groups_users_list[1][group]["users"]
    ]

In [None]:
def reduce_group_friends(user, all_users_and_groups, k):
    """Return the group friends of ``user`` that have similar taste.

    A group friend is kept when at least ``k`` of the user's groups are also
    groups of that friend. The result follows the order (and repetitions) of
    ``user_group_friends``.
    """
    similar_friends = []
    for friend in user_group_friends(user, all_users_and_groups):
        shared = 0
        for value in all_users_and_groups[0][user]["groups"]:
            if value in all_users_and_groups[0][friend]["groups"]:
                shared += 1
        if shared >= k:
            similar_friends.append(friend)
    return similar_friends

In [None]:
def tree(user1, groups_users_list, i):
    """Expand the friendship network around ``user1`` level by level.

    Args:
        user1: the starting user.
        groups_users_list: the ``[users, groups_info]`` pair.
        i: number of levels to expand.

    Returns:
        A list that starts with ``user1`` and then contains, for every
        expanded level, all friends of the users newly reached on the previous
        level. Users are repeated as many times as they are reached.
    """
    last_added = [user1]
    network = [user1]
    # `i > 0` instead of `i != 0`: a negative or fractional depth never hits
    # exactly zero and would loop forever. Stopping once no new user was
    # reached changes nothing, since further levels would all be empty.
    while i > 0 and last_added:
        tree_level = []
        for friend in last_added:
            tree_level.extend(user_friends(friend, groups_users_list))
        # only users not seen before are expanded on the next level
        last_added = list(set(tree_level) - set(network))
        network.extend(tree_level)
        i = i - 1
    return network

In [None]:
def frequency(list):
    """Return a dict mapping each item of ``list`` to its number of occurrences."""
    counts = {}
    for entry in list:
        counts[entry] = counts.get(entry, 0) + 1
    return counts

In [None]:
def reduce_network(network, groups_users_list, min_occurrences=2, min_groups=4):
    """Keep only the well-connected, active users of a network.

    Prints the mean number of occurrences per distinct user and the size of
    the reduced network.

    Args:
        network: list of users, possibly with repetitions.
        groups_users_list: the ``[users, groups_info]`` pair.
        min_occurrences: minimum number of times a user must appear in
            ``network`` to be kept (default 2).
        min_groups: minimum number of groups a user must belong to in order to
            be kept (default 4).

    Returns:
        List of the distinct users that satisfy both thresholds.
    """
    dict_freq = frequency(network)
    # an empty network has no mean; skip the print instead of dividing by zero
    if dict_freq:
        print(sum(dict_freq.values()) / len(dict_freq.values()))
    network1 = [
        user
        for user, occurrences in dict_freq.items()
        if occurrences >= min_occurrences
        and len(groups_users_list[0][user]["groups"]) >= min_groups
    ]
    print(len(network1))
    return network1

In [None]:
def group_friends_list(network1, groups_users_list):
    """Collect the distinct similar-taste group friends of all users in ``network1``.

    Each user's group friends are obtained with ``reduce_group_friends`` using
    a threshold of 4 common groups; duplicates are removed from the result.
    """
    collected = [
        friend
        for user in network1
        for friend in reduce_group_friends(user, groups_users_list, 4)
    ]
    return list(set(collected))

In [None]:
def network_union(root, groups_users_list, i):
    """Build the final user set around ``root``.

    Expands ``i`` levels of friendship links from ``root``, reduces the result
    with ``reduce_network`` and returns the distinct users of that reduced
    network together with their group friends.
    """
    reached = tree(root, groups_users_list, i)
    core_users = reduce_network(reached, groups_users_list)
    via_groups = group_friends_list(core_users, groups_users_list)
    return list(set(core_users).union(set(via_groups)))

In [None]:
def dict_users_groups(network, groups_users_list):
    """Restrict the user data to the users of ``network``.

    Args:
        network: iterable of users to keep.
        groups_users_list: the ``[users, groups_info]`` pair.

    Returns:
        Dict mapping every user of ``network`` to ``{"links": [...],
        "groups": [...]}`` where ``links`` holds only the friends that are
        themselves in ``network`` and ``groups`` is a copy of the user's
        groups.
    """
    network = list(network)
    # membership is tested once per link; a set makes each test O(1) instead
    # of a scan over the whole network list
    members = set(network)
    all_users = groups_users_list[0]
    network_dict = {}
    for user in network:
        network_dict[user] = {
            "links": [link for link in all_users[user]["links"] if link in members],
            "groups": list(all_users[user]["groups"]),
        }
    return network_dict

In [None]:
# Load the raw link and group-membership files into the [users, groups_info] pair.
# This takes some time, so comment it out after the first run.
%time all_users_and_groups = get_user_data("data/raw/release-youtube-links.txt", "data/raw/release-youtube-groupmemberships.txt")

In [None]:
# Build the working network: expand 5 levels of friend links starting from user 1,
# reduce/extend it with network_union, then keep only links between the selected users.
%time final_network_dict=dict_users_groups(network_union(1,all_users_and_groups,5), all_users_and_groups)

In [None]:
from statistics import mode

# Per-user counts, computed once and reused by all the summary lines below.
link_counts = [len(final_network_dict[user]['links']) for user in final_network_dict]
group_counts = [len(final_network_dict[user]['groups']) for user in final_network_dict]

print(len(final_network_dict))
print("mean number of friends per user: " + str(np.mean(link_counts)))
print("mode number of friends per user: " + str(mode(link_counts)))
print("min number of friends per user: " + str(np.min(link_counts)))
print("min number of groups per user: " + str(np.min(group_counts)))
print("mean number of groups per user: " + str(np.mean(group_counts)))
print("mode number of groups per user: " + str(mode(group_counts)))

In [None]:
def filter_data(final_network, groups_users_list):
    """Return the user records of the users listed in ``final_network``.

    The returned dict maps each user to the very same record object stored in
    ``groups_users_list[0]`` (no copy is made).
    """
    selected = {}
    for user in final_network:
        selected[user] = groups_users_list[0][user]
    return selected

In [None]:
def get_adjacency_matrix(users, key, row_num, col_num):
    """Construct a sparse adjacency matrix from the user dictionary.

    Args:
        users: dict mapping a user index to a record; ``users[user][key]`` is
            the list of column indices connected to that user.
        key: which list of the record to use (the callers in this notebook
            pass ``"links"`` or ``"groups"``).
        row_num: number of rows of the matrix (rows are indexed by user).
        col_num: number of columns of the matrix (indexed based on ``key``).

    Returns:
        A ``csc_matrix`` of shape ``(row_num, col_num)`` with a 1.0 entry for
        every (user, element) pair.
    """
    # Collect coordinates in Python lists and convert once; np.append copies
    # the whole array on every call, which makes the original loop quadratic.
    rows = []
    cols = []
    for user in users:
        for element in users[user][key]:
            rows.append(user)
            cols.append(element)

    row = np.asarray(rows, dtype=np.int64)
    column = np.asarray(cols, dtype=np.int64)
    value = np.ones(len(rows), dtype=np.float64)

    # debug output of the largest indices; skipped when there are no entries,
    # because max() of an empty sequence raises
    if len(rows) > 0:
        print(row.max(), column.max(), row_num)
    s = csc_matrix((value, (row, column)), shape=(row_num, col_num))
    return s

In [None]:
# Rename users and groups to consecutive integer ids so they can be used as matrix row/column indices.
final_network_dict_norm=normalize_users(final_network_dict)

In [None]:
# Sanity check: links must be symmetric, i.e. every user is listed among the links of each of their neighbours.
assert 0 == sum(
    1
    for user in final_network_dict_norm
    for neigh in final_network_dict_norm[user]["links"]
    if user not in final_network_dict_norm[neigh]["links"]
)

In [None]:
def print_matrix_to_file(matrix, output):
    """Write the positions of all entries equal to 1 to a text file.

    Args:
        matrix: dense 2-D matrix (array or list of equal-length rows).
        output: path of the file to write; it is overwritten.

    One line ``"<row> <column>"`` is written per entry equal to 1, with both
    indices 1-based, in row-major order.
    """
    dense = np.asarray(matrix)
    # argwhere lists the matching (row, column) pairs in row-major order, the
    # same order as nested row/column loops, without a Python-level visit of
    # every cell of the matrix.
    positions = np.argwhere(dense == 1).tolist()
    with open(output, "w") as handle:
        for i, j in positions:
            handle.write(str(i + 1) + " " + str(j + 1) + "\n")

In [None]:
# S matrix, matrix of links between users
# (square: both rows and columns are indexed by the normalized user ids)
s = get_adjacency_matrix(final_network_dict_norm,'links' ,len(final_network_dict_norm), len(final_network_dict_norm))
# bare last expression: display the matrix in the cell output
s

In [None]:
# Saving memory: persist S to disk (sparse .npz plus a text list of its 1-entries)
# and then drop it from the kernel.
save_npz("data/yt_s.npz", s)
print_matrix_to_file(s.toarray(), "data/random_katz/yt_s.txt")
del s

In [None]:
# A matrix, affiliation matrix, links between users and groups
# Rows are users; the number of columns is the number of distinct groups
# that appear in the normalized network.
a = get_adjacency_matrix(final_network_dict_norm, "groups", len(final_network_dict_norm), len(set([group for user in final_network_dict_norm for group in final_network_dict_norm[user]["groups"]])))
# bare last expression: display the matrix in the cell output
a

## Incredible functions for visualizations

In [None]:
def print_basic_stats(dataset, dataset_name):
    """Print ``dataset_name`` followed by the median, mean, max and min of ``dataset``."""
    median_value = np.median(dataset)
    mean_value = np.mean(dataset)
    largest = np.amax(dataset)
    smallest = np.amin(dataset)
    print(dataset_name)
    print("median:", median_value, "average:", mean_value,
          "max:", largest, "minimum", smallest)

def _plot_count_summary(counts, label):
    """Draw a boxplot and a histogram of ``counts`` side by side and print its basic stats."""
    fig, axs = plt.subplots(1, 2, constrained_layout=True, squeeze=True)
    axs[0].boxplot(counts)
    axs[0].set_title(label + " boxplot")
    axs[1].hist(counts)
    axs[1].set_title(label + " histogram")
    print_basic_stats(counts, label + " count")


def get_network_stats(users):
    """Print and plot summary statistics of a user network.

    Args:
        users: dict mapping a user to ``{"links": [...], "groups": [...]}``.

    Prints the number of users and distinct groups, then shows a boxplot and a
    histogram with basic statistics for three distributions: links per user,
    groups per user, and users per group. Returns nothing.
    """
    all_users = np.array(list(users.keys()))
    user_link_count = np.array([len(users[user]["links"]) for user in all_users])
    user_group_count = np.array([len(users[user]["groups"]) for user in all_users])

    # One np.unique call yields both the distinct groups and how many
    # memberships each has: group_user_count[i] is the user count of groups[i].
    # This replaces a np.where scan per membership (and its duplicate check,
    # which could never trigger because np.unique returns distinct values).
    memberships = np.array([group for user in users for group in users[user]["groups"]])
    groups, group_user_count = np.unique(memberships, return_counts=True)
    # counts were accumulated in a float array before; keep that dtype so the
    # printed statistics look the same
    group_user_count = group_user_count.astype(float)
    print("There are", len(all_users), "users and", len(groups), "groups.")

    # statistics and plots for each of the three distributions
    _plot_count_summary(user_link_count, "User link")
    _plot_count_summary(user_group_count, "User group")
    _plot_count_summary(group_user_count, "Group user")

    plt.show()

## Split data into training and testing

In [None]:
# Hold out about 10% of every user's group memberships as the test set.
np.random.seed(0)  # fixed seed so the split is reproducible on Restart & Run All
a_train = a.toarray()
for row in a_train:
    ones = np.flatnonzero(row)  # column indices of this user's memberships
    n_holdout = round(len(ones) * 0.1)
    if n_holdout == 0:
        continue
    # replace=False: sampling with replacement can draw the same membership
    # twice, which silently holds out fewer entries than intended
    indices = np.random.choice(len(ones), size=n_holdout, replace=False)
    row[ones[indices]] = 0  # rows are views, so this edits a_train in place
a_train = csc_matrix(a_train)
# the test matrix holds exactly the entries that were removed from a
a_test = a - a_train

In [None]:
# This takes up all my RAM, so we do it this way: write the test split to disk and free it right away.
save_npz("data/yt_a_test.npz", csc_matrix(a_test))
del a_test

In [None]:
# Hold out about 20% of every user's memberships still present in a_train for validation.
a_train = a_train.toarray()
for row in a_train:
    ones = np.flatnonzero(row)  # column indices of the memberships still in this row
    n_holdout = round(len(ones) * 0.2)
    if n_holdout == 0:
        continue
    # replace=False: sampling with replacement can draw the same membership
    # twice, which silently holds out fewer entries than intended
    indices = np.random.choice(len(ones), size=n_holdout, replace=False)
    row[ones[indices]] = 0  # rows are views, so this edits a_train in place
a_train = csc_matrix(a_train)
# every entry of a that is no longer in a_train
a_val = a - a_train

save_npz("data/yt_a_train.npz", a_train)
print_matrix_to_file(a_train.toarray(), "data/random_katz/yt_a_train.txt")
save_npz("data/yt_a.npz", a)

# free the large matrices; they can be reloaded from the .npz files
del a_train
del a

In [None]:
# Reload the saved test split and subtract it from a_val, then free it again.
a_test = load_npz("data/yt_a_test.npz")
a_val = a_val - a_test
del a_test

In [None]:
# Persist the validation split as a sparse .npz file.
save_npz("data/yt_a_val.npz", csc_matrix(a_val))