## UPOZORENJA
- ćelija za učitavanje podataka treba malo više vremena (get_user_data), pozvati jednom i nakon toga zakomentirati

## TODO-ovi
- smisliti kako ćemo odabrati bolji dio podataka

In [2]:
import numpy as np
import scipy
from scipy.sparse import csc_matrix, bmat, save_npz
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt
from tqdm import tqdm
import random
import copy

## Data preparation
- reading user data from given files (we are considering only users that are in some group here)
- random sample of filtered users
- normalizing user data (renaming users to elements of $\{0, \dots, n-1\}$)

In [3]:
def normalize_users(users):
    """Relabel users and groups with consecutive integer ids starting at 0.

    Args:
        users: dict mapping a user id to a dict with the keys ``"links"``
            (list of user ids) and ``"groups"`` (list of group ids).

    Returns:
        A new dict with the same structure in which every user id (both the
        keys and the ids found in ``"links"``) is replaced by an index in
        ``0..n_users-1`` and every group id by an index in ``0..n_groups-1``.
        Users that only appear as a link target also receive an index, but
        get no entry of their own. The concrete numbering is arbitrary
        (it follows set iteration order).
    """
    user_ids = set(users)
    for data in users.values():
        user_ids.update(data["links"])
    group_ids = {group for data in users.values() for group in data["groups"]}

    # Lookup tables give O(1) relabelling; calling list.index() for every
    # link and group made this function quadratic in the number of users.
    user_index = {user: index for index, user in enumerate(user_ids)}
    group_index = {group: index for index, group in enumerate(group_ids)}

    return {
        user_index[user]: {
            "links": [user_index[link] for link in data["links"]],
            "groups": [group_index[group] for group in data["groups"]],
        }
        for user, data in users.items()
    }

In [4]:
def filter_users(user_number, all_users):
    """Keep well-connected users and relabel them with consecutive ids.

    A user is kept when it has at least 2 links and at least 2 groups.
    Links pointing to users that were not kept are then dropped, so the
    link threshold is only guaranteed before that pruning step; thresholds
    on groups survive unchanged.

    Args:
        user_number: currently unused. It was the size of a random sample
            of the filtered users; that sampling step is disabled.
        all_users: dict mapping a user id to a dict with the keys
            ``"links"`` and ``"groups"``. It is not modified.

    Returns:
        The filtered users, relabelled by ``normalize_users``.
    """
    # add any additional filters here if needed (ex. at least 2 groups and 3 links)
    print("Number of users in full dataset : " + str(len(all_users)))
    # Copy only the users that pass the filter instead of deep-copying the
    # whole dataset up front and throwing most of the copy away.
    users = {
        user: copy.deepcopy(data)
        for user, data in all_users.items()
        if len(data["links"]) >= 2 and len(data["groups"]) >= 2
    }
    print("Number of users in filtered dataset : " + str(len(users)))

    # eliminate any links that point outside of the filtered subgraph
    for data in users.values():
        data["links"] = [link for link in data["links"] if link in users]

    return normalize_users(users)

In [5]:
def get_user_data(links_path, groups_path):
    """Load the link and group-membership edge lists into dictionaries.

    Both files are read with ``np.loadtxt`` and iterated row by row; the
    first two columns of a groups row are (user, group) and of a links row
    (source user, target user).

    Returns:
        A two-element list ``[users, groups_info]``:
        ``users`` maps each user that belongs to at least one group to
        ``{"links": [...], "groups": [...]}``; ``groups_info`` maps each
        group to ``{"users": [...]}``. Only links whose both endpoints are
        such users are recorded.
    """
    link_edges = np.loadtxt(links_path)
    membership_edges = np.loadtxt(groups_path)

    users = {}        # user -> {"links": [...], "groups": [...]}
    groups_info = {}  # group -> {"users": [...]}

    # Memberships come first: they decide which users exist at all.
    for membership in membership_edges:
        member, group = membership[0], membership[1]
        users.setdefault(member, {"links": [], "groups": []})["groups"].append(group)
        groups_info.setdefault(group, {"users": []})["users"].append(member)

    # Keep a link only when both of its endpoints are known users.
    for link in link_edges:
        source, target = link[0], link[1]
        if source not in users or target not in users:
            continue
        users[source]["links"].append(target)

    return [users, groups_info]

In [6]:
def user_connections(user, groups_users_list):
    """Return the distinct users connected to ``user``.

    ``groups_users_list`` is the ``[users, groups_info]`` pair. The result
    holds the user's direct links plus every member of each group the user
    belongs to (which includes the user itself whenever it is listed as a
    member of one of its groups). The order of the returned list is
    unspecified.
    """
    profile = groups_users_list[0][user]
    reachable = set(profile["links"])
    for group in profile["groups"]:
        reachable.update(groups_users_list[1][group]["users"])
    return list(reachable)
    

In [18]:
def filter_data(final_network, groups_users_list):
    """Return the user records of the users listed in ``final_network``.

    The records are looked up in ``groups_users_list[0]`` (the users
    dictionary) and are shared with it, not copied.
    """
    selected = {}
    for member in final_network:
        selected[member] = groups_users_list[0][member]
    return selected

In [7]:
def tree(final_network, last_added, k, groups_users_list):
    """Grow a user network level by level from the most recently added users.

    At every level the connections (links and fellow group members, see
    ``user_connections``) of the users added on the previous level are
    collected, and those not yet in the network are appended to it.

    Args:
        final_network: list of users already in the network. It is extended
            IN PLACE and also returned.
        last_added: users whose connections are expanded on the first level.
        k: number of levels to expand. Values <= 0 leave the network
            untouched (previously a negative value recursed without end).
        groups_users_list: the ``[users, groups_info]`` pair.

    Returns:
        ``final_network`` after ``k`` expansion levels.
    """
    frontier = last_added
    while k > 0:
        tree_level = []
        for friend in frontier:
            tree_level.extend(user_connections(friend, groups_users_list))
        # Only users that are not part of the network yet form the next level.
        frontier = list(set(tree_level) - set(final_network))
        final_network.extend(frontier)
        # Progress output kept from the original notebook: the network after this level.
        print(final_network)
        k -= 1
    return final_network

In [8]:
def get_adjacency_matrix(users, key, row_num, col_num):
    """Build a sparse 0/1 adjacency matrix from the user dictionary.

    Args:
        users: dict mapping an integer user index to a dict that holds a
            list of integer column indices under ``key``.
        key: which list to use for the columns (e.g. ``"links"`` or
            ``"groups"``).
        row_num: number of rows of the matrix.
        col_num: number of columns of the matrix.

    Returns:
        A ``csc_matrix`` of shape ``(row_num, col_num)`` and dtype float64
        with a 1 at ``(user, element)`` for every element of
        ``users[user][key]``. Repeated pairs are summed by scipy.
    """
    # Collect the coordinates in Python lists: np.append copies the whole
    # array on every call, which made the construction quadratic.
    rows = []
    columns = []
    for user, data in users.items():
        for element in data[key]:
            rows.append(user)
            columns.append(element)

    # Explicit integer indices; the values stay float64 as before.
    row = np.asarray(rows, dtype=np.int64)
    column = np.asarray(columns, dtype=np.int64)
    value = np.ones(len(rows), dtype=np.float64)

    return csc_matrix((value, (row, column)), shape=(row_num, col_num))

In [9]:
# Load the full link and group-membership data set.
# This takes some time (about 33 s wall time in the recorded run), so comment
# it out after the first run and reuse all_users_and_groups.
# all_users_and_groups is the [users, groups_info] pair returned by get_user_data.
%time all_users_and_groups = get_user_data("release-youtube-links.txt", "release-youtube-groupmemberships.txt")

Wall time: 33.1 s


In [None]:
# Grow the working network outwards from a single seed user (id 1.0).
# tree() expands it max_tree_level times through links and shared groups and
# extends final_network in place; filter_data() then keeps only the records
# of the users that were reached.
final_network = [1.0]
last_added = [1.0]
max_tree_level = 3 
all_users = filter_data(tree(final_network, last_added, max_tree_level, all_users_and_groups),all_users_and_groups)

In [14]:
# number of users
# WARNING: small number of users will usually result in a small number of links between users
# NOTE(review): k is passed to filter_users as user_number, but the random
# sampling there is commented out, so it currently has no effect.
k = 1000

In [15]:
# Filter all_users (at least 2 links and at least 2 groups per user), drop
# links that leave the filtered set, and relabel users and groups with
# consecutive integer ids.
filtered_users = filter_users(k, all_users)

Number of users in full dataset : 94238
Number of users in filtered dataset : 26968


In [16]:
# S matrix: user-by-user matrix of links between the filtered users
# (square, one row and one column per filtered user)
s = get_adjacency_matrix(filtered_users, "links", len(filtered_users), len(filtered_users))
s

<26968x26968 sparse matrix of type '<class 'numpy.float64'>'
	with 389470 stored elements in Compressed Sparse Column format>

In [26]:
# A matrix: affiliation matrix linking users (rows) to groups (columns).
# The column count is the number of distinct groups among the filtered users.
a = get_adjacency_matrix(
    filtered_users,
    "groups",
    len(filtered_users),
    len({group for user in filtered_users.values() for group in user["groups"]}),
)
a

<1000x4922 sparse matrix of type '<class 'numpy.float64'>'
	with 10020 stored elements in Compressed Sparse Column format>

## Split data into training and testing

In [36]:
# Hold out about 30% of every user's group memberships as the test set.
# a_train is a dense copy of a; the held-out entries are zeroed in place.
a_train = a.toarray()
for row in a_train:
    ones = row.nonzero()[0]
    holdout_size = round(len(ones) * 0.3)
    if holdout_size == 0:
        # nothing to hold out for this user
        continue
    # Sample WITHOUT replacement: np.random.randint could draw the same
    # position several times, so fewer entries than intended were removed.
    holdout = np.random.choice(ones, size=holdout_size, replace=False)
    row[holdout] = 0
# The test set is exactly what was removed from the training matrix.
a_test = csc_matrix(a - a_train)

In [37]:
# Hold out about 30% of every user's REMAINING training memberships as the
# validation set (a_train is modified in place again).
for row in a_train:
    ones = row.nonzero()[0]
    holdout_size = round(len(ones) * 0.3)
    if holdout_size == 0:
        # nothing to hold out for this user
        continue
    # Sample WITHOUT replacement: np.random.randint could draw the same
    # position several times, so fewer entries than intended were removed.
    holdout = np.random.choice(ones, size=holdout_size, replace=False)
    row[holdout] = 0
# The validation set is whatever is neither in train nor in test.
a_val = csc_matrix(a - a_train - a_test)

In [39]:
# Persist all matrices in scipy's sparse .npz format under data/.
# NOTE(review): nothing in this notebook creates the data/ directory — confirm it exists before running.
save_npz("data/s.npz", s)  # user-user link matrix
save_npz("data/a.npz", a)  # full user-group affiliation matrix
save_npz("data/a_train.npz", csc_matrix(a_train))  # a_train is dense, convert back to sparse
save_npz("data/a_val.npz", a_val)
save_npz("data/a_test.npz", a_test)