## UPOZORENJA
- ćelija za učitavanje podataka treba malo više vremena (get_user_data), pozvati jednom i nakon toga zakomentirati

## TODO-ovi
- smisliti kako ćemo odabrati bolji dio podataka

In [1]:
import numpy as np
import scipy
from scipy.sparse import csc_matrix, bmat, save_npz
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt
from tqdm import tqdm
import random
import copy

## Data preparation
- reading user data from given files (we are considering only users that are in some group here)
- random sample of filtered users
- normalizing user data (renaming users to elements of $\{0, ..., n-1\}$)

In [2]:
def normalize_users(users):
    usrs = list(set(users.keys()).union(set([link  for key in users for link in users[key]["links"] ])))
    groups = list(set([group for key in users for group in users[key]["groups"] ]))
    normalized_users = {}
    for user in users:
        normalized_users[usrs.index(user)] = {}
        normalized_users[usrs.index(user)]["links"] = [usrs.index(link) for link in users[user]["links"]]
        normalized_users[usrs.index(user)]["groups"] = [groups.index(group) for group in users[user]["groups"]]
    return normalized_users    

In [5]:
def filter_users(user_number, all_users):
    # add any additional filters here if needed (ex. at least 2 groups and 3 links)
    
    # filters like this will work only if set for groups because in the next step we are 
    # filtering links that aren't in our subgraph
    users = copy.deepcopy(all_users)
    print("Number of users in full dataset : " + str(len(users)))
    users = {user: users[user] for user in users if len(users[user]["links"]) >= 0 and len(users[user]["groups"]) >= 0}
    print("Number of users in filtered dataset : " + str(len(users)))
    
    keys = random.sample(users.keys(), user_number)
    users = {key: users[key] for key in keys}
    
    #first eliminate any links that remained in the graph and shouldn't be there
    for user in users:
        users[user]["links"] = list(filter(lambda link: link in users.keys(), users[user]["links"]))
        
    return normalize_users(users)

In [6]:
def get_user_data(links_path, groups_path):
    # return all user data where every user is in at least one group
    links = np.loadtxt(links_path)
    groups = np.loadtxt(groups_path)
    users = {} # key is user, value is dict containing all of his links and all of his groups
    
    for edge in groups:
        if edge[0] not in users:
            users[edge[0]] = { "links": [], "groups":[] }
        users[edge[0]]["groups"].append(edge[1])
        
    for link in links:
        if link[0] in users:
            users[link[0]]["links"].append(link[1])
        if link[1] in users:
            users[link[1]]["links"].append(link[0])
    
    return users

In [7]:
def get_adjacency_matrix(users, key, row_num, col_num):
    # constructs adjacency matrix
    # rows are indexed by user
    # cols are indexed based on the key
    row = np.array([])
    column = np.array([])
    value = np.array([])
    
    for user in users:
        for element in users[user][key]:
            row = np.append(row, user)
            column = np.append(column, element)
            value = np.append(value, 1)
    
    s = csc_matrix((value, (row, column)), shape = (row_num, col_num))
    return s

In [8]:
# load data
# this takes some time so comment it out after first run
%time all_users = get_user_data("data/release-youtube-links.txt", "data/release-youtube-groupmemberships.txt")

CPU times: user 28 s, sys: 233 ms, total: 28.2 s
Wall time: 28.3 s


In [18]:
grps = [group for user in all_users for group in all_users[user]["groups"]]

In [21]:
unique, counts = np.unique(grps, return_counts=True)

In [31]:
counts[-20:]

array([ 832,  844,  899,  946,  987, 1074, 1074, 1104, 1107, 1111, 1160,
       1226, 1271, 1313, 1349, 1544, 1645, 3084, 3534, 7591])

In [32]:
len(all_users)

94238

In [8]:
# number of users
# WARNING: small number of users will usually result in a small number of links between users
k = 10000

In [9]:
# quickly gets a filtered subset of users
filtered_users = filter_users(k, all_users)

Number of users in full dataset : 94238
Number of users in filtered dataset : 16077


In [10]:
# S matrix, matrix of links between users
s = get_adjacency_matrix(filtered_users, "links", len(filtered_users), len(filtered_users))
s

<10000x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 79346 stored elements in Compressed Sparse Column format>

In [11]:
# A matrix, affiliation matrix, links between users and groups
# how many groups are there?
a = get_adjacency_matrix(filtered_users, "groups", len(filtered_users), len(set([group for user in filtered_users for group in filtered_users[user]["groups"]])))
a

<10000x17129 sparse matrix of type '<class 'numpy.float64'>'
	with 103641 stored elements in Compressed Sparse Column format>

## Split data into training and testing

In [12]:
a_train = a.toarray()
for row in a_train:
    ones = np.transpose(row.nonzero())
    indices = np.random.randint(len(ones), size = round(len(ones)*0.3))
    row[ones[indices]] = 0
a_test = csc_matrix(a - a_train)

In [13]:
for row in a_train:
    ones = np.transpose(row.nonzero())
    indices = np.random.randint(len(ones), size = round(len(ones)*0.3))
    row[ones[indices]] = 0
a_val = csc_matrix(a - a_train - a_test)

In [14]:
np.sum(a_train), np.sum(a_val), np.sum(a_test)

(54155.0, 21235.0, 28251.0)

In [15]:
save_npz("data/s.npz", s)
save_npz("data/a.npz", a)
save_npz("data/a_train.npz", csc_matrix(a_train))
save_npz("data/a_val.npz", csc_matrix(a_val))
save_npz("data/a_test.npz", csc_matrix(a_test))