## UPOZORENJA
- ćelija za učitavanje podataka treba malo više vremena (get_user_data), pozvati jednom i nakon toga zakomentirati

## TODO-ovi
- smisliti kako ćemo odabrati bolji dio podataka

In [62]:
import numpy as np
import scipy
from scipy.sparse import csc_matrix, bmat, save_npz
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt
from tqdm import tqdm
import random
import copy

## Data preparation
- reading user data from given files (we are considering only users that are in some group here)
- random sample of filtered users
- normalizing user data (renaming users to elements of $\{0, ..., n-1\}$)

In [2]:
def normalize_users(users):
    usrs = list(set(users.keys()).union(set([link  for key in users for link in users[key]["links"] ])))
    groups = list(set([group for key in users for group in users[key]["groups"] ]))
    normalized_users = {}
    for user in users:
        normalized_users[usrs.index(user)] = {}
        normalized_users[usrs.index(user)]["links"] = [usrs.index(link) for link in users[user]["links"]]
        normalized_users[usrs.index(user)]["groups"] = [groups.index(group) for group in users[user]["groups"]]
    return normalized_users    

In [84]:
def filter_users(user_number, all_users):
    # add any additional filters here if needed (ex. at least 2 groups and 3 links)
    
    # filters like this will work only if set for groups because in the next step we are 
    # filtering links that aren't in our subgraph
    users = copy.deepcopy(all_users)
    print("Number of users in full dataset : " + str(len(users)))
    users = {user: users[user] for user in users if len(users[user]["links"]) >= 3 and len(users[user]["groups"]) >= 4}
    print("Number of users in filtered dataset : " + str(len(users)))
    
    keys = random.sample(users.keys(), user_number)
    users = {key: users[key] for key in keys}
    
    #first eliminate any links that remained in the graph and shouldn't be there
    for user in users:
        users[user]["links"] = list(filter(lambda link: link in users.keys(), users[user]["links"]))
        
    return normalize_users(users)

In [4]:
def get_user_data(links_path, groups_path):
    # return all user data where every user is in at least one group
    links = np.loadtxt(links_path)
    groups = np.loadtxt(groups_path)
    users = {} # key is user, value is dict containing all of his links and all of his groups
    
    for edge in groups:
        if edge[0] not in users:
            users[edge[0]] = { "links": [], "groups":[] }
        users[edge[0]]["groups"].append(edge[1])
        
    for link in links:
        if link[0] in users:
            users[link[0]]["links"].append(link[1])
        if link[1] in users:
            users[link[1]]["links"].append(link[0])
    
    return users

In [5]:
def get_adjacency_matrix(users, key, row_num, col_num):
    # constructs adjacency matrix
    # rows are indexed by user
    # cols are indexed based on the key
    row = np.array([])
    column = np.array([])
    value = np.array([])
    
    for user in users:
        for element in users[user][key]:
            row = np.append(row, user)
            column = np.append(column, element)
            value = np.append(value, 1)
    
    s = csc_matrix((value, (row, column)), shape = (row_num, col_num))
    return s

In [66]:
# load data
# this takes some time so comment it out after first run
%time all_users = get_user_data("data/release-youtube-links.txt", "data/release-youtube-groupmemberships.txt")

CPU times: user 27.8 s, sys: 112 ms, total: 27.9 s
Wall time: 27.8 s


In [73]:
# number of users
# WARNING: small number of users will usually result in a small number of links between users
k = 15000

In [85]:
# quickly gets a filtered subset of users
filtered_users = filter_users(k, all_users)

Number of users in full dataset : 94238
Number of users in filtered dataset : 16077


In [77]:
# S matrix, matrix of links between users
s = get_adjacency_matrix(filtered_users, "links", len(filtered_users), len(filtered_users))
s

<15000x15000 sparse matrix of type '<class 'numpy.float64'>'
	with 164752 stored elements in Compressed Sparse Column format>

In [78]:
# A matrix, affiliation matrix, links between users and groups
# how many groups are there?
a = get_adjacency_matrix(filtered_users, "groups", len(filtered_users), len(set([group for user in filtered_users for group in filtered_users[user]["groups"]])))
a

<15000x20616 sparse matrix of type '<class 'numpy.float64'>'
	with 157577 stored elements in Compressed Sparse Column format>

## Split data into training and testing

In [79]:
"pogledaj sve indekse (i,j) gdje je A[i,j] = 1 i uzmi random 30% takvih"
a = a.toarray()
ones = np.transpose(a.nonzero())
indices = np.random.randint(ones.shape[0], size = int(ones.shape[0]*0.3))

In [80]:
"za test spremi citavu a, za trening izbaci onih 30%"
a_test = csc_matrix(a)
for i in range(len(indices)):
    a[ones[indices][i][0], ones[indices][i][1]] = 0
a_train = csc_matrix(a)

In [86]:
"testni podaci"
a_test

<15000x20616 sparse matrix of type '<class 'numpy.float64'>'
	with 157577 stored elements in Compressed Sparse Column format>

In [87]:
"trening podaci"
a_train

<15000x20616 sparse matrix of type '<class 'numpy.float64'>'
	with 116646 stored elements in Compressed Sparse Column format>

In [88]:
save_npz("data/s.npz", s)
save_npz("data/a_train.npz", a_train)
save_npz("data/a_test.npz", a_test)