# Constructing sparse user adjacency matrices

This notebook constructs sparse user matrices for downstream clustering / other analysis tasks.

In [15]:
import polars as pl
import itertools as itt
from pathlib import Path
import csv
import json
import ctypes as ct

# --- Configuration -----------------------------------------------------------
# Root data directory. The trailing slash matters: the paths below are built by
# plain string concatenation / f-strings.
path = "../../data/users/"
machine = 'local'          # sub-directory of summaries/ that receives this run's outputs
min_activity = 100         # users below this "total_activity" are left out of the index
interaction_type = 'directs' #can also be 'indirects'

assert interaction_type in ['directs', 'indirects'], "Invalid type of interaction data provided"
# Column of the batch CSVs that holds the JSON interaction payload.
data_col = 4 if interaction_type == 'directs' else 5

# Sorted so the batch files are processed in the same order on every run
# (glob order is otherwise filesystem-dependent).
files = sorted(f.absolute() for f in Path(path).glob("*.csv"))
user_stats = path + 'summaries/combined/user_stats.csv'
user_interaction_map = f"{path}summaries/{machine}/interaction_map-{interaction_type}-min-{min_activity}.csv"
# Built from `path` exactly once; prepending it a second time would point at
# "<path><path>summaries/...", a directory that is not the one used above.
adj_matrix_path = f"{path}summaries/{machine}/adj_matrix-{interaction_type}-min-{min_activity}.npz"
# Raise the csv field size limit to the largest value a C long can hold:
# c_ulong(-1) is the maximum unsigned long, halving it gives the signed maximum.
csv.field_size_limit(int(ct.c_ulong(-1).value // 2))

9223372036854775807

In [16]:
# Give every sufficiently active user a consecutive integer index (0, 1, 2, ...),
# in the order the names appear in the user statistics file.
iterator = itt.count()
user_names = (
    pl.read_csv(user_stats)
    .filter(pl.col("total_activity") >= min_activity)
    .select("user_name")
    .to_dict(as_series=False)["user_name"]
)
# zip stops as soon as the names run out, so the counter is advanced once per name.
mapped = dict(zip(user_names, iterator))
del user_names
print("Total number of users: {}".format(len(mapped)))

Total number of users: 7810


In [17]:
def process_batch_file(filename, writer = None, data_column_no = 4, user_column_no = 8, user_index = None):
    """Write one ``[user_id, interlocutor_id, intensity]`` row per known interaction.

    Reads a comma-delimited batch file, skips its header line, and for every
    data row decodes the JSON object found in ``data_column_no`` (a mapping of
    interlocutor name -> intensity). Names are translated to integer ids via
    ``user_index``; interactions where either side is missing from the index
    are dropped.

    Parameters
    ----------
    filename : str or path-like
        Batch CSV file to read.
    writer : object with a ``writerow`` method
        Destination for the output rows (e.g. a ``csv.writer``). Must be
        supplied whenever the file contains at least one known interaction;
        the ``None`` default is kept only for signature compatibility.
    data_column_no : int
        Index of the column holding the JSON interaction payload.
    user_column_no : int
        Index of the column holding the row's user name.
    user_index : mapping, optional
        Name -> integer id lookup. Defaults to the notebook-level ``mapped``.

    Raises
    ------
    json.JSONDecodeError
        If the payload of a row belonging to a known user is not valid JSON.
    IndexError
        If a row has fewer columns than the requested column indices.
    """
    if user_index is None:
        user_index = mapped

    # newline='' is what the csv module asks for on files it reads, so that
    # line breaks embedded in quoted fields are handled by the parser itself.
    with open(filename, newline='') as file:
        reader = csv.reader(file, delimiter=',')
        next(reader, None) #skip the header (tolerates a completely empty file)
        for row in reader:
            # The row's own user is the same for every interlocutor: resolve it
            # once, and skip the JSON decoding entirely for users outside the index.
            userid = user_index.get(row[user_column_no])
            if userid is None:
                continue
            interactions = json.loads(row[data_column_no])
            for interlocutor, intensity in interactions.items():
                interid = user_index.get(interlocutor)
                # Unknown interlocutors are expected: the index is pre-filtered.
                if interid is not None:
                    writer.writerow([userid, interid, intensity])


# Stream every batch file into one "user_id,interlocutor_id,intensity" CSV.
# Make sure the destination directory exists before opening the file for writing.
Path(user_interaction_map).parent.mkdir(parents=True, exist_ok=True)
# newline='' is what the csv module asks for on files it writes to; without it,
# platforms that translate "\n" on write end up with "\r\r\n" line endings.
with open(user_interaction_map, "w", newline='') as outfile:
    csvwriter = csv.writer(outfile, delimiter=',', quoting=csv.QUOTE_NONE)
    for i,f in enumerate(files):
        print("Processing file {} out of {}".format(i + 1, len(files)))
        process_batch_file(f, csvwriter, data_column_no=data_col)

Processing file 1 out of 13
Processing file 2 out of 13
Processing file 3 out of 13
Processing file 4 out of 13
Processing file 5 out of 13
Processing file 6 out of 13
Processing file 7 out of 13
Processing file 8 out of 13
Processing file 9 out of 13
Processing file 10 out of 13
Processing file 11 out of 13
Processing file 12 out of 13
Processing file 13 out of 13


In [18]:
#create a sparse matrix
import numpy as np
import scipy.sparse as sps
# Load the "user_id,interlocutor_id,intensity" triples from the interaction map.
# ndmin=2 keeps the result two-dimensional even when the file holds a single row,
# so the column slices below always work.
data = np.loadtxt(user_interaction_map, np.int64, delimiter=',', ndmin=2)
# Pin the shape to (n_users, n_users). Left to itself, coo_matrix infers the
# dimensions from the largest row/column index present, which can yield a
# non-square matrix that is smaller than the user index whenever the
# highest-numbered users have no recorded interactions.
n_users = len(mapped)
adj_matrix = sps.coo_matrix((data[:, 2], (data[:, 0], data[:, 1])), shape=(n_users, n_users))
sps.save_npz(adj_matrix_path, adj_matrix)