# Constructing sparse user adjacency matrices

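This notebook maps every user name to an integer index, flattens the per-file interaction records into `(user, interlocutor, intensity)` triples, and assembles those triples into a sparse user–user adjacency matrix for downstream clustering and other analysis tasks.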

In [73]:
import csv
import itertools as itt
import json
from pathlib import Path

import numpy as np
import polars as pl
import scipy.sparse as sps

# Root data folder; the derived summary files live in its "summaries/" subfolder.
path = "../../data/users/"
user_stats = path + "summaries/user_stats.csv"
user_interaction_map = path + "summaries/interaction_map.csv"
adj_matrix_path = path + "summaries/adjacency_matrix.npz"

# Absolute paths of every CSV sitting directly in the data folder
# (the glob is non-recursive, so nothing under "summaries/" is picked up).
files = [batch_csv.absolute() for batch_csv in Path(path).glob("*.csv")]


In [9]:
# Build the lookup table "user name -> integer index".
# Indices follow the order in which names appear in the user_stats file.
iterator = itt.count()
user_names = pl.read_csv(user_stats).get_column("user_name").to_list()
# zip() pulls from user_names first, so the counter advances exactly once per name.
mapped = dict(zip(user_names, iterator))
del user_names  # the list is no longer needed once the mapping exists
print("Total number of users: {}".format(len(mapped)))

Total number of users: 54059


In [23]:
def process_batch_file(filename, writer = None, data_column_no = 4, user_column_no = 8, user_index = None):
    """Emit one ``[user_id, interlocutor_id, intensity]`` row per interaction in a batch CSV.

    The file is expected to start with a header row. In every following row,
    column ``data_column_no`` holds a JSON object mapping an interlocutor's
    user name to an interaction intensity, and column ``user_column_no`` holds
    the name of the user the row belongs to. Names are translated to integer
    indices through ``user_index``.

    Parameters
    ----------
    filename : path-like
        CSV file to read.
    writer : object with a ``writerow(list)`` method
        Destination for the output rows (e.g. a ``csv.writer``). Required,
        despite the ``None`` default kept for signature compatibility.
    data_column_no : int
        Zero-based index of the column containing the JSON interaction object.
    user_column_no : int
        Zero-based index of the column containing the user name.
    user_index : mapping, optional
        User name -> integer index. Defaults to the notebook-level ``mapped``
        dictionary.

    Raises
    ------
    ValueError
        If ``writer`` is ``None``.
    KeyError
        If a user or interlocutor name is missing from ``user_index``.
    """
    if writer is None:
        # Fail up front with a clear message instead of an AttributeError on
        # the first writerow() call deep inside the loop.
        raise ValueError("process_batch_file needs a writer with a writerow() method")
    if user_index is None:
        user_index = mapped  # notebook-level name -> index dictionary

    # newline="" is what the csv module asks for, so that line breaks embedded
    # in quoted fields (such as the JSON column) are parsed correctly.
    with open(filename, newline="") as file:
        reader = csv.reader(file, delimiter=',')
        next(reader, None)  # skip the header; tolerate a completely empty file
        for row in reader:
            if not row:
                continue  # the csv reader yields [] for blank lines
            interactions = json.loads(row[data_column_no])
            if not interactions:
                # Empty object (or JSON null): nothing to emit, and the user
                # name is deliberately not looked up in that case.
                continue
            userid = user_index[row[user_column_no]]  # same for every interlocutor of this row
            for interlocutor, intensity in interactions.items():
                writer.writerow([userid, user_index[interlocutor], intensity])


# Flatten every batch file into one "user_id,interlocutor_id,intensity" CSV.
# newline="" is required for files handed to csv.writer; without it the
# writer's "\r\n" line endings become "\r\r\n" on platforms that translate "\n".
with open(user_interaction_map, "w", newline="") as outfile:
    # Rows are expected to hold plain numbers, so no quoting is needed;
    # QUOTE_NONE makes the writer raise if a field would require it.
    csvwriter = csv.writer(outfile, delimiter=',', quoting=csv.QUOTE_NONE)
    # Sorted so the row order of the output does not depend on the order in
    # which the filesystem happened to list the batch files.
    for f in sorted(files):
        process_batch_file(f, csvwriter, data_column_no=4)

In [74]:
#create a sparse matrix
# Each line of the interaction map is "row index, column index, value".
# ndmin=2 keeps that 2-D layout even when the file holds a single line
# (loadtxt would otherwise return a 1-D array and the column slicing would fail).
data = np.loadtxt(user_interaction_map, np.int64, delimiter=',', ndmin=2)

# Give the shape explicitly: when it is omitted, scipy infers it from the
# largest row/column index present, which can yield a non-square matrix and
# silently drops users whose index is above the last one that interacts.
n_users = len(mapped)
adj_matrix = sps.coo_matrix(
    (data[:, 2], (data[:, 0], data[:, 1])),
    shape=(n_users, n_users),
)
sps.save_npz(adj_matrix_path, adj_matrix)