# Constructing sparse user adjacency matrices

This notebook constructs sparse user-interaction (adjacency) matrices for downstream clustering and other analysis tasks.

In [18]:
import polars as pl
import itertools as itt
from pathlib import Path
import csv
import json
import ctypes as ct
from tqdm import tqdm

# --- Run configuration: everything a reader might want to tune ---
path = "../../../data/users/"
machine = 'combined'
min_activity = 100
interaction_type = 'indirects'  # the other supported value is 'directs'

assert interaction_type in ('directs', 'indirects'), "Invalid type of interaction data provided"
# Column of the raw CSVs that is read for the selected interaction type.
data_col = 5 if interaction_type != 'directs' else 4

# Absolute paths of every raw batch file.
raw_dir = Path(path + 'raw/')
files = [csv_path.absolute() for csv_path in raw_dir.glob("*.csv")]

# Input summary and the two output artefacts of this notebook.
user_stats = path + 'summaries/combined/user_stats.csv'
user_interaction_map = f"{path}summaries/{machine}/interaction_map-{interaction_type}-min-{min_activity}.csv"
adj_matrix_path = f"{path}summaries/{machine}/adj_matrix-{interaction_type}-min-{min_activity}.npz"

# Raise the csv field size limit to half of the platform's largest unsigned long.
max_field_size = int(ct.c_ulong(-1).value // 2)
csv.field_size_limit(max_field_size)

9223372036854775807

In [4]:
# Build the lookup from user name to integer index.
# Only users whose total_activity is at least `min_activity` are included.
user_names = (
    pl.read_csv(user_stats)
    .filter(pl.col("total_activity") >= min_activity)
    .select("user_name")
    .to_dict(as_series=False)
)["user_name"]

# Pair every name with the next value of a running counter; zip stops as soon
# as the names are exhausted, so the counter advances exactly once per name.
iterator = itt.count()
mapped = dict(zip(user_names, iterator))
del user_names

print("Total number of users: {}".format(len(mapped)))
print("Total files to be processed: {}".format(len(files)))

Total number of users: 7810
Total files to be processed: 489


In [11]:
def process_batch_file(filename, writer = None, data_column_no = 4, user_column_no = 8, user_index = None):
    """Extract (user id, interlocutor id, intensity) triples from one raw batch CSV.

    Every data row holds a user name in column ``user_column_no`` and a JSON
    object in column ``data_column_no`` that maps interlocutor names to an
    intensity value. For each pair where both names are present in the index,
    one row ``[user_id, interlocutor_id, intensity]`` is passed to
    ``writer.writerow``.

    Parameters
    ----------
    filename : path-like
        CSV file to read; its first line is treated as a header and skipped.
    writer : object with a ``writerow`` method
        Destination for the extracted triples (e.g. a ``csv.writer``).
    data_column_no : int
        Index of the column containing the JSON interaction object.
    user_column_no : int
        Index of the column containing the user name.
    user_index : mapping of str -> int, optional
        Name-to-id lookup. When omitted, the notebook-level ``mapped``
        dictionary is used.

    Raises
    ------
    ValueError
        If ``writer`` is not supplied.
    """
    if writer is None:
        # Fail up front with a clear message instead of an AttributeError on the first write.
        raise ValueError("process_batch_file needs a writer with a writerow() method")
    if user_index is None:
        user_index = mapped

    # newline='' lets the csv module handle line endings inside quoted fields itself.
    with open(filename, newline='') as file:
        reader = csv.reader(file, delimiter=',')
        next(reader, None)  # skip the header; tolerate a completely empty file
        for row in reader:
            # Resolve the author once per row. Users missing from the index are
            # expected because the index is pre-filtered, so skip them before
            # paying for the JSON parse.
            userid = user_index.get(row[user_column_no])
            if userid is None:
                continue
            interactions = json.loads(row[data_column_no])
            for interlocutor, intensity in interactions.items():
                interid = user_index.get(interlocutor)
                if interid is None:
                    # interlocutor is not part of the filtered index
                    continue
                writer.writerow([userid, interid, intensity])


# Stream every raw batch file into a single headerless CSV of
# (user id, interlocutor id, intensity) rows.
# newline='' is required by the csv module for files it writes; without it,
# platforms that translate line endings emit an extra carriage return per row.
with open(user_interaction_map, "w", newline='') as outfile:
    csvwriter = csv.writer(outfile, delimiter=',', quoting=csv.QUOTE_NONE)
    for f in tqdm(files):
        process_batch_file(f, csvwriter, data_column_no=data_col)

100%|██████████| 489/489 [1:11:20<00:00,  8.75s/it]


In [12]:
#create a matrix
import numpy as np

# Dense square matrix with one row and one column per entry of `mapped`.
adj_matrix = np.zeros((len(mapped), len(mapped)))

# newline='' is what the csv module expects for file objects given to csv.reader.
with open(user_interaction_map, "r", newline='') as infile:
    reader = csv.reader(infile, delimiter=',', quoting=csv.QUOTE_NONE)
    # No hard-coded `total`: the number of rows depends on the configuration
    # that produced the map, so a fixed figure would mislabel the progress bar.
    for row in tqdm(reader):
        i = int(row[0])
        j = int(row[1])
        # Every row adds one to both (i, j) and (j, i), which keeps the matrix symmetric.
        # NOTE(review): the third column (intensity) is not used here, so cells hold
        # row counts rather than summed intensities — confirm this is intended.
        adj_matrix[i, j] += 1
        adj_matrix[j, i] += 1


100%|██████████| 79547162/79547162 [03:14<00:00, 409739.31it/s]


In [19]:
from scipy import sparse as sps

# Convert the dense adjacency matrix to CSR and store it at `adj_matrix_path`.
sparse_matrix = sps.csr_matrix(adj_matrix)
sps.save_npz(file=adj_matrix_path, matrix=sparse_matrix)

## Combining interaction maps from the 4 sources into a single map
It is fastest to combine the interaction-map CSVs from the command line.

In [None]:
%%bash
# Run the whole cell in a single bash process: separate `!` lines each get
# their own subshell, so a variable assigned on one line is gone on the next.
f='interaction_map-directs-min-100.csv'
# The interaction maps written by this notebook have no header row, so the
# files are concatenated as-is; skipping the first line of each file would
# discard a real record.
# NOTE(review): paths are relative to the notebook's working directory — confirm.
cat "aurimas.eu/$f" "local/$f" "vm1/$f" "vm2/$f" > "combined/$f"

In [None]:
# Directory holding the combined interaction map, plus the two file paths
# (input map, output matrix) derived from it.
path = "../../../data/users/summaries/combined/"
int_map_path, adj_matrix_path = (
    path + file_name
    for file_name in ('interaction_map-directs-min-100.csv', 'adjacency_matrix.npz')
)

In [None]:
# Columns of the interaction map are used as: row index, column index, value.
# ndmin=2 keeps `data` two-dimensional even when the file holds a single row,
# so the column slicing below always works.
data = np.loadtxt(int_map_path, np.int64, delimiter=',', ndmin=2)

# Give the matrix an explicit square shape. Left to infer it, coo_matrix uses
# (max row index + 1, max column index + 1), which need not be square.
# NOTE(review): if the full number of users is known, pass that instead so
# trailing users without any interaction still get a row and column — confirm.
n_users = int(data[:, :2].max()) + 1
adj_matrix = sps.coo_matrix(
    (data[:, 2], (data[:, 0], data[:, 1])),
    shape=(n_users, n_users),
)
sps.save_npz(adj_matrix_path, adj_matrix)