# Constructing sparse user adjacency matrices

This notebook constructs sparse user-interaction (adjacency) matrices for downstream clustering and other analysis tasks.

In [1]:
from pathlib import Path
import csv
import json
import ctypes as ct
from tqdm import tqdm
import sqlite3 as sq
import numpy as np
import scipy.sparse as sps

# Base data directory (relative to this notebook) and the SQLite user database
# that lives inside it.
path = "../../../data/users/"
DB_path = path + "users.sqlite.db"

# Raise the csv module's per-field size cap to half of the platform's unsigned
# long maximum, i.e. the largest value that still fits in a signed C long.
# NOTE(review): presumably needed because the JSON columns exceed the default
# field size limit - confirm against the raw files.
largest_c_long = ct.c_ulong(-1).value // 2
csv.field_size_limit(int(largest_c_long))

## Adding an identifier to the SQL database that maps each user to a row/column of the adjacency matrices

In [2]:
# Give every selected user a 1-based `matrix_id`, numbered by descending
# total_activity, so that user names can be mapped to matrix rows/columns.
#
# The step is skipped (with a message) when ids were assigned by an earlier
# run. Any other SQL failure is allowed to propagate instead of being
# reported as "columns already exist".
conn = sq.connect(DB_path)
try:
    # `with conn` commits on success and rolls back on error; it does not
    # close the connection, hence the explicit close() in `finally`.
    with conn:
        cur = conn.cursor()

        # PRAGMA table_info returns one row per column; index 1 is the column name.
        existing_columns = {col[1] for col in cur.execute("PRAGMA table_info(users)")}
        has_column = "matrix_id" in existing_columns
        if not has_column:
            cur.execute("ALTER TABLE users ADD COLUMN matrix_id int")

        # The column alone does not prove the ids were written: an earlier run
        # may have failed between the ALTER TABLE and the UPDATE.
        ids_assigned = has_column and cur.execute(
            "SELECT 1 FROM users WHERE matrix_id IS NOT NULL LIMIT 1"
        ).fetchone() is not None

        if ids_assigned:
            print("columns already exist")
        else:
            # Scratch table holding the rank of every selected user.
            cur.execute("drop table if exists tmp;")
            cur.execute("""
                CREATE TABLE tmp as 
                    SELECT user_name, row_number() over (order by total_activity DESC) as no FROM users WHERE is_selected = True;"""
            )

            # Index the scratch table so the correlated subquery below is a lookup.
            cur.execute("CREATE INDEX IF NOT EXISTS user_name_idx ON tmp(user_name)")

            # Users absent from tmp (not selected) end up with matrix_id NULL.
            cur.execute("UPDATE users SET matrix_id = (SELECT no FROM tmp WHERE tmp.user_name = users.user_name);")
            cur.execute("drop table if exists tmp;")
finally:
    conn.close()

columns already exist


In [7]:
# Retrieve a dictionary mapping user names to their (1-based) matrix IDs.
# Only users that were assigned a matrix_id are included.
conn = sq.connect(DB_path)
try:
    cur = conn.cursor()
    cur.execute("SELECT user_name, matrix_id FROM users WHERE matrix_id IS NOT NULL")
    # Each result row is a (user_name, matrix_id) pair, so dict() builds the map directly.
    mapped = dict(cur.fetchall())
finally:
    # The sqlite3 connection context manager does not close the connection,
    # so close it explicitly once the read is done.
    conn.close()

print("Total number of users: {}".format(len(mapped)))


Total number of users: 7807


In [8]:
# Gather the absolute path of every CSV file in the raw data directory.
raw_dir = Path(path + 'raw/')
files = []
for csv_path in raw_dir.glob("*.csv"):
    files.append(csv_path.absolute())

print("Total files to be processed: {}".format(len(files)))

Total files to be processed: 489


In [13]:
# One square (users x users) accumulator per interaction type, together with
# the index of the CSV column that holds that interaction type's JSON payload.
no_users = len(mapped)
data_columns = {"indirects": 5, "directs": 4}
int_dict = {
    int_type: {
        "data_col": column_index,
        "matrix": np.zeros((no_users, no_users)),
    }
    for int_type, column_index in data_columns.items()
}

In [14]:
# Accumulate interaction intensities from every raw CSV file into the
# adjacency matrices held in int_dict. Rows of a matrix are indexed by the
# user the CSV row belongs to, columns by the interlocutor.
for f in tqdm(files):
    # newline='' is what the csv module expects, so that line breaks embedded
    # in quoted fields are parsed by the reader rather than by the file object.
    with open(f, newline='') as file:
        reader = csv.reader(file, delimiter=',')
        next(reader, None)  # skip the header (tolerates a completely empty file)
        for row in reader:
            # Column 8 holds the name of the user this row belongs to.
            user_name = row[8]
            user_matrix_id = mapped.get(user_name)
            if user_matrix_id is None:
                # The mapping is pre-filtered; rows of users outside it
                # cannot contribute to any matrix, so skip them up front.
                continue
            userid = user_matrix_id - 1  # matrix_ids are 1-indexed in the DB!

            for d in int_dict.values():
                # Interaction dictionary: interlocutor name -> intensity.
                interactions = json.loads(row[d['data_col']])
                matrix = d['matrix']
                for interlocutor, intensity in interactions.items():
                    inter_matrix_id = mapped.get(interlocutor)
                    if inter_matrix_id is None:
                        # Interlocutor is not in the pre-filtered mapping: ignore.
                        continue
                    # Update the adjacency matrix.
                    matrix[userid, inter_matrix_id - 1] += intensity

100%|███████████████████████████████████████████████████████████████████████████████| 489/489 [1:14:50<00:00,  9.18s/it]


In [15]:
# Compress each accumulated matrix to CSR format and write it next to the
# data as "adj_matrix-<interaction type>-latest.npz".
for int_name in int_dict:
    accumulated = int_dict[int_name]['matrix']
    sps.save_npz(
        f"{path}adj_matrix-{int_name}-latest.npz",
        sps.csr_matrix(accumulated),
    )