# Constructing sparse user adjacency matrices

This notebook constructs sparse user-to-user interaction (adjacency) matrices for downstream clustering and other analysis tasks.

In [11]:
from pathlib import Path
import csv
import json
import ctypes as ct
from tqdm import tqdm
import sqlite3 as sq
import numpy as np
import scipy.sparse as sps

# Raise the csv module's per-field size cap (presumably because the JSON
# columns in the raw files exceed the default limit).
# c_ulong(-1) wraps to ULONG_MAX; halving it yields the largest value that
# still fits in a signed C long, whatever the platform's `long` width is.
csv.field_size_limit(ct.c_ulong(-1).value // 2)

# Data directory and the SQLite database that lives inside it.
path = "../../../data/users/"
DB_path = f"{path}users.sqlite.db"

## Adding an identifier to the SQL database that maps each user to a row/column of the adjacency matrices

In [3]:
# Assign every selected user a `matrix_id` (its rank by total_activity).
#
# The connection is opened and closed explicitly: sqlite3's `with conn:` block
# only wraps a transaction (commit on success, rollback on error) and does NOT
# close the connection.
conn = sq.connect(DB_path)
try:
    with conn:
        cur = conn.cursor()

        # Add the column only when it is missing. Inspecting the schema (rather
        # than catching every OperationalError raised by ALTER TABLE) keeps
        # unrelated failures -- e.g. a missing `users` table -- visible.
        existing_columns = {
            row[1].lower() for row in cur.execute("PRAGMA table_info(users)")
        }
        if "matrix_id" in existing_columns:
            print("columns already exist")
        else:
            cur.execute("ALTER TABLE users ADD COLUMN matrix_id int")

        # Rank the selected users in a scratch table.
        # - row_number() starts at 1, so matrix_id values are 1-based.
        # - user_name is a tiebreaker so that users with equal total_activity
        #   receive the same IDs on every run.
        cur.execute("drop table if exists tmp;")
        cur.execute("""
            CREATE TABLE tmp as
                SELECT user_name,
                       row_number() over (order by total_activity DESC, user_name) as no
                FROM users
                WHERE is_selected = True;"""
        )

        # Index the scratch table so the correlated sub-query below is a lookup.
        cur.execute("CREATE INDEX IF NOT EXISTS user_name_idx ON tmp(user_name)")

        # Users absent from tmp (not selected) get NULL, because the scalar
        # sub-query returns no row for them.
        cur.execute("UPDATE users SET matrix_id = (SELECT no FROM tmp WHERE tmp.user_name = users.user_name);")
        cur.execute("drop table if exists tmp;")
finally:
    conn.close()

columns already exist


In [8]:
# Retrieve a dictionary that maps user names to matrix IDs.
# Only users with a non-NULL matrix_id are included.
# The connection is closed explicitly because sqlite3's `with conn:` block
# manages the transaction only and leaves the connection open.
conn = sq.connect(DB_path)
try:
    cur = conn.cursor()
    cur.execute("SELECT user_name, matrix_id FROM users WHERE matrix_id IS NOT NULL")
    mapped = dict(cur.fetchall())
finally:
    conn.close()

print("Total number of users: {}".format(len(mapped)))


Total number of users: 7807


In [9]:
# Collect the raw batch files. The order in which glob yields entries is not
# guaranteed, so sort the paths to make the processing order (and therefore
# the intermediate CSVs) reproducible between runs.
files = sorted(f.absolute() for f in Path(path + 'raw/').glob("*.csv"))
print("Total files to be processed: {}".format(len(files)))

Total files to be processed: 489


In [11]:
def process_batch_file(filename, writer = None, data_column_no = 4, user_column_no = 8, id_map = None):
    """Emit one `[user_id, interlocutor_id, intensity]` row per mapped interaction.

    Reads a batch CSV (first row is a header and is skipped). For each data
    row, the cell at `data_column_no` is parsed as a JSON object of
    `{interlocutor_name: intensity}` and the cell at `user_column_no` is the
    user's name. Names are translated to IDs through `id_map`; pairs where
    either name is missing from the map are skipped.

    Args:
        filename: Path of the CSV file to read.
        writer: Object with a `writerow(list)` method (e.g. `csv.writer`).
            Required despite the `None` default.
        data_column_no: Index of the column holding the JSON interactions.
        user_column_no: Index of the column holding the user name.
        id_map: Mapping of user name -> ID. Defaults to the notebook-level
            `mapped` dictionary.

    Raises:
        ValueError: If `writer` is None.
    """
    if writer is None:
        raise ValueError("process_batch_file needs a writer with a writerow() method")
    if id_map is None:
        id_map = mapped  # notebook-level name -> matrix_id dictionary

    # newline='' is what the csv module expects, so that it can handle line
    # endings and newlines embedded in quoted fields by itself.
    with open(filename, newline='') as file:
        reader = csv.reader(file, delimiter=',')
        next(reader, None)  # skip the header; tolerate a completely empty file
        for row in reader:
            # Resolve the row's user once. Unmapped users are skipped before
            # their (potentially large) JSON payload is parsed.
            userid = id_map.get(row[user_column_no])
            if userid is None:
                continue

            interactions = json.loads(row[data_column_no])
            for interlocutor, intensity in interactions.items():
                # Ignore interlocutors that are not in the map; this happens
                # because the map is pre-filtered.
                interid = id_map.get(interlocutor)
                if interid is None:
                    continue
                writer.writerow([userid, interid, intensity])

100%|██████████| 489/489 [1:11:20<00:00,  8.75s/it]


In [None]:
# Interaction type -> index of the JSON column that holds it in the raw files.
interactions = {"indirects": 5, "directs": 4}

# Row/column k of each matrix corresponds to matrix_id k, so the matrix has to
# span 0..max(matrix_id). Sizing it with len(mapped) raises IndexError when the
# IDs are 1-based, because the largest ID then equals len(mapped).
matrix_size = max(mapped.values(), default=-1) + 1

for int_type, data_col in interactions.items():
    # Build the paths with Path so the trailing slash of `path` is not doubled.
    adj_matrix_path = str(Path(path) / f"adj_matrix-{int_type}.npz")
    user_interaction_map = str(Path(path) / f"adj_map-{int_type}.csv")

    # First, save the interactions to a CSV (will have repeat rows).
    # newline='' stops the csv writer's line endings being translated again.
    with open(user_interaction_map, "w", newline="") as outfile:
        csvwriter = csv.writer(outfile, delimiter=',', quoting=csv.QUOTE_NONE)
        for f in tqdm(files):
            process_batch_file(f, csvwriter, data_column_no=data_col)

    # Count the rows just written so the progress bar total matches this
    # interaction type instead of relying on a hard-coded number.
    with open(user_interaction_map, "r", newline="") as infile:
        n_rows = sum(1 for _ in infile)

    # Then, generate a scipy matrix with condensed info.
    # The intermediate array is dense (matrix_size x matrix_size float64).
    adj_matrix = np.zeros((matrix_size, matrix_size))
    with open(user_interaction_map, "r", newline="") as infile:
        reader = csv.reader(infile, delimiter=',', quoting=csv.QUOTE_NONE)
        for row in tqdm(reader, total=n_rows):
            i = int(row[0])
            j = int(row[1])
            # Each CSV row counts once in both directions (symmetric matrix).
            # NOTE(review): the third column (intensity) is not used here --
            # confirm that counting rows rather than summing intensity is intended.
            adj_matrix[i, j] += 1
            adj_matrix[j, i] += 1

    sparse_matrix = sps.csr_matrix(adj_matrix)
    sps.save_npz(adj_matrix_path, sparse_matrix)