# Representing Users and Files
#### Design Document: https://docs.google.com/document/d/1F84Nj3IQ-f_36bmmsOTuOo9u65gfD1WU5LKdnw8ShcY/edit?tab=t.x97j5jy1kop1#heading=h.1vu1g9fe3ujo
#### Optimizations:
- Re-ranking topics
- More information in file embeddings

# Dependencies

In [None]:
# Install third-party packages used by this notebook.
# The leading `!` is notebook shell syntax, so this cell is not plain Python.
# NOTE(review): numpy, tqdm and scikit-learn are imported further down but are
# not installed here - presumably they arrive as transitive dependencies; confirm.
! pip install pandas
! pip install openai
! pip install sentence-transformers
! pip install torch
! pip install matplotlib
! pip install seaborn

# Utilities/Constants

In [None]:
# Load CSV files representing database into pandas DataFrames
import pandas as pd
import os

# Directory holding one CSV export per database table.
CSV_DIR = "./csvs"

# Maps each file's base name (the text before its first '.') to its DataFrame.
DFS = {}
for filename in os.listdir(CSV_DIR):
    path = CSV_DIR + "/" + filename
    try:
        # Splitting on the first '.' means "a.b.csv" is stored under "a".
        name_no_ext = filename.split('.')[0]
        DFS[name_no_ext] = pd.read_csv(path)
    except Exception as e:
        # Best effort: report the unreadable entry and keep loading the rest.
        print(f"Failed to read {path}: {e}")

# Print brief summary and show first few rows for each loaded dataframe
from IPython.display import display

for name, df in DFS.items():
    print(f"{name}: {df.shape}")
    display(df.head())

# Distinct values of the audit log's 'operation' column.
# Raises KeyError when no 'resource_auditrecord' table was loaded above.
UNIQUE_OPS = DFS['resource_auditrecord']['operation'].unique()

In [None]:
from datetime import datetime, timezone

# "Now" for this session: captured once, as a timezone-aware UTC datetime, so
# every recency / time-bucket computation uses the same reference instant.
NOW = datetime.now(timezone.utc)

# Directory that get_cache() reads pickles from and save_to_cache() writes to.
CACHE_DIR = "./pickle/"

In [None]:
# Cache any heavy embeddings or mappings we compute.
import os
import pickle
from datetime import datetime
import glob

def _most_recent_file(pattern):
  """Return the newest file in CACHE_DIR matching ``pattern``, or None.

  ``pattern`` is a glob pattern relative to CACHE_DIR; "newest" means the
  largest modification time among the matches.
  """
  candidates = glob.glob(os.path.join(CACHE_DIR, pattern))
  return max(candidates, key=os.path.getmtime, default=None)

def get_cache():
  """Load the newest pickled caches from CACHE_DIR.

  Returns:
    tuple: ``(audit_to_file_mapping, embeddings_cache, files, users)``. Each
    element is whatever the newest matching pickle contained, or ``{}`` when
    no matching file exists.

  NOTE: ``pickle.load`` executes arbitrary code from the file; only point
  CACHE_DIR at pickles produced by ``save_to_cache``.
  """
  patterns = (
    "audit_to_file_mapping_*.pkl",
    "topic_embeddings_bge-large-zh-v1.5_*.pkl",
    "file_embeddings_*.pkl",
    "user_embeddings_*.pkl",
  )
  # Resolve all four paths first, then load them in the same order.
  latest_paths = [_most_recent_file(pattern) for pattern in patterns]

  loaded = []
  for latest in latest_paths:
    payload = {}
    if latest and os.path.exists(latest):
      with open(latest, "rb") as handle:
        payload = pickle.load(handle)
    loaded.append(payload)

  audit_to_file_mapping, embeddings_cache, files, users = loaded

  print(f"Audit to file mapping size: {len(audit_to_file_mapping)}")
  print(f"Topic embeddings cache size: {len(embeddings_cache) if embeddings_cache is not None else 0}")
  print(f"Files size: {len(files) if files is not None else 0}")
  print(f"Users size: {len(users) if users is not None else 0}")

  return audit_to_file_mapping, embeddings_cache, files, users


def save_to_cache(audit_to_file_mapping, embeddings_cache, files, users):
  """Pickle the four caches into CACHE_DIR under a shared timestamp suffix.

  The directory is created when missing. Every call writes four new files
  named ``<prefix>_<YYYY-mm-dd_HH-MM-SS>.pkl``; existing files are not removed.
  """
  os.makedirs(CACHE_DIR, exist_ok=True)
  stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

  # (file-name prefix, object to pickle), in write order.
  outputs = (
    ("topic_embeddings_bge-large-zh-v1.5", embeddings_cache),
    ("file_embeddings", files),
    ("user_embeddings", users),
    ("audit_to_file_mapping", audit_to_file_mapping),
  )
  for prefix, payload in outputs:
    target = os.path.join(CACHE_DIR, f"{prefix}_{stamp}.pkl")
    with open(target, "wb") as handle:
      pickle.dump(payload, handle)

# Warm the module-level caches from the newest pickles on disk; each one is an
# empty dict when no matching cache file exists.
AUDIT_TO_FILE_MAPPING, EMBEDDINGS_CACHE, FILES, USERS = get_cache()

In [None]:
# Database searching utilities
from functools import cache
from tqdm import tqdm

# Getting audit ids which are associated with a file we have information for
# This way we have a much smaller list of audit logs to parse when we are
# calculating user embeddings
@cache
def get_valid_audit_ids():
  """Return the ids of audit records, resource nodes and resources worth processing.

  A resource is valid when at least one ``resource_resource`` row names it as
  ``parent_id``. A resource node is valid when its ``resource_id`` is a valid
  resource, and an audit record is valid when its ``audited_id`` is a valid
  resource node.

  The result is memoised by ``functools.cache``, so the tables are scanned
  (and the progress messages printed) only on the first call.

  Returns:
    tuple: ``(valid_audit_ids, valid_resource_node_ids, valid_resource_ids)``,
    each a ``set`` of ids.
  """
  rr, rrn, ra = DFS["resource_resource"], DFS["resource_resourcenode"], DFS["resource_auditrecord"]

  def _ids_where(frame, column, allowed):
    # 'id' values of rows whose `column` is non-null and contained in `allowed`.
    # The notna() guard keeps NaN from ever matching, as with `==` / `in`.
    keep = frame[column].notna() & frame[column].isin(allowed)
    return set(frame.loc[keep, 'id'].tolist())

  print("Get valid resources")
  # One pass to collect every referenced parent id replaces a full-table scan
  # per row (quadratic in the number of resources).
  parent_ids = set(rr['parent_id'].dropna().tolist())
  valid_resource_ids = _ids_where(rr, 'id', parent_ids)

  print("Get valid resource nodes")
  valid_resource_node_ids = _ids_where(rrn, 'resource_id', valid_resource_ids)

  print("Get valid audit ids")
  valid_audit_ids = _ids_where(ra, 'audited_id', valid_resource_node_ids)

  return valid_audit_ids, valid_resource_node_ids, valid_resource_ids

def get_stream(audit_id, file_id, streams, timestamp, operation):
  """Pick the stream row that best matches an audit event.

  Args:
    audit_id: Unused; kept for the caller's signature.
    file_id: Unused; kept for the caller's signature.
    streams: DataFrame of candidate stream rows with a ``timestamp`` column.
    timestamp: Time of the audit event, comparable with ``streams['timestamp']``.
    operation: Audit operation name.

  Returns:
    The chosen row. A single candidate is always returned as is. For
    'MODIFIED', 'FILE_UPLOADED' and 'RENAMED' the earliest stream at or after
    ``timestamp`` wins, falling back to the latest one before it. For every
    other operation the latest stream at or before ``timestamp`` wins, falling
    back to the earliest one after it.
  """
  if len(streams) == 1:
    return streams.iloc[0]

  ordered = streams.sort_values(by="timestamp")
  at_or_after = ordered[ordered['timestamp'] >= timestamp]
  at_or_before = ordered[ordered['timestamp'] <= timestamp]

  # The file was changed by the event, so the resulting content is the first
  # stream written at or after it.
  if operation in ['MODIFIED', 'FILE_UPLOADED', 'RENAMED']:
    if len(at_or_after) != 0:
      return at_or_after.iloc[0]
    return at_or_before.iloc[-1]

  if len(at_or_before) != 0:
    return at_or_before.iloc[-1]
  return at_or_after.iloc[0]

def get_file_hash_as_of_audit(audit_id, operation, timestamp):
  """Resolve the content hash that an audit event refers to.

  Args:
    audit_id: Id looked up in ``resource_resourcenode['id']``.
    operation: Audit operation name, forwarded to ``get_stream``.
    timestamp: Time of the audit event, forwarded to ``get_stream``.

  Returns:
    tuple: ``(hash_id, True)`` when a STREAM could be resolved, or
    ``(file_id, False)`` when the resource is a FILE without any child stream.

  Raises:
    ValueError: No resource node matches ``audit_id``, or the node's resource
      does not exist.
    AssertionError: The resource is neither a STREAM nor a FILE.
  """
  rrn, rr = DFS['resource_resourcenode'], DFS['resource_resource']
  resource_node = rrn[rrn['id'] == audit_id]
  resource_id = resource_node['resource_id']

  # Assume only one resource id associated with audit id?
  if len(resource_id) == 0:
    raise ValueError(f"No resource id found. Audit id: {audit_id}")
  resource_id = resource_id.iloc[0]

  # Grab the resource referenced by the audit log event
  resource = rr[rr['id'] == resource_id]
  if len(resource) == 0:
    raise ValueError(f"No resource found: Audit id: {audit_id}")

  # Work our way down to the relevant STREAM, or in other words actual data, relevant for this audit log event
  file_or_stream = resource.iloc[0]

  if file_or_stream['resource_type'] != "STREAM":
      # Must be a file
      assert file_or_stream['resource_type'] == "FILE"
      file = file_or_stream
      streams = rr[rr['parent_id'] == file['id']]
      # `len(...)` is never None, so the old `is None` test could not fire and
      # an empty frame crashed inside get_stream instead of taking this path.
      if len(streams) == 0:
        # If no streams, at least return file id
        return file['id'], False
      stream = get_stream(audit_id, file['id'], streams, timestamp, operation)
      hash_id = stream['hash_id']
  else:
      # If stream, directly get hash id
      hash_id = file_or_stream['hash_id']

  return hash_id, True

# User/File Embeddings

In [None]:
# Load embedding model
from sentence_transformers import SentenceTransformer

# Embedding model shared by get_embedding_for_text(). The topic-embedding cache
# file names in get_cache()/save_to_cache() carry this model's short name, so
# switching models here should go together with a new cache file prefix.
model_name = 'BAAI/bge-large-zh-v1.5'
model = SentenceTransformer(model_name)

In [None]:
# FILE/TOPIC embeddings (get topic embeddings as we calculate file embeddings)
import numpy as np
from tqdm import tqdm

def average_embeddings(embeddings):
  """Return the element-wise mean of a sequence of equal-length embeddings."""
  stacked = np.asarray(embeddings)
  return stacked.mean(axis=0)

def get_embedding_for_text(text: str):
  """Return the embedding of ``text``, computing it at most once.

  Lookups are case-insensitive: the lower-cased text is both the cache key in
  EMBEDDINGS_CACHE and the string handed to the model.
  """
  key = text.lower()
  if key not in EMBEDDINGS_CACHE:
    # normalize for cosine similarity
    EMBEDDINGS_CACHE[key] = model.encode(key, normalize_embeddings=True)
  return EMBEDDINGS_CACHE[key]

def get_file_embeddings():
  """Build ``{hash_id: {"labels": [...], "embedding": vector}}`` from topic labels.

  Only ``resource_label`` rows named "topic" are used. Each file's embedding is
  the mean of the embeddings of its topic values.
  """
  topic_rows = DFS['resource_label']
  topic_rows = topic_rows[topic_rows['name'] == "topic"]

  files = {}
  print("Working through resource labels")
  for _, label_row in tqdm(topic_rows.iterrows(), total=len(topic_rows)):
    entry = files.setdefault(label_row['hash_id'], {
      "labels": [],
      "embedding": []
    })
    if label_row["name"] == "topic":
      entry["labels"].append({
        "id": label_row["id"],
        "name": label_row["value"]
      })

  print("Average topic embeddings for files")
  for entry in tqdm(files.values(), total=len(files)):
    vectors = [get_embedding_for_text(label['name']) for label in entry['labels']]
    entry['embedding'] = average_embeddings(vectors)
  return files

In [None]:
# USER embeddings, for each time-weight the relevant file-embeddings. Create dict of {user_id: embedding}
import math
from tqdm import tqdm

def aggregate_file_embeddings_per_user(valid_audit_ids):
  """Collect, per user, the embeddings of the files touched by their audit events.

  Args:
    valid_audit_ids: Ids of the audit records to process; all others are ignored.

  Returns:
    dict: ``{user_id: {"file_infos": [{"timestamp": datetime, "embedding": vector}, ...]}}``.
    A user is listed as soon as one of their records is examined, even when no
    file embedding could be attached.

  Side effects:
    Every record whose hash lookup succeeds is stored in AUDIT_TO_FILE_MAPPING
    (audit record id -> hash or file id), including files absent from FILES.
  """
  audit_rows = DFS['resource_auditrecord']
  audit_rows = audit_rows[audit_rows['id'].isin(valid_audit_ids)]

  users = {}
  # Audited ids whose lookup already failed once; they are not retried.
  unresolved_ids = set()
  resolved_ids = set()

  print("Processing audit log records:")
  for _, record in tqdm(audit_rows.iterrows(), total=len(audit_rows)):
    audited_id = record['audited_id']
    if audited_id in unresolved_ids:
      continue

    user_info = users.setdefault(record['user_id'], {
      "file_infos": []
    })
    event_time = record["timestamp"]

    try:
      # Get most relevant file version as of audit time
      hash_id, found_streams = get_file_hash_as_of_audit(audited_id, record['operation'], event_time)
    except Exception:
      unresolved_ids.add(audited_id)
      continue

    # Cache audit id to file mapping for later metadata processing, even when no streams found
    AUDIT_TO_FILE_MAPPING[record['id']] = hash_id

    # No entry in FILES means no topic embedding exists for this content.
    if hash_id not in FILES:
      continue

    resolved_ids.add(audited_id)
    user_info['file_infos'].append({
      "timestamp": datetime.fromisoformat(event_time),
      "embedding": FILES[hash_id]["embedding"]
    })

  return users

from datetime import datetime

def get_user_embedding(user_info, tau=60*60*24*30): 
  '''
  Fold a user's time-ordered file embeddings into one embedding.

  Starting from the first file embedding, each later event is blended in with
  weight ``alpha = 1 - exp(-d_t / tau)``, where ``d_t`` is the time since the
  previous event, and the running embedding keeps weight ``1 - alpha``.

  1) tau defaults to 1 month: when a month has passed since the previous
     event, the old embedding keeps about 37% (e^-1) of its weight and the
     new file contributes the remaining 63%. Exponential decay.
  2) Unit of time is seconds.
  3) Events sharing a timestamp with their predecessor get alpha = 0 and so
     do not change the embedding.

  Args:
    user_info: dict whose 'file_infos' is a non-empty list of
      {"timestamp": datetime, "embedding": vector}, sorted by timestamp.
    tau: decay time constant in seconds.

  Returns:
    The blended embedding as a numpy array.

  Raises:
    AssertionError: the events are not in ascending timestamp order.
    IndexError: 'file_infos' is empty.
  '''
  history = user_info['file_infos']

  user_embedding = np.array(history[0]['embedding'])
  embedding_time = history[0]['timestamp']

  # Build up the index recursively
  for file_info in history[1:]:
    # d_t and tau should both be in seconds
    d_t = (file_info['timestamp'] - embedding_time).total_seconds()

    # Check before advancing embedding_time, so the message names both ends
    # of the offending interval rather than the same timestamp twice.
    assert d_t >= 0, f"User file access times must be ordered: from - {embedding_time}, to - {file_info['timestamp']}"
    embedding_time = file_info['timestamp']

    # Apply the weighting
    alpha = 1 - math.exp(-d_t / tau)
    user_embedding = alpha * np.array(file_info['embedding']) + (1 - alpha) * user_embedding

  return user_embedding

def get_user_embeddings(users):
  """Attach a time-weighted "embedding" to every user that has file events.

  Each user's 'file_infos' list is replaced by a copy sorted by timestamp.
  Users without events are reported on stdout and get no "embedding" key.
  Returns the same ``users`` dict.
  """
  # 2.1) Time weight the file embeddings
  print("Calculating user embeddings: ")
  for user_id, info in tqdm(users.items()):
    history = sorted(info['file_infos'], key=lambda event: event['timestamp'])
    info['file_infos'] = history
    if history:
      info["embedding"] = get_user_embedding(info)
    else:
      print(f"User has no files {user_id}")
  return users

In [None]:
# Inspection: how many audit records are currently mapped to a file.
len(AUDIT_TO_FILE_MAPPING)

In [None]:
# Inspection: show the first cached user record (IndexError when USERS is empty).
USERS[list(USERS.keys())[0]]

In [None]:
# Rebuild from scratch: FILES and USERS are cleared and the audit -> file map is
# reset, while the topic embedding cache is kept so already-embedded texts are reused.
FILES, USERS, AUDIT_TO_FILE_MAPPING, EMBEDDINGS_CACHE = None, None, dict(), EMBEDDINGS_CACHE
FILES = get_file_embeddings()

# This step is an optimization, don't bother processing audit ids associated with files that haven't been indexed
valid_audit_ids, _, _ = get_valid_audit_ids()
# Also fills AUDIT_TO_FILE_MAPPING as a side effect.
USERS_AGG = aggregate_file_embeddings_per_user(valid_audit_ids)

In [None]:
# get_user_embeddings() updates USERS_AGG in place and returns it, so USERS and
# USERS_AGG are the same dict afterwards.
USERS = get_user_embeddings(USERS_AGG)

In [None]:
# Persist the freshly computed embeddings as a new timestamped set of pickles.
save_to_cache(AUDIT_TO_FILE_MAPPING, EMBEDDINGS_CACHE, FILES, USERS)

# Meta Embeddings

In [None]:
# Meta helpers
import math
from collections import Counter
from datetime import datetime, timezone, timedelta
from collections import Counter
import numpy as np

def get_interarrivals(timestamps):
  """Return the gaps, in seconds, between consecutive timestamps.

  ``timestamps`` must be in ascending order (an AssertionError is raised
  otherwise). Fewer than two timestamps yield an empty list.
  """
  if len(timestamps) <= 1:
    return []

  gaps = []
  for earlier, later in zip(timestamps, timestamps[1:]):
    gap = (later - earlier).total_seconds()
    assert gap >= 0, f"not ordered: start_time: {earlier}, next_time: {later}"
    gaps.append(gap)
  return gaps

def get_mit(interarrivals):
  """Return the mean interarrival time, or -1 when there are no interarrivals."""
  if len(interarrivals) > 0:
    return sum(interarrivals) / len(interarrivals)
  return -1

def get_sit(interarrivals, mean):
  """Return the population standard deviation of ``interarrivals`` around ``mean``.

  Returns -1 when there are no interarrivals.
  """
  squared = list(map(lambda gap: math.pow(gap - mean, 2), interarrivals))
  if not squared:
    return -1
  variance = sum(squared) / len(squared)
  return math.sqrt(variance)

def get_fft_info(buckets):
  """Return up to three ``(frequency, amplitude, phase)`` tuples of a count series.

  The tuples are the non-DC FFT bins with the largest amplitudes, strongest
  first. Frequencies are in cycles per bucket (sample spacing 1). A series of
  length one yields an empty list.
  """
  series = np.array(buckets)
  spectrum = np.fft.fft(series)
  freqs = np.fft.fftfreq(len(series), d=1)
  magnitude = np.abs(spectrum)
  angle = np.angle(spectrum)

  if len(magnitude) <= 1:
    return []

  # Rank everything except bin 0 (the DC component); +1 restores the offset.
  strongest = np.argsort(magnitude[1:])[::-1][:3] + 1
  return [(freqs[k], magnitude[k], angle[k]) for k in strongest]

def get_burstiness(interarrivals, meta):
  """Placeholder for burstiness statistics; currently does nothing and returns None."""
  # TODO: not implemented yet
  pass

def get_time_based_meta(start, end, interval, meta, operation, time_str, times):
  """Bucket ``times`` into fixed windows and record counts plus FFT features.

  Windows are ``[start, start + interval)``, ``[start + interval, ...)`` and so
  on while the window start is before ``end``. Results are written into
  ``meta['times']`` under ``{operation}_{time_str}_{i}`` (count of window i)
  and ``{operation}_{time_str}_freq_{i}`` / ``_amp_{i}`` / ``_phase_{i}`` for
  the i-th entry returned by ``get_fft_info``.

  ``times`` must be sorted ascending: a single cursor walks it once, so
  timestamps earlier than ``start`` are skipped and never counted.
  """
  counts = []
  cursor = 0
  total = len(times)
  window_start = start
  while window_start < end:
    window_end = window_start + interval
    hits = 0
    # Consume every timestamp before this window's end.
    while cursor < total and times[cursor] < window_end:
      if times[cursor] >= window_start:
        hits += 1
      cursor += 1
    counts.append(hits)
    window_start = window_end

  prefix = f'{operation}_{time_str}'
  for position, hits in enumerate(counts):
    meta['times'][f'{prefix}_{position}'] = hits
  for position, (freq, amp, phase) in enumerate(get_fft_info(counts)):
    meta['times'][f'{prefix}_freq_{position}'] = freq
    meta['times'][f'{prefix}_amp_{position}'] = amp
    meta['times'][f'{prefix}_phase_{position}'] = phase

def get_entropy(values, normalized=True):
  """Return the Shannon entropy (base 2) of the value distribution in ``values``.

  With ``normalized=True`` the entropy is divided by ``log2(k)`` for ``k``
  distinct values, giving a result in [0, 1]; a single distinct value gives
  0.0. Empty input gives 0.0.
  """
  if not values:
    return 0.0

  frequencies = Counter(values)
  n_items = sum(frequencies.values())
  shannon = -sum(
    share * math.log2(share)
    for share in (count / n_items for count in frequencies.values())
    if share > 0
  )

  distinct = len(frequencies)
  if normalized:
    return shannon / math.log2(distinct) if distinct > 1 else 0.0
  return shannon

def setup_metadata_from_audit_log(metadata, row_info):
  """Record one audit event in a user's or file's raw metadata.

  Args:
    metadata: dict updated in place. ``metadata['locs']`` collects
      "<geolocation>-<client_ip>" strings; ``metadata['times']`` maps
      'all_ops' and each operation name to a list of
      ``{"timestamp": datetime, "user": user_id}`` events.
    row_info: mapping with 'geolocation', 'client_ip', 'operation',
      'timestamp' (ISO-8601 string) and 'user_id'.
  """
  # Add location information
  locs = metadata.setdefault('locs', list())
  # Double quotes outside, single quotes inside: reusing the same quote in a
  # replacement field is a SyntaxError before Python 3.12.
  locs.append(f"{row_info['geolocation']}-{row_info['client_ip']}")

  # Add timestamps for operations
  operation = row_info['operation']
  times = metadata.setdefault('times', {})

  all_operations = times.setdefault('all_ops', [])
  operation_times = times.setdefault(operation, [])
  # The same event object is shared by the per-operation and the 'all_ops' list.
  event = {
     "timestamp": datetime.fromisoformat(row_info['timestamp']),
     "user": row_info["user_id"]
  }
  # Per operation
  operation_times.append(event)
  # All operations
  all_operations.append(event)

def fill_metadata_stats_for_file_or_users(metadata, users, num_top=3):
  """Derive summary statistics from the raw events collected in ``metadata``.

  Args:
    metadata: dict with 'locs' and 'times' as filled by
      ``setup_metadata_from_audit_log``; updated in place.
    users: user id of every event (one entry per event), used for the user
      distribution features.
    num_top: how many top users and top locations to record.

  Writes 'mit', 'sit', 'recency' (seconds between NOW and the latest event),
  'ud_entropy', 'unique_users', 'top_user_{i}' / 'top_user_{i}_per',
  'unique_locs', 'top_loc_{i}' / 'top_loc_{i}_per', and the bucket/FFT
  features produced by ``get_time_based_meta`` for every operation in
  UNIQUE_OPS and for 'all_ops'.
  """
  events_by_time = sorted(metadata['times']['all_ops'], key=lambda op: op['timestamp'])
  sorted_ops = [event['timestamp'] for event in events_by_time]
  interarrivals = get_interarrivals(sorted_ops)

  # mean / std interarrival time and recency
  metadata['mit'] = get_mit(interarrivals)
  metadata['sit'] = get_sit(interarrivals, metadata['mit'])
  metadata['recency'] = (NOW - sorted_ops[-1]).total_seconds()

  # user distribution: entropy, distinct count, most frequent users
  metadata['ud_entropy'] = get_entropy(users)
  metadata['unique_users'] = len(set(users))
  for rank, (user, hits) in enumerate(Counter(users).most_common(num_top)):
    metadata[f'top_user_{rank}'] = user
    metadata[f'top_user_{rank}_per'] = hits / len(users)

  # location distribution: distinct count, most frequent locations
  locs = metadata['locs']
  metadata['unique_locs'] = len(set(locs))
  for rank, (loc, hits) in enumerate(Counter(locs).most_common(num_top)):
    metadata[f'top_loc_{rank}'] = loc
    metadata[f'top_loc_{rank}_per'] = hits / len(locs)

  # (look-back span, bucket width, key label) for the time-bucket features
  windows = (
    (timedelta(days=7), timedelta(hours=1), 'hour'),
    (timedelta(weeks=4), timedelta(days=1), 'day'),
    (timedelta(weeks=52), timedelta(weeks=4), 'month'),
  )

  # time buckets and FFT for specific operations
  if metadata.get("times"):
    for OP in UNIQUE_OPS:
      events = metadata["times"].get(OP)
      op_times = []
      if events:
        op_times = [e['timestamp'] for e in sorted(events, key=lambda operation: operation["timestamp"])]
      for span, step, label in windows:
        get_time_based_meta(NOW - span, NOW, step, metadata, OP, label, op_times)

  # time buckets and FFT for 'all' operations
  for span, step, label in windows:
    get_time_based_meta(NOW - span, NOW, step, metadata, "all_ops", label, sorted_ops)

  # burstiness stats
  get_burstiness(interarrivals, metadata)

def clean_up_metadata(metadata):
  """Drop the raw event lists once the derived statistics have been written.

  Removes 'locs', ``times['all_ops']`` and every non-empty per-operation list
  in ``times``. Raises KeyError when 'locs' or 'all_ops' is missing.
  """
  metadata.pop('locs')
  times = metadata['times']
  times.pop('all_ops')
  for OP in UNIQUE_OPS:
    if times.get(OP):
      times.pop(OP)

In [None]:
# USER: Extract meta features from audit log
from tqdm import tqdm

def get_user_info_from_auditlog(users, ra):
  """Collect raw per-user metadata from every audit record in ``ra``.

  Args:
    users: ``{user_id: user_info}``; updated in place. Users that appear in
      the audit log but not yet in ``users`` are added, the same way
      ``get_file_info_from_auditlog`` adds unknown files.
    ra: audit-record DataFrame to scan.
  """
  print("Iterate Audit Log")
  for _, row_info in tqdm(ra.iterrows(), total=len(ra)):
    uid = row_info['user_id']
    # A missing user id cannot serve as a key: every NaN would become its own
    # entry because NaN never compares equal to itself.
    if pd.isna(uid):
      continue

    # Register unknown users. Previously their record was built in a
    # throwaway dict, so the metadata collected for them was lost.
    # No 'embedding' key is pre-filled: get_embedding() treats a missing key
    # as "no topic vector" and substitutes zeros.
    user_info = users.setdefault(uid, {"file_infos": []})

    # Create metadata object for user if doesn't exist
    metadata = user_info.setdefault("metadata", {})

    setup_metadata_from_audit_log(metadata, row_info)


def get_meta_features_users(users, num_top_locs=3, limit=1000000000):
  """Recompute the 'metadata' of every user from the audit log.

  Args:
    users: ``{user_id: user_info}``; updated in place and returned.
    num_top_locs: accepted for the caller's convenience but not used here.
    limit: only the first ``limit`` audit records are read.
  """
  audit_rows = DFS['resource_auditrecord'].head(limit)

  # Delete any existing metadata so we don't mix between runs
  for user_info in users.values():
    if user_info.get("metadata"):
      del user_info['metadata']

  get_user_info_from_auditlog(users, audit_rows)

  print("Update Users Metadata")
  for user, user_info in tqdm(users.items(), total=len(users)):
    metadata = user_info.get('metadata')
    if not metadata:
      continue
    if not metadata.get('locs'):
      print(f"No activity for user {user}")
      continue

    # Every event belongs to this one user, hence the single-element list.
    fill_metadata_stats_for_file_or_users(metadata, [user])
    clean_up_metadata(metadata)

  return users

In [None]:
# FILE: Extract meta features from audit log
from collections import Counter

def get_file_info_from_auditlog(files, ra):
  """Collect raw per-file metadata from the audit records in ``ra``.

  Only records present in AUDIT_TO_FILE_MAPPING are used. Files the mapping
  points at but that are missing from ``files`` are added as empty entries.
  ``files`` is updated in place.
  """
  print("Iterate Audit Log")
  for _, row_info in tqdm(ra.iterrows(), total=len(ra)):
    audit_log_id = row_info['id']
    if audit_log_id in AUDIT_TO_FILE_MAPPING:
      file_hash = AUDIT_TO_FILE_MAPPING[audit_log_id]
      # Create the file entry and its metadata object if they don't exist yet
      metadata = files.setdefault(file_hash, {}).setdefault("metadata", {})
      setup_metadata_from_audit_log(metadata, row_info)

def get_meta_features_files(files, num_top_locs=3, limit=1000000000):
  """Recompute the 'metadata' of every file from the audit log.

  Args:
    files: ``{file_hash: file_info}``; updated in place and returned.
    num_top_locs: accepted for the caller's convenience but not used here.
    limit: only the first ``limit`` audit records are read.
  """
  audit_rows = DFS['resource_auditrecord'].head(limit)

  # Delete any existing metadata so we don't mix between runs
  for file_info in files.values():
    if file_info.get("metadata"):
      del file_info['metadata']

  get_file_info_from_auditlog(files, audit_rows)

  print("Update Files Metadata")
  for file_info in tqdm(files.values(), total=len(files)):
    metadata = file_info.get('metadata')
    if not metadata or not metadata.get('locs'):
      continue

    # One user id per recorded event, in recording order.
    users = [op['user'] for op in metadata['times']['all_ops']]
    fill_metadata_stats_for_file_or_users(metadata, users)
    clean_up_metadata(metadata)
  return files

In [None]:
# Passed to DataFrame.head(), so a value this large keeps every audit record
# unless the log has more rows than this. Lower it for a quick partial run.
limit = 10000000000000
USERS = get_meta_features_users(USERS, limit=limit)
FILES = get_meta_features_files(FILES, limit=limit)

In [None]:
# Persist again, now that USERS and FILES also carry their metadata features.
save_to_cache(AUDIT_TO_FILE_MAPPING, EMBEDDINGS_CACHE, FILES, USERS)

In [None]:
# Explore data

from tqdm import tqdm

# Short aliases for the tables, for ad-hoc queries like the commented ones below.
ra = DFS['resource_auditrecord']
rrn = DFS['resource_resourcenode']
rr = DFS['resource_resource']
rl = DFS['resource_label']
rh = DFS['resource_hash']
# rrn[rrn['id'] == 2444061]
# rr[rr['id'] == 814033]
# rr[rr['parent_id'] == 814033]
# rl[rl['hash_id'] == 78048]
# ra.head()
# ra[ra['audited_id'] == 2444061.0]
# len(rr['id'].unique())
# rr[rr['resource_type'] == "FILE"]
# rrn = dfs['resource_resourcenode']
# len(rrn['resource_id'].unique())

# Membership probe for one id among the mapped values. The result is discarded
# because this is not the last expression of the cell.
340 in set(AUDIT_TO_FILE_MAPPING.values())

# Pick an arbitrary file by position (IndexError when FILES has 4000 entries or fewer).
index = 4000
# users_list = list(USERS.keys())
# user_name = users_list[index]
files_list = list(FILES.keys())
file_hash = files_list[index]
# USERS[user_name]['metadata']
# Count the files that ended up with non-empty metadata.
# NOTE(review): the loop variable `hash` shadows the builtin for the rest of the session.
count = 0
for hash, file in FILES.items():
  if file.get("metadata"):
    count += 1

print(count)
# file_hash


# Training

In [None]:
# PCA

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np

def perform_pca_on_metadata(data, metadata_features, n_components=None, variance_threshold=0.95):
    """
    Perform PCA on metadata features of the dataset, properly handling both numerical and categorical features.

    Numeric columns (int64/float64) are mean-imputed and standardized. Object,
    category and bool columns are imputed with the constant 'missing' and
    one-hot encoded. Columns of any other dtype are dropped.

    Args:
        data (pd.DataFrame): The input DataFrame containing all features
        metadata_features (list): List of column names for metadata features
        n_components (optional): Passed to ``PCA``. If None, the smallest number
            of components whose cumulative explained variance ratio reaches
            variance_threshold is used
        variance_threshold (float, optional): Minimum cumulative explained variance ratio to maintain

    Returns:
        tuple: (transformed_data, pca_object, preprocessor_object)
            - transformed_data: DataFrame of PCA-transformed features (columns PC1..PCk, indexed like ``data``)
            - pca_object: Fitted PCA object for future use
            - preprocessor_object: Fitted ColumnTransformer object for future use
    """
    # Extract metadata features
    X = data[metadata_features].copy()

    # Print information about missing values
    n_missing = X.isnull().sum()
    if n_missing.any():
        print("\nMissing values per feature:")
        print(n_missing[n_missing > 0])

    # Identify numeric and categorical columns
    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object', 'category', 'bool']).columns

    # Create preprocessing steps for both numeric and categorical features
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(drop='if_binary', sparse_output=False, handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='drop'  # Drop any columns that don't fit either category
    )

    # Fit and transform the data
    X_preprocessed = preprocessor.fit_transform(X)

    # Get feature names after preprocessing
    numeric_features_out = list(numeric_features)
    if len(categorical_features) > 0:
        categorical_features_out = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features)
    else:
        categorical_features_out = []
    feature_names = numeric_features_out + list(categorical_features_out)

    # If n_components not specified, use variance threshold
    if n_components is None:
        # Start with min(n_samples, n_features) components
        n_features = min(X_preprocessed.shape[0], X_preprocessed.shape[1])
        pca_full = PCA(n_components=n_features)
        pca_full.fit(X_preprocessed)

        # Find number of components needed to explain variance_threshold of variance
        cumsum = np.cumsum(pca_full.explained_variance_ratio_)
        reached = np.flatnonzero(cumsum >= variance_threshold)
        # If rounding keeps the cumulative sum below the threshold, keep every
        # component. np.argmax on an all-False mask would return 0 and
        # silently select a single component.
        n_components = int(reached[0]) + 1 if reached.size else len(cumsum)

    # Perform PCA with determined number of components
    pca = PCA(n_components=n_components)
    X_transformed = pca.fit_transform(X_preprocessed)

    # Take the component count from the result, so a caller-supplied
    # n_components that is not a plain int still yields correct column names.
    n_fitted = X_transformed.shape[1]

    # Create DataFrame with transformed features
    columns = [f'PC{i+1}' for i in range(n_fitted)]
    X_pca = pd.DataFrame(X_transformed, columns=columns, index=data.index)

    # Print information about the transformation
    print(f"\nNumber of numerical features: {len(numeric_features)}")
    print(f"Number of categorical features: {len(categorical_features)}")
    print(f"Total features after one-hot encoding: {X_preprocessed.shape[1]}")
    print(f"Number of PCA components: {n_fitted}")
    print(f"Total variance explained: {pca.explained_variance_ratio_.sum():.3f}")

    # Optional: Print top feature contributions to first few components
    if len(feature_names) > 0:
        n_top = 3  # Number of top features to show
        n_components_to_show = min(3, n_fitted)  # Number of components to show

        print("\nTop feature contributions to first few principal components:")
        for i in range(n_components_to_show):
            loadings = pd.Series(
                pca.components_[i],
                index=feature_names
            ).abs().sort_values(ascending=False)

            print(f"\nPC{i+1} top {n_top} features:")
            print(loadings.head(n_top))

    return X_pca, pca, preprocessor

In [None]:
# Concatenate embeddings

def get_df_entry(metadata_info, id, is_user):
    """Flatten one metadata dict into a single flat row dict.

    The row starts with 'is_user' and 'object_id'. Every key of
    ``metadata_info`` is copied as is, except 'times', whose nested entries
    are lifted to the top level under their own keys.
    """
    entry = {'is_user': is_user, 'object_id': id}
    for key, value in metadata_info.items():
        if key != "times":
            entry[key] = value
            continue
        entry.update(value.items())
    return entry

def create_metadata_df(users, files):
    """Build one DataFrame row per user/file that has metadata.

    Args:
        users: ``{user_id: user_info}``; rows are created from ``user_info['metadata']``.
        files: ``{file_hash: file_info}``; entries with a missing (NaN) key or
            without metadata are skipped.

    Returns:
        pd.DataFrame: flattened metadata rows (see ``get_df_entry``) with every
        column name prefixed by "META_".
    """
    # Get all metadata objects
    metadata_objs = []
    for user_id, user_info in users.items():
        metadata = user_info.get("metadata")
        # A user without a metadata entry is skipped, like files below,
        # instead of aborting the whole build with a KeyError.
        if metadata is None:
            continue
        metadata_objs.append(get_df_entry(metadata, user_id, True))
    for file_hash, file_info in files.items():
        # Remove nan keys. pd.isna also accepts non-numeric keys, on which
        # np.isnan raises TypeError.
        if pd.isna(file_hash):
            continue
        if file_info.get("metadata"):
            metadata_objs.append(get_df_entry(file_info['metadata'], file_hash, False))
    print(f"Number of metadata objs {len(metadata_objs)}")
    df = pd.DataFrame(metadata_objs)
    df = df.add_prefix("META_")
    return df

def create_topic_embeddings_df(users, files):
  """Build a DataFrame with one topic-embedding row per user and per file.

  Each row is the embedding returned by ``get_embedding`` followed by the
  object's id and an is-user flag. Columns 1024 and 1025 are renamed to
  'object_id' and 'is_user', then every column gets the "TOPIC_" prefix.
  """
  rows = []
  # Add object ID and user flag so we can join with metadata embeddings
  for user_id, user_info in users.items():
    rows.append(get_embedding(user_info) + [user_id, True])
  for file_id, file_info in files.items():
    rows.append(get_embedding(file_info) + [file_id, False])

  print(f"Number of topic embeddings {len(rows)}")
  frame = pd.DataFrame(rows)
  renamed = frame.rename(columns={1024: 'object_id', 1025: 'is_user'})
  return renamed.add_prefix("TOPIC_")

# Feature concatenation
def get_embedding(user_info, embedding_size=1024):
  """Return the object's embedding as a list, or zeros when it has none.

  ``user_info['embedding']`` must offer ``.tolist()`` (e.g. a numpy array).
  Without an 'embedding' key, a list of ``embedding_size`` zeros is returned.
  """
  if 'embedding' not in user_info:
    return [0] * embedding_size
  return user_info['embedding'].tolist()

def concatenate_topic_meta_embeddings(topic_df, meta_df):
  """Left-join the topic embeddings onto the metadata embeddings.

  Args:
    topic_df: DataFrame with 'TOPIC_object_id' (and normally 'TOPIC_is_user').
    meta_df: DataFrame with 'META_object_id' (and normally 'META_is_user').

  Returns:
    pd.DataFrame: every row of ``meta_df`` with the matching topic columns
    appended (NaN where no topic row matches).

  Note:
    Both id columns are converted to ``str`` in place on the passed frames.
  """
  # Ensure object_id is the right type in both dataframes
  topic_df['TOPIC_object_id'] = topic_df['TOPIC_object_id'].astype(str)
  meta_df['META_object_id'] = meta_df['META_object_id'].astype(str)

  # User ids and file ids come from different id spaces and may collide.
  # Matching on the user/file flag as well stops a user row from being paired
  # with a file's topic embedding (and the other way round), which would also
  # duplicate rows. Fall back to the id alone when a flag column is missing.
  left_keys, right_keys = ['META_object_id'], ['TOPIC_object_id']
  if 'META_is_user' in meta_df.columns and 'TOPIC_is_user' in topic_df.columns:
    left_keys.append('META_is_user')
    right_keys.append('TOPIC_is_user')

  # LEFT join to keep all rows from meta_df and only matching rows from topic_df
  merged_df = pd.merge(meta_df, topic_df, left_on=left_keys, right_on=right_keys, how='left')
  return merged_df

In [None]:
# Flatten the per-object dicts into two frames that share an object id column:
# metadata features (META_ prefix) and topic embeddings (TOPIC_ prefix).
metadata_df = create_metadata_df(USERS, FILES)
topic_embeddings_df = create_topic_embeddings_df(USERS, FILES)

In [None]:
# Keep the identification columns
# Identification columns are held out of PCA and re-attached afterwards.
id_columns = ["META_is_user", "META_object_id"]
features_names = list(metadata_df.columns)
for col in id_columns:
    features_names.remove(col)

# Perform PCA
# NOTE(review): the feature list still includes the META_top_user_* /
# META_top_loc_* columns, which hold raw ids and location strings - confirm
# they are meant to be PCA inputs.
metadata_embeddings_df, pca, preprocessor = perform_pca_on_metadata(metadata_df, features_names)

# Add back the identification columns (aligned on the shared index)
for col in id_columns:
    metadata_embeddings_df[col] = metadata_df[col]

In [None]:
# Merge topic/meta embeddings
# One row per metadata object: PCA components plus the matching topic embedding.
merged_df = concatenate_topic_meta_embeddings(topic_embeddings_df, metadata_embeddings_df)

In [None]:
# Contrastive loss learning

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Neural network to learn optimal lambda for combining embeddings
class EmbeddingBalancer(nn.Module):
    """Learn a per-sample weight (lambda) for mixing topic and meta embeddings.

    Both inputs are encoded to ``hidden_dim``; the concatenated encodings feed
    a small network ending in a sigmoid, which produces lambda in (0, 1).

    Args:
        topic_dim: width of the topic embedding.
        meta_dim: width of the metadata embedding.
        hidden_dim: width of the encoders' hidden and output layers.
    """

    def __init__(self, topic_dim, meta_dim, hidden_dim=128):
        super().__init__()
        # The raw inputs can only be mixed element-wise when they share a width.
        self.blend_raw_inputs = topic_dim == meta_dim

        self.topic_encoder = nn.Sequential(
            nn.Linear(topic_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim)
        )

        self.meta_encoder = nn.Sequential(
            nn.Linear(meta_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim)
        )

        self.lambda_net = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()  # Lambda should be between 0 and 1
        )

    def forward(self, topic_emb, meta_emb):
        """Return ``(weighted_emb, lambda_weight)`` for a batch.

        Args:
            topic_emb: tensor of shape (batch, topic_dim).
            meta_emb: tensor of shape (batch, meta_dim).

        Returns:
            weighted_emb: ``lambda * topic + (1 - lambda) * meta``. The raw
                inputs are mixed when ``topic_dim == meta_dim``, giving shape
                (batch, topic_dim). Otherwise they cannot be added, so their
                ``hidden_dim`` encodings are mixed instead, giving shape
                (batch, hidden_dim).
            lambda_weight: tensor of shape (batch, 1).
        """
        topic_encoded = self.topic_encoder(topic_emb)
        meta_encoded = self.meta_encoder(meta_emb)
        combined = torch.cat([topic_encoded, meta_encoded], dim=1)
        lambda_weight = self.lambda_net(combined)

        if self.blend_raw_inputs:
            topic_part, meta_part = topic_emb, meta_emb
        else:
            # Different widths: mix in the shared hidden space instead of
            # failing with a shape mismatch.
            topic_part, meta_part = topic_encoded, meta_encoded

        # Weighted combination of embeddings
        weighted_emb = lambda_weight * topic_part + (1 - lambda_weight) * meta_part
        return weighted_emb, lambda_weight

# Dataset for contrastive learning
class ContrastiveDataset(Dataset):
    """Serve (anchor, positive, negatives) samples for contrastive training.

    For every row the positive is another row of the same kind (user or file)
    and the negatives are rows of the opposite kind, drawn with replacement
    through ``np.random``.

    All sampling works on *row positions*, not on DataFrame index labels, so
    the dataset behaves the same for filtered, concatenated or otherwise
    re-indexed frames. ``user_indices`` / ``file_indices`` therefore hold
    positions into the embedding tensors.

    NOTE(review): other cells in this notebook read ``TOPIC_``/``META_``
    prefixed columns from ``merged_df``. Confirm the frame passed here really
    exposes the default '0'..'1023', 'PC*' and 'is_user' columns, or pass the
    column overrides below.

    Args:
        merged_df: Frame holding topic columns, meta columns and a user flag.
        num_negatives: Number of negatives returned per anchor.
        topic_cols: Topic embedding columns; defaults to '0'..'1023'.
        meta_cols: Meta embedding columns; defaults to every column whose name
            starts with 'PC'.
        is_user_col: Name of the boolean user/file flag column.
    """

    def __init__(self, merged_df, num_negatives=5, topic_cols=None,
                 meta_cols=None, is_user_col='is_user'):
        self.df = merged_df
        self.num_negatives = num_negatives

        # Separate topic and metadata embeddings
        if topic_cols is None:
            topic_cols = [str(i) for i in range(1024)]  # First 1024 columns are topic embeddings
        if meta_cols is None:
            meta_cols = [col for col in merged_df.columns if str(col).startswith('PC')]  # PCA components
        self.topic_cols = list(topic_cols)
        self.meta_cols = list(meta_cols)

        self.topic_embeddings = torch.FloatTensor(merged_df[self.topic_cols].values)
        self.meta_embeddings = torch.FloatTensor(merged_df[self.meta_cols].values)

        # Group by user vs file, as sorted row positions. Element-wise
        # comparison (not truthiness) keeps rows with a missing flag out of
        # both pools, as before.
        flags = merged_df[is_user_col]
        self._is_user_mask = (flags == True).to_numpy()
        self._user_pool = np.flatnonzero(self._is_user_mask).astype(np.int64)
        self._file_pool = np.flatnonzero((flags == False).to_numpy()).astype(np.int64)
        self.user_indices = self._user_pool.tolist()
        self.file_indices = self._file_pool.tolist()

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _sample_positive(pool, idx):
        """Pick a random position from the sorted ``pool`` other than ``idx``.

        Falls back to ``idx`` itself when it is the pool's only member, since
        no other positive exists.
        """
        if pool.size == 0:
            raise ValueError("No candidate rows available to sample a positive example")
        slot = int(np.searchsorted(pool, idx))
        anchor_in_pool = slot < pool.size and pool[slot] == idx
        if not anchor_in_pool:
            return int(pool[np.random.randint(pool.size)])
        if pool.size == 1:
            return int(idx)
        # Draw from the pool with the anchor's slot skipped: O(1) instead of
        # rebuilding a filtered list for every item.
        pick = np.random.randint(pool.size - 1)
        if pick >= slot:
            pick += 1
        return int(pool[pick])

    def __getitem__(self, idx):
        """Return a dict of anchor/positive tensors and stacked negatives.

        Raises:
            ValueError: If there is no row of the opposite kind to use as a
                negative, or no row at all to use as a positive.
        """
        # Get anchor embeddings
        anchor_topic = self.topic_embeddings[idx]
        anchor_meta = self.meta_embeddings[idx]
        is_user = bool(self._is_user_mask[idx])

        # Get positive example (same type - user or file)
        pos_pool = self._user_pool if is_user else self._file_pool
        pos_idx = self._sample_positive(pos_pool, idx)
        positive_topic = self.topic_embeddings[pos_idx]
        positive_meta = self.meta_embeddings[pos_idx]

        # Get negative examples (opposite type)
        neg_pool = self._file_pool if is_user else self._user_pool
        if neg_pool.size == 0:
            raise ValueError("No rows of the opposite type available to sample negatives")
        neg_indices = torch.from_numpy(np.random.choice(neg_pool, size=self.num_negatives))
        negative_topics = self.topic_embeddings[neg_indices]
        negative_metas = self.meta_embeddings[neg_indices]

        return {
            'anchor_topic': anchor_topic,
            'anchor_meta': anchor_meta,
            'positive_topic': positive_topic,
            'positive_meta': positive_meta,
            'negative_topics': negative_topics,
            'negative_metas': negative_metas
        }

# Contrastive loss function
def contrastive_loss(anchor, positive, negatives, temperature=0.07):
    """Cross-entropy contrastive loss over cosine similarities.

    Each anchor is scored against its positive and each of its negatives with
    cosine similarity; the scores are divided by ``temperature`` and treated
    as logits of a classification problem whose correct class is the positive.

    Args:
        anchor: Tensor of shape (batch, dim).
        positive: Tensor of shape (batch, dim).
        negatives: Tensor of shape (batch, num_negatives, dim).
        temperature: Divisor applied to the similarities before softmax.

    Returns:
        Scalar tensor: mean cross-entropy over the batch.
    """
    sim_to_positive = torch.cosine_similarity(anchor, positive, dim=1).unsqueeze(1)
    sim_to_negatives = torch.cosine_similarity(anchor.unsqueeze(1), negatives, dim=2)

    # Column 0 holds the positive, so the target class is 0 for every row
    logits = torch.cat([sim_to_positive, sim_to_negatives], dim=1) / temperature
    targets = torch.zeros(anchor.size(0), dtype=torch.long, device=anchor.device)
    return nn.functional.cross_entropy(logits, targets)

# Training function
def train_balancer(model, train_loader, optimizer, device, num_epochs=10):
    """Train ``model`` with the contrastive objective and report epoch losses.

    Every batch is a dict with the keys 'anchor_topic', 'anchor_meta',
    'positive_topic', 'positive_meta', 'negative_topics' and 'negative_metas'.
    Anchor, positive and each negative are passed through ``model`` and the
    resulting embeddings are scored with ``contrastive_loss``.

    Batches are counted while iterating rather than read from
    ``len(train_loader)``, so loaders without a length are supported and an
    empty loader reports ``nan`` instead of dividing by zero.

    Args:
        model: Module called as ``model(topic, meta)`` returning
            ``(embedding, lambda)``. Only the batches are moved to ``device``
            here; the model is used as given.
        train_loader: Iterable of batch dicts.
        optimizer: Optimizer over the model's parameters.
        device: Device the batch tensors are moved to.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        List with the average loss of each epoch (``nan`` for an epoch that
        saw no batches).
    """
    model.train()
    epoch_losses = []
    for epoch in range(num_epochs):
        total_loss = 0.0
        num_batches = 0
        for batch in train_loader:
            # Move data to device
            anchor_topic = batch['anchor_topic'].to(device)
            anchor_meta = batch['anchor_meta'].to(device)
            positive_topic = batch['positive_topic'].to(device)
            positive_meta = batch['positive_meta'].to(device)
            negative_topics = batch['negative_topics'].to(device)
            negative_metas = batch['negative_metas'].to(device)

            # Forward pass for anchor and positive pairs
            anchor_combined, _ = model(anchor_topic, anchor_meta)
            positive_combined, _ = model(positive_topic, positive_meta)

            # Forward pass for negatives: one call per negative slot, stacked
            # back to (batch, num_negatives, dim)
            neg_combined = torch.stack(
                [
                    model(negative_topics[:, i], negative_metas[:, i])[0]
                    for i in range(negative_topics.size(1))
                ],
                dim=1,
            )

            # Compute loss
            loss = contrastive_loss(anchor_combined, positive_combined, neg_combined)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        avg_loss = total_loss / num_batches if num_batches else float('nan')
        epoch_losses.append(avg_loss)
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

    return epoch_losses

# Semantic Similarity

In [None]:
def get_semantic_similarity(emb1, emb2, metric='cosine'):
  """
  Score how similar two embeddings are under the chosen metric.

  Args:
      emb1 (array-like): First embedding vector
      emb2 (array-like): Second embedding vector, same shape as emb1
      metric (str): One of 'cosine', 'euclidean', 'dot'

  Returns:
      Similarity score (higher means more similar). 'cosine' yields 0.0 when
      either vector has zero norm; 'euclidean' is the negated distance.

  Raises:
      AssertionError: If the two embeddings differ in shape
      ValueError: If metric is not one of the supported names
  """
  vec_a, vec_b = np.array(emb1), np.array(emb2)

  # Both embeddings must live in the same space
  assert vec_a.shape == vec_b.shape, f"Embeddings must have same shape. Got {vec_a.shape} and {vec_b.shape}"

  if metric == 'dot':
    # Plain inner product
    return np.dot(vec_a, vec_b)

  if metric == 'euclidean':
    # Negated so that a smaller distance reads as a higher similarity
    return -np.linalg.norm(vec_a - vec_b)

  if metric != 'cosine':
    raise ValueError(f"Unknown metric: {metric}. Choose from 'cosine', 'euclidean', or 'dot'")

  # Cosine similarity is undefined for a zero vector; report no similarity
  len_a = np.linalg.norm(vec_a)
  len_b = np.linalg.norm(vec_b)
  if len_a == 0 or len_b == 0:
    return 0.0
  return np.dot(vec_a, vec_b) / (len_a * len_b)

# Analysis

In [None]:
"""
Question: Given clustered user representations, can we map users in the same cluster
          to the organization org chart?
Approach: 1) Perform clustering on user representations - matching topic/meta embeddings
          2) Compare the user_ids in these clusters to the actual org chart and measure the match percentage
"""

In [None]:
# Spectral clustering for user embeddings

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import SpectralClustering

# Get all user embeddings (rows flagged as users), copied so later edits
# do not touch merged_df
users_reps = merged_df[merged_df['TOPIC_is_user'] == True].copy()

# Separate each into meta/topic features (topic feature columns start with TOPIC, everything else is META)
# NOTE: id_columns is re-bound here to cover the identifier columns of both halves.
id_columns = ["META_is_user", "META_object_id", "TOPIC_is_user", "TOPIC_object_id"]
topic_columns = [col for col in users_reps.columns if col.startswith('TOPIC_') and col not in id_columns]
meta_columns = [col for col in users_reps.columns if not col.startswith('TOPIC_') and col not in id_columns]

# Process topic features (already numerical)
topic_features = users_reps[topic_columns].copy()

# Meta features are taken as-is: no categorical encoding is applied in this
# cell, so these columns are presumably already numeric — TODO confirm.
meta_features = users_reps[meta_columns].copy()

def compute_similarity_matrix(topic_features, meta_features, alpha=0.5):
    """
    Late-fuse topic and meta similarity into one user-by-user matrix.

    Each feature set is standardized per column, turned into a pairwise
    cosine-similarity matrix, and the two matrices are blended linearly.

    alpha: weight for topic similarity (1-alpha will be weight for meta similarity)
    """
    def _standardized_cosine(features):
        # Standardize first so no single column dominates the cosine
        return cosine_similarity(StandardScaler().fit_transform(features))

    topic_sim = _standardized_cosine(topic_features)
    meta_sim = _standardized_cosine(meta_features)

    # Weighted combination of the two views
    return alpha * topic_sim + (1 - alpha) * meta_sim

# Fuse topic and meta similarity, weighting the topic view at 0.6
similarity_matrix = compute_similarity_matrix(topic_features, meta_features, alpha=0.6)

# Min-max rescale into [0, 1] so the matrix can serve as a precomputed affinity
sim_lo, sim_hi = similarity_matrix.min(), similarity_matrix.max()
similarity_matrix = (similarity_matrix - sim_lo) / (sim_hi - sim_lo)

# Spectral clustering directly on the affinity matrix
n_clusters = 10  # Adjust based on expected org chart structure
clustering = SpectralClustering(
    n_clusters=n_clusters,
    affinity='precomputed',
    random_state=42,
)
cluster_labels = clustering.fit_predict(similarity_matrix)

# Bucket user ids by their assigned cluster label
user_buckets = [[] for _ in range(max(cluster_labels) + 1)]
for user_id, bucket in zip(users_reps['TOPIC_object_id'], cluster_labels):
    user_buckets[bucket].append(user_id)

In [None]:
# === Elbow method for spectral clustering ===

import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import SpectralClustering
from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score
)
from sklearn.manifold import SpectralEmbedding

def evaluate_k_elbow(similarity_matrix, k_values, random_state=42, n_components=10):
    """
    Sweep the number of clusters and plot clustering-quality scores for each k.

    For every k, spectral clustering is run on the precomputed similarity
    matrix and three scores are recorded: silhouette (on a distance form of
    the matrix) plus Calinski-Harabasz and Davies-Bouldin (on a spectral
    embedding of the matrix, computed once and shared by all k).

    Args:
        similarity_matrix: Square array-like of pairwise similarities, used as
            a precomputed affinity.
        k_values: Iterable of cluster counts to try; may be a one-shot iterable.
        random_state: Seed for the spectral embedding and clustering.
        n_components: Dimensionality of the spectral embedding used for the
            feature-based scores.
    """
    similarity_matrix = np.array(similarity_matrix)
    # Materialize once: the values are used for clustering and again as the x-axis
    k_values = list(k_values)

    distance_matrix = (1 - similarity_matrix) / 2  # handle cosine-like sims safely
    distance_matrix = np.clip(distance_matrix, 0, None)
    # A precomputed distance matrix must have an exactly-zero diagonal;
    # float noise from rescaling would otherwise make silhouette_score raise.
    np.fill_diagonal(distance_matrix, 0)

    silhouette_scores = []
    calinski_scores = []
    davies_scores = []

    # Embedding for the metrics that need feature vectors
    embedding = SpectralEmbedding(n_components=n_components, affinity='precomputed', random_state=random_state)
    X_embed = embedding.fit_transform(similarity_matrix)

    for k in k_values:
        clustering = SpectralClustering(
            n_clusters=k,
            affinity='precomputed',
            random_state=random_state
        )
        labels = clustering.fit_predict(similarity_matrix)

        # --- Silhouette (higher is better)
        sil = silhouette_score(distance_matrix, labels, metric="precomputed")
        silhouette_scores.append(sil)

        # --- Calinski-Harabasz (higher is better)
        calinski_scores.append(calinski_harabasz_score(X_embed, labels))

        # --- Davies-Bouldin (lower is better)
        davies_scores.append(davies_bouldin_score(X_embed, labels))

        print(f"k={k:2d} | Silhouette={sil:.3f} | CH={calinski_scores[-1]:.3f} | DB={davies_scores[-1]:.3f}")

    # --- Plot results ---
    # Silhouette lives in [-1, 1] while Calinski-Harabasz is unbounded, so each
    # gets its own y-axis; on a shared axis the silhouette curve is flattened.
    fig, ax1 = plt.subplots(figsize=(8,5))
    sil_line, = ax1.plot(k_values, silhouette_scores, 'o-', label='Silhouette', color='C0')
    ax1.set_xlabel("Number of clusters (k)")
    ax1.set_ylabel("Silhouette", color='C0')
    ax1.grid(True)

    ax2 = ax1.twinx()
    ch_line, = ax2.plot(k_values, calinski_scores, 'o-', label='Calinski-Harabasz', color='C1')
    ax2.set_ylabel("Calinski-Harabasz", color='C1')

    ax1.legend(handles=[sil_line, ch_line], loc='best')
    ax1.set_title("Elbow Analysis for Spectral Clustering")
    plt.show()

    plt.figure(figsize=(8,5))
    plt.plot(k_values, davies_scores, 'o-', color='C2', label='Davies-Bouldin (lower is better)')
    plt.xlabel("Number of clusters (k)")
    plt.ylabel("Score")
    plt.legend()
    plt.grid(True)
    plt.show()

    print("✅ Done! Look for the elbow / peak in the plots.")

# Sweep k = 2..19 (the range end is exclusive) on the fused similarity matrix
k_values = range(2, 20)
evaluate_k_elbow(similarity_matrix, k_values)

In [None]:
# === Clustering Quality Visualization Toolkit ===

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    silhouette_score,
    silhouette_samples,
    calinski_harabasz_score,
    davies_bouldin_score
)
from sklearn.decomposition import PCA
from sklearn.manifold import SpectralEmbedding

def evaluate_clustering_from_similarity(similarity_matrix, labels, random_state=42):
    """
    Evaluate clustering quality given a precomputed similarity matrix.

    Prints silhouette, Calinski-Harabasz and Davies-Bouldin scores and shows
    three figures: cluster sizes, a per-cluster silhouette plot, and a 2D
    scatter of the spectral embedding (reduced with PCA).

    Args:
        similarity_matrix: Square array-like of pairwise similarities, used as
            a precomputed affinity.
        labels: Cluster label per row; any array-like, and the label values
            need not be contiguous or start at 0.
        random_state: Seed for the spectral embedding and PCA.

    Returns:
        None. Returns early (after a message) when fewer than two clusters
        are present.
    """
    similarity_matrix = np.array(similarity_matrix)
    # Array form is required for the boolean masks below (a list would compare
    # as a whole instead of element-wise)
    labels = np.asarray(labels)
    cluster_ids, counts = np.unique(labels, return_counts=True)
    n_clusters = len(cluster_ids)
    if n_clusters < 2:
        print("Need at least 2 clusters for evaluation.")
        return

    # --- Convert to distance matrix ---
    distance_matrix = 1 - similarity_matrix
    distance_matrix = np.clip(distance_matrix, 0, None)
    # Precomputed distances need an exactly-zero diagonal; remove float noise
    np.fill_diagonal(distance_matrix, 0)

    # --- Compute metrics ---
    print(f"Number of clusters: {n_clusters}")
    print("----- Cluster Metrics -----")
    try:
        sil = silhouette_score(distance_matrix, labels, metric="precomputed")
        print(f"Silhouette Score:        {sil:.3f}")
    except Exception as e:
        print(f"Silhouette Score:        (failed: {e})")

    # Calinski-Harabasz and Davies-Bouldin require feature-like space, so we embed
    embed = SpectralEmbedding(n_components=10, affinity='precomputed', random_state=random_state)
    X_embed = embed.fit_transform(similarity_matrix)

    print(f"Calinski-Harabasz Score: {calinski_harabasz_score(X_embed, labels):.3f}")
    print(f"Davies-Bouldin Score:    {davies_bouldin_score(X_embed, labels):.3f}")

    # --- Cluster size distribution ---
    plt.figure(figsize=(6,4))
    sns.barplot(x=cluster_ids, y=counts, palette="viridis")
    plt.title("Cluster Size Distribution")
    plt.xlabel("Cluster Label")
    plt.ylabel("Count")
    plt.show()

    # --- Silhouette plot ---
    try:
        sil_samples = silhouette_samples(distance_matrix, labels, metric="precomputed")
        y_lower = 10
        plt.figure(figsize=(8, 6))
        # Iterate the labels actually present rather than 0..n_clusters-1, so
        # non-contiguous label ids are neither skipped nor drawn as empty bands
        for cluster_id in cluster_ids:
            cluster_vals = np.sort(sil_samples[labels == cluster_id])
            size_i = cluster_vals.shape[0]
            y_upper = y_lower + size_i
            plt.fill_betweenx(np.arange(y_lower, y_upper), 0, cluster_vals, alpha=0.7)
            plt.text(-0.05, y_lower + 0.5 * size_i, str(cluster_id))
            y_lower = y_upper + 10  # gap between cluster bands
        plt.axvline(x=np.mean(sil_samples), color="red", linestyle="--")
        plt.title("Silhouette Plot per Cluster (from similarity matrix)")
        plt.xlabel("Silhouette Coefficient Values")
        plt.ylabel("Cluster")
        plt.show()
    except Exception as e:
        print(f"Silhouette plot skipped: {e}")

    # --- 2D Visualization (Spectral Embedding) ---
    # The 10-D spectral embedding is reduced to two components for plotting
    X_2d = PCA(n_components=2, random_state=random_state).fit_transform(X_embed)

    plt.figure(figsize=(7,6))
    sns.scatterplot(x=X_2d[:,0], y=X_2d[:,1], hue=labels, palette="tab10", s=40, alpha=0.8)
    plt.title("Cluster Visualization (2D Spectral Embedding)")
    plt.xlabel("Component 1")
    plt.ylabel("Component 2")
    plt.legend(title="Cluster", bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.show()

    print("✅ Visualization complete.")

# ---- Example usage ----
# Score and visualize the spectral-clustering labels against the fused similarity matrix
evaluate_clustering_from_similarity(similarity_matrix, cluster_labels)

In [None]:
"""
Question: Given user topic/meta embeddings and file topic/meta embeddings, how well
          does cosine similarity reflect actual file access patterns?
Approach: 1) Get cosine similarity of every user with every file
          2) Test with different thresholds: a user/file pair scoring above the threshold is labeled "accessed", one below it "not accessed"
          3) Get confusion matrix
"""

In [None]:
"""
Question: Given x months of unobserved data for an organization randomly mixed with fake data,
          can we predict which user/file accesses actually happened?
Approach: 1) Get snapshot of data as of x months ago, create file/user embeddings at that time
          2) For each event starting from x months ago, fetch the file/user embedding (real data)
          3) Create fake events (n:1? what should the ratio be? Needs to be researched) by mismatching users and file histories
          4) Using cosine similarity of user/file make an assessment, play around with thresholds, get the performance
"""