# Representing Users and Files
#### Design Document: https://docs.google.com/document/d/1F84Nj3IQ-f_36bmmsOTuOo9u65gfD1WU5LKdnw8ShcY/edit?tab=t.x97j5jy1kop1#heading=h.1vu1g9fe3ujo
#### Optimizations:
- Re-ranking topics
- More information in file embeddings

# Dependencies

In [None]:
! pip install pandas
! pip install openai
! pip install sentence-transformers
! pip install torch

# Data

In [None]:
# Load CSV files into pandas DataFrames
import pandas as pd
import os

csv_dir = "./csvs"

dfs = {}
for filename in os.listdir(csv_dir):
    path = csv_dir + "/" + filename
    try:
        name_no_ext = filename.split('.')[0]
        dfs[name_no_ext] = pd.read_csv(path)
    except Exception as e:
        print(f"Failed to read {path}: {e}")

In [None]:
# Print brief summary and show first few rows for each loaded dataframe
from IPython.display import display

for name, df in dfs.items():
    print(f"{name}: {df.shape}")
    display(df.head())

# User/File Embeddings

In [None]:
# Load embedding model
from sentence_transformers import SentenceTransformer

model_name = 'BAAI/bge-large-zh-v1.5'
model = SentenceTransformer(model_name)

In [None]:
# 0.5) Cache any embeddings we calculate.
import os
import pickle
from datetime import datetime
import glob

cache_dir = "./pickle/"

def _most_recent_file(pattern):
  # Return the most recently modified file in cache_dir matching the glob pattern, or None if none found
  matches = glob.glob(os.path.join(cache_dir, pattern))
  if not matches:
    return None
  return max(matches, key=os.path.getmtime)

def get_cache():
  audit_to_file_mapping_file_path = _most_recent_file("audit_to_file_mapping_*.pkl")
  topic_embeddings_file_path = _most_recent_file("topic_embeddings_bge-large-zh-v1.5_*.pkl")
  file_embeddings_file_path = _most_recent_file("file_embeddings_*.pkl")
  user_embeddings_file_path = _most_recent_file("user_embeddings_*.pkl")

  audit_to_file_mapping, embeddings_cache, files, users = {}, {}, {}, {}

  if audit_to_file_mapping_file_path and os.path.exists(audit_to_file_mapping_file_path):
    with open(audit_to_file_mapping_file_path, "rb") as p:
      audit_to_file_mapping = pickle.load(p)

  if topic_embeddings_file_path and os.path.exists(topic_embeddings_file_path):
    with open(topic_embeddings_file_path, "rb") as p:
      embeddings_cache = pickle.load(p)

  if file_embeddings_file_path and os.path.exists(file_embeddings_file_path):
    with open(file_embeddings_file_path, "rb") as p:
      files = pickle.load(p)

  if user_embeddings_file_path and os.path.exists(user_embeddings_file_path):
    with open(user_embeddings_file_path, "rb") as p:
      users = pickle.load(p)

  print(f"Audit to file mapping size: {len(audit_to_file_mapping)}")
  print(f"Topic embeddings cache size: {len(embeddings_cache) if embeddings_cache is not None else 0}")
  print(f"Files size: {len(files) if files is not None else 0}")
  print(f"Users size: {len(users) if users is not None else 0}")

  return audit_to_file_mapping, embeddings_cache, files, users


def save_to_cache(audit_to_file_mapping, embeddings_cache, files, users):
  # Ensure cache dir exists
  os.makedirs(cache_dir, exist_ok=True)
  NOW = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
  with open(os.path.join(cache_dir, f"topic_embeddings_bge-large-zh-v1.5_{now}.pkl"), "wb") as f:
    pickle.dump(embeddings_cache, f)
  with open(os.path.join(cache_dir, f"file_embeddings_{now}.pkl"), "wb") as f:
    pickle.dump(files, f)
  with open(os.path.join(cache_dir, f"user_embeddings_{now}.pkl"), "wb") as f:
    pickle.dump(users, f)
  with open(os.path.join(cache_dir, f"audit_to_file_mapping_{now}.pkl"), "wb") as f:
    pickle.dump(audit_to_file_mapping, f)

In [None]:
files[81523]

In [None]:
# 1) For each file, get its topics. Create dict of {file_hash_id: {labels: [{label_name: str, label_embedding: [int]}], embedding: int}]
# 1.1) Create topic embedding with embedding model, store in cache
# 1.2) Average those embeddings to get the file representation
import numpy as np
from tqdm import tqdm

def average_embeddings(embeddings):
  return np.mean(embeddings, axis=0)

def get_embedding_for_text(text: str):
  text = text.lower()

  if text in embeddings_cache:
    return embeddings_cache[text]
  
  embeddings_cache[text] = model.encode(text, normalize_embeddings=True) # normalize for cosine similarity
  return embeddings_cache[text]

def get_file_embeddings(limit=None):
  files = {}
  rl = dfs['resource_label']
  rl = rl[rl['name'] == "topic"]

  print("Number of resources to work through: ", len(rl))
  for row in tqdm(rl.iterrows()):

    # Break out early if we want
    if limit is not None:
      limit -= 1
      if limit <= 0:
        break

    resource_info = row[1]
    file_info = files.setdefault(resource_info['hash_id'], {
      "labels": [],
      "embedding": []
    })

    if (resource_info["name"] != "topic"):
      continue

    file_info["labels"].append({
      "id": resource_info["id"],
      "name": resource_info["value"]
    })

  print("Number of files to work through")
  for _, info in tqdm(files.items()):
    info['embedding'] = average_embeddings([get_embedding_for_text(label['name']) for label in info['labels']])
  return files

# Didn't load from cache
if len(files) == 0:
  files = get_file_embeddings()

In [None]:
# Define function to get valid audit ids

from functools import cache

@cache
def get_valid_audit_ids():
  rr, rrn, ra = dfs["resource_resource"], dfs["resource_resourcenode"], dfs["resource_auditrecord"]
  valid_resource_ids = set()
  print("Get valid resources")
  for row in tqdm(rr.iterrows(), total=len(rr)):
    row_info = row[1]
    id = row_info['id']

    if len(rr[rr['parent_id'] == id]) != 0:
      valid_resource_ids.add(id)

  valid_resource_node_ids = set()
  print("Get valid resource nodes")
  for row in tqdm(rrn.iterrows(), total=len(rrn)):
    row_info = row[1]
    if row_info['resource_id'] in valid_resource_ids:
      valid_resource_node_ids.add(row_info['id'])

  valid_audit_ids = set()
  ra.head()
  print("Get valid audit ids")
  for row in tqdm(ra.iterrows(), total=len(ra)):
    row_info = row[1]
    if row_info['audited_id'] in valid_resource_node_ids:
      valid_audit_ids.add(row_info['id'])
  return valid_audit_ids, valid_resource_node_ids, valid_resource_ids

In [None]:
# 2) For each user, time-weight the relevant file-embeddings. Create dict of {user_id: embedding}
import math
from tqdm import tqdm

def get_stream(audit_id, file_id, streams, timestamp, operation):
    # File has been changed in some way, we want to find the first STREAM that is after the audit log timestamp
    should_check_after = operation in ['MODIFIED', 'FILE_UPLOADED', 'RENAMED']
    if len(streams) == 0:
      raise ValueError(f"No streams for audit id: {audit_id}, file id: {file_id} ")
    
    # 1 stream, always pick it
    if len(streams) == 1:
      stream = streams.iloc[0]
    else:
      streams = streams.sort_values(by="timestamp")
      if should_check_after:
        filtered_streams = streams[streams['timestamp'] >= timestamp]
        if len(filtered_streams) == 0:
          # Fallback to filtering opposite way
          filtered_streams = streams[streams['timestamp'] <= timestamp]
          stream = filtered_streams.iloc[-1]
        else:
          stream = filtered_streams.iloc[0]
      else:
        filtered_streams = streams[streams['timestamp'] <= timestamp]
        if len(filtered_streams) == 0:
          # Fallback to filtering opposite way
          filtered_streams = streams[streams['timestamp'] >= timestamp]
          stream = filtered_streams.iloc[0]
        else:
          stream = filtered_streams.iloc[-1]
    return stream

def get_file_hash_as_of_audit(audit_id, operation, timestamp):
  rrn, rr = dfs['resource_resourcenode'], dfs['resource_resource']
  resource_node = rrn[rrn['id'] == audit_id]
  resource_id = resource_node['resource_id']

  # Assume only one resource id associated with audit id?
  if len(resource_id) != 1:
      raise ValueError(f"Found more than one resource with ID: {resource_id}. Audit id: {audit_id}")
  resource_id = resource_id.iloc[0]

  # Grab the resource reference by the audit log event
  resource = rr[rr['id'] == resource_id]
  if len(resource) != 1:
      raise ValueError(f"Resource length greater than 1: {resource}. Audit id: {audit_id}")

  # Work our way down to the relevant STREAM, or in other words actual data, relevant for this audit log event
  file_or_stream = resource.iloc[0]

  if file_or_stream['resource_type'] != "STREAM":
      # Must be a file
      assert file_or_stream['resource_type'] == "FILE"
      file = file_or_stream
      streams = rr[rr['parent_id'] == file['id']]
      stream = get_stream(audit_id, file['id'], streams, timestamp, operation)
      if stream is None:
          # Found no stream for file id
          raise ValueError(f"Found no stream for audit id: {audit_id}, file_id: {file['id']}")
      hash_id = stream['hash_id']
  else:
      # If stream, directly get hash id
      hash_id = file_or_stream['hash_id']

  return hash_id

# Cache mapping from audit log record to file
audit_id_to_file_hash = {}

def aggregate_file_embeddings_per_user(valid_audit_ids=None):
  limit = 10000000000
  users = {}
  ar = dfs['resource_auditrecord']

  # Only consider audit ids that point to actual files when dealing with topic embeddings for user
  if not valid_audit_ids:
    valid_audit_ids = get_valid_audit_ids()

  ar = ar[ar['id'].isin(valid_audit_ids)]
  
  bad_resource_ids = set()
  good_resource_ids = set()

  print("Processing audit log records:")
  for row in tqdm(ar.iterrows(), total=len(ar)):
    
    row_info = row[1]
    if row_info['audited_id'] in bad_resource_ids:
      continue
    
    if len(bad_resource_ids) > limit:
      break
    
    user_info = users.setdefault(row_info['user_id'], {
      "file_infos": []
    })
    time_of_operation = row_info["timestamp"]
    try:
      # Get most relevant file version as of audit time
      hash_id = get_file_hash_as_of_audit(row_info['audited_id'], row_info['operation'], time_of_operation)
    except Exception as e:
      # print(f"Exception caught: {e}")
      bad_resource_ids.add(row_info['audited_id'])
      continue

    if hash_id not in files:
      # print(f"Didn't find {hash_id} in file, audit id: {row_info['audited_id']}, must have failed topic extraction")
      continue

    # Cache audit id to file mapping for later
    audit_id_to_file_hash[row_info['id']] = hash_id

    good_resource_ids.add(row_info['audited_id'])

    # If successfully got hash, lookup in files table and add to user info
    file_info = files[hash_id]
    file_embedding = file_info["embedding"]
    user_info['file_infos'].append({
      "timestamp": datetime.fromisoformat(time_of_operation),
      "embedding": file_embedding
    })

  # print(f"Success: {len(good_resource_ids)}, Error: {len(bad_resource_ids)}")
  return users

valid_audit_ids, _, _ = get_valid_audit_ids()
users = aggregate_file_embeddings_per_user(valid_audit_ids)

In [None]:
len(ra['id'].unique())

In [None]:
from datetime import datetime

def get_user_embedding(uid, user, tau=60*60*24*30): 
  '''
  1) Default tau to 1 month, in other words, we lose 37% of old information if 
     a month has passed since last indexing. Exponential decay.
  2) Unit of time is seconds
  '''
  assert len(user['file_infos']) > 0, f"User has no files {uid}"

  user_embedding = np.array(user['file_infos'][0]['embedding'])
  embedding_time = user['file_infos'][0]['timestamp']

  # Build up the index recursively
  for file_info in user['file_infos'][1:]:
    # Get difference from last time and update time variable
    d_t = datetime.fromisoformat(file_info['timestamp']) - datetime.fromisoformat(embedding_time)
    d_t = d_t.total_seconds()
    embedding_time = file_info['timestamp']

    assert d_t >= 0, f"User file access times must be ordered: from - {embedding_time}, to - {file_info['timestamp']}"

    # Apply the weighting
    # d_t and tau should both be in seconds
    alpha = 1 - math.exp(-d_t / tau)
    user_embedding = alpha * np.array(file_info['embedding']) + (1 - alpha) * user_embedding

  return user_embedding

def get_user_embeddings(users):
  # 2.1) Time weight the file embeddings
  print("Calculating user embeddings: ")
  for user, user_info in tqdm(users.items()):
    user_info['file_infos'] = sorted(user_info['file_infos'], key=lambda event: event['timestamp'])
    try:
      user_info["embedding"] = get_user_embedding(user, user_info)
    except Exception as e:
      print(f"Caught exception with error: {e}")
  return users

users = get_user_embeddings(users)

In [None]:
save_to_cache(audit_id_to_file_hash, embeddings_cache, files, users)

In [None]:
get_file_hash_as_of_audit(2444061, "FILE_DOWNLOADED", '2025-09-27 23:13:29.745+00')
# users

In [None]:
from tqdm import tqdm

ra = dfs['resource_auditrecord']
rrn = dfs['resource_resourcenode']
rr = dfs['resource_resource']
rl = dfs['resource_label']
# rrn[rrn['id'] == 2444061]
# rr[rr['id'] == 814033]
# rr[rr['parent_id'] == 814033]
# rl[rl['hash_id'] == 78048]
# ra.head()
# ra[ra['audited_id'] == 2444061.0]
# len(rr['id'].unique())
# rr[rr['resource_type'] == "FILE"]
# rrn = dfs['resource_resourcenode']
# len(rrn['resource_id'].unique())

# Get valid audit record ids preemptively
ra.head()


# Meta Embeddings

In [None]:
audit_to_file_mapping, embeddings_cache, files, users  = get_cache()

In [342]:
UNIQUE_OPS = ra['operation'].unique()

In [None]:
from datetime import datetime, timezone
NOW = datetime.now(timezone.utc)

In [None]:
# USER: Extract meta features from audit log
from tqdm import tqdm
from datetime import datetime, timezone, timedelta
import math
from collections import Counter
import numpy as np

def get_interarrivals(file_infos):
  interarrivals = []
  if len(file_infos) <= 1:
    return []
  start_time = file_infos[0]["timestamp"]
  for fi in file_infos[1:]:
    next_time = fi['timestamp']
    diff = (next_time - start_time).total_seconds()
    assert diff >= 0, f"not ordered: start_time: {start_time}, next_time: {next_time}"
    interarrivals.append(diff)
    start_time = next_time
  return interarrivals

def get_sit(interarrivals, mean):
  deviations = [math.pow(i - mean, 2) for i in interarrivals]
  if len(deviations) == 0:
    return -1
  return math.sqrt(sum(deviations) / len(deviations))

def add_meta_to_user_info(user_info):
    # Sort file operations by time if not sorted already
    user_info['file_infos'] = sorted(user_info['file_infos'], key=lambda file_info: file_info['timestamp'])

    metadata = user_info.setdefault("metadata", {})
    # mean interarrival time
    interarrivals: list = get_interarrivals(user_info['file_infos'])
    metadata['mit'] = sum(interarrivals) / len(interarrivals) if len(interarrivals) > 0 else -1
    # std interarrival time
    metadata['sit'] = get_sit(interarrivals, metadata['mit'])
    # recency
    # Should be ordered by time
    if len(user_info['file_infos']) == 0:
      metadata['recency'] = -1
    else:
      metadata['recency'] = (NOW - user_info['file_infos'][-1]['timestamp']).total_seconds()
    # user distribution entropy
    metadata['ud_entropy'] = 0
    # unique users
    metadata['unique_users'] = 1
    # top users
    metadata['top_users'] = 1  

def get_user_info_from_auditlog(users, ra):
  for row in tqdm(ra.iterrows(), total=len(ra)):
    row_info = row[1]
    uid = row_info['user_id']
    user_info = {
      "file_infos": [],
      "embedding": []
    }
    if uid in users:
      user_info: dict = users[uid]
    
    # Create metadata object for user if doesn't exist
    metadata = user_info.setdefault("metadata", {})
    
    # Add location information
    locs = metadata.setdefault('locs', list())
    locs.append(f'{row_info['geolocation']}-{row_info['client_ip']}')

    # Add timestamps per operations
    operation = row_info['operation']
    times = metadata.setdefault('times', {})
    operation_times = times.setdefault(operation, [])
    operation_times.append(datetime.fromisoformat(row_info['timestamp']))

def get_fft_info(buckets):
  # Calculate top 3 frequencies/amplitudes/phases of time series using FFT
  bucket_arr = np.array(buckets)
  fft_result = np.fft.fft(bucket_arr)
  fft_freqs = np.fft.fftfreq(len(bucket_arr), d=1)
  amplitudes = np.abs(fft_result)
  phases = np.angle(fft_result)
  # Ignore the zero frequency (DC component)
  indices = np.argsort(amplitudes[1:])[::-1][:3] + 1 if len(amplitudes) > 1 else []
  return [(fft_freqs[i], amplitudes[i], phases[i]) for rank, i in enumerate(indices)]

def get_time_based_meta(start, end, interval, meta, operation, time_str):
  # Sort datetimes so it's O(n) operation to create buckets
  times = sorted(meta['times'][operation])
  buckets = []
  current = start
  idx = 0
  n = len(times)
  while current < end:
      next_bucket = current + interval
      count = 0
      # Count how many times fall into [current, next_bucket)
      while idx < n and times[idx] < next_bucket:
          if times[idx] >= current:
              count += 1
          idx += 1
      buckets.append(count)
      current = next_bucket
  
  for i, bucket in enumerate(buckets):
    meta['times'][f'{operation}_{time_str}_{i}'] = bucket
  fft_infos = get_fft_info(buckets)
  for i, info in enumerate(fft_infos):
    meta['times'][f'{operation}_{time_str}_freq_{i}'] = info[0]
    meta['times'][f'{operation}_{time_str}_amp_{i}'] = info[1]
    meta['times'][f'{operation}_{time_str}_phase_{i}'] = info[2]

def get_burstiness(interarrivals, meta):
  # TODO
  pass


def get_meta_features_users(users, num_top_locs=3):
  ra = dfs['resource_auditrecord']
  # ra = ra.head(10000)

  for user, user_info in users.items():
    if user_info.get("metadata"):
      del user_info['metadata']

  print("Iterate Audit Log")
  get_user_info_from_auditlog(users, ra)

  print("Update Users Metadata")
  for user, user_info in tqdm(users.items(), total=len(users)):
    add_meta_to_user_info(user_info)

    meta = user_info['metadata']
    if not meta.get('locs'):
      print(f"No locs for user {user}")
      continue

    # unique locations
    meta['unique_locs'] = len(set(meta['locs']))

    # top locations
    counter = Counter(meta['locs'])
    top_locs = counter.most_common(num_top_locs)
    for i, loc in enumerate(top_locs):
      meta[f'top_loc_{i}'] = loc[0]
      meta[f'top_loc_{i}_per'] = loc[1] / len(meta['locs'])

    # time meta
    for OP in UNIQUE_OPS:
      if not meta.get("times"):
        break
      times = meta["times"]
      if not times.get(OP):
        times[OP] = []
      
      # time buckets and FFT
      get_time_based_meta(NOW - timedelta(days=7), NOW, timedelta(hours=1), meta, OP, 'hour')
      get_time_based_meta(NOW - timedelta(weeks=4), NOW, timedelta(days=1), meta, OP, 'day')
      get_time_based_meta(NOW - timedelta(weeks=52), NOW, timedelta(weeks=4), meta, OP, 'month')

    get_burstiness(get_interarrivals(user_info['file_infos']), meta)

    # Clean up metadata we don't need
    del meta['locs']
    for OP in UNIQUE_OPS:
      del meta['times'][OP]

  return users

users = get_meta_features_users(users) 

In [343]:
ra.head()

Unnamed: 0,id,seqnum,timestamp,params,name,record_id,operation,geolocation,client_ip,user_id,user_agent,batch_id,auditor_id,audited_id,application_name
0,16225255,3,2025-09-28 01:29:31.77+00,,QBR - Q4'2025- Cloud Team Template - Farmer Ga...,6863310073061781342,FILE_DOWNLOADED,US-VA,172.190.156.130,gabib@terasky.com,,,813555,2443907.0,
1,16227582,3,2025-09-27 23:13:29.745+00,,TeraSky Baltic Rebilling - Customer List,2798679710274239636,FILE_DOWNLOADED,US-VA,172.190.156.130,gediminas@terasky.com,,,813555,2444061.0,
2,16229912,3,2025-09-27 21:31:40.133+00,,VCF Migration Plan for Existing vSphere Enviro...,2454216044330707803,FILE_DOWNLOADED,US-VA,172.190.156.130,mandeep@terasky.com,,,813555,2444217.0,
3,16230034,3,2025-09-27 21:31:10.928+00,,VCF Migration Plan for Existing vSphere Enviro...,-4490931525388484148,FILE_DOWNLOADED,US-VA,172.190.156.130,mandeep@terasky.com,,,813555,2444217.0,
4,16222754,3,2025-09-28 04:19:24.837+00,,Vinted License Count,8789196147665239826,READ,,,google-workspace-api@vinted-it.iam.gserviceacc...,,,813555,2443585.0,


In [None]:
# FILE: Extract meta features from audit log

def get_interarrivals(all_ops):
  interarrivals = []
  if len(all_ops) <= 1:
    return []
  start_time = all_ops[0]
  for next_time in all_ops[1:]:
    diff = (next_time - start_time).total_seconds()
    assert diff >= 0, f"not ordered: start_time: {start_time}, next_time: {next_time}"
    interarrivals.append(diff)
    start_time = next_time
  return interarrivals

def get_sit(interarrivals, mean):
  deviations = [math.pow(i - mean, 2) for i in interarrivals]
  if len(deviations) == 0:
    return -1
  return math.sqrt(sum(deviations) / len(deviations))

def get_file_info_from_auditlog(files, ra):
  for row in tqdm(ra.iterrows(), total=len(ra)):
    row_info = row[1]

    file_hash = audit_to_file_mapping[row_info['audited_id']]

    if file_hash not in files:
      print(f"Didn't find file hash in files... {file_hash}")
      continue
    file_info = files[file_hash]
    
    # Create metadata object for user if doesn't exist
    metadata = file_info.setdefault("metadata", {})
    
    # Add location information
    locs = metadata.setdefault('locs', list())
    locs.append(f'{row_info['geolocation']}-{row_info['client_ip']}')

    # Add timestamps per operations
    operation = row_info['operation']
    times = metadata.setdefault('times', {})

    # TODO: Include user information in operation
    all_operations = times.setdefault('all_ops', [])
    operation_times = times.setdefault(operation, [])
    timestamp = datetime.fromisoformat(row_info['timestamp'])
    # Per operation
    operation_times.append(timestamp)
    # All operations
    all_operations.append(timestamp)
  
  for file, file_info in files.items():
    metadata = file_info['metadata']
    # Ensure times are sorted...
    file_info['times']['all_ops'] = sorted(file_info['times']['all_ops'])
    all_ops = file_info['times']['all_ops']
    interarrivals = get_interarrivals(file_info['times']['all_ops'])
    # mean interarrival time
    metadata['mit'] = sum(interarrivals) / len(interarrivals) if len(interarrivals) > 0 else -1
    # std interarrival time
    metadata['sit'] = get_sit(interarrivals, metadata['mit'])
    # recency
    metadata['recency'] = (NOW - all_ops[-1]).total_seconds()
    # user distribution entropy
    metadata['ud_entropy'] = 0
    # unique users
    metadata['unique_users'] = 1
    # top users
    metadata['top_users'] = 1

def get_meta_features_files(files, num_top_locs=3):
  ra = dfs['resource_auditrecord']
  # ra = ra.head(10000)

  UNIQUE_OPS = ra['operation'].unique()

  for file, file_info in files.items():
    if file_info.get("metadata"):
      del file_info['metadata']

  print("Iterate Audit Log")
  get_file_info_from_auditlog(files, ra)

  print("Update Files Metadata")
  for file, file_info in tqdm(files.items(), total=len(files)):
    if not file_info.get('metadata'):
      print(f"No metadata for user {file}")
      continue
    meta = file_info['metadata']
    if not meta.get('locs'):
      print(f"No locs for user {file}")
      continue

    # unique locations
    meta['unique_locs'] = len(set(meta['locs']))

    # top locations
    counter = Counter(meta['locs'])
    top_locs = counter.most_common(num_top_locs)
    for i, loc in enumerate(top_locs):
      meta[f'top_loc_{i}'] = loc[0]
      meta[f'top_loc_{i}_per'] = loc[1] / len(meta['locs'])

    # time meta
    for OP in UNIQUE_OPS:
      if not meta.get("times"):
        break
      times = meta["times"]
      if not times.get(OP):
        times[OP] = []
      
      # time buckets and FFT
      get_time_based_meta(NOW - timedelta(days=7), NOW, timedelta(hours=1), meta, OP, 'hour')
      get_time_based_meta(NOW - timedelta(weeks=4), NOW, timedelta(days=1), meta, OP, 'day')
      get_time_based_meta(NOW - timedelta(weeks=52), NOW, timedelta(weeks=4), meta, OP, 'month')

    get_burstiness(get_interarrivals(file_info['file_infos']), meta)

    # Clean up metadata we don't need
    del meta['locs']
    for OP in UNIQUE_OPS:
      del meta['times'][OP]

  return files

files = get_meta_features_files(files)

Iterate Audit Log


  0%|          | 0/1109643 [00:00<?, ?it/s]


KeyError: 2443907.0

In [None]:
users[list(users.keys())[1]]['metadata']

# Semantic Similarity

# Training

# Quality Control

# Anomaly Detection