In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
import json
import numpy as np
import glob
from ast import literal_eval
import plotly.express as px
import matplotlib.pyplot as plt
from datetime import datetime

In [2]:
# Find all chunk files (assuming they follow a consistent naming pattern)
file_paths = glob.glob('../datasets/2025_csv/_chunk_*.csv')
# Read each file (limiting rows if necessary) and concatenate
dfs = [pd.read_csv(fp, nrows=10000) for fp in file_paths]
df = pd.concat(dfs, ignore_index=True)

# Printing the original columns
print("Dataset columns:")
print(df.columns)
# print(df.head)

Dataset columns:
Index(['$insert_id', 'amplitude_id', 'app', 'city', 'client_event_time',
       'client_upload_time', 'country', 'data', 'data_type', 'device_family',
       'device_id', 'device_type', 'dma', 'event_id', 'event_properties',
       'event_time', 'event_type', 'language', 'library', 'os_name',
       'os_version', 'platform', 'processed_time', 'region',
       'server_received_time', 'server_upload_time', 'session_id', 'user_id',
       'user_properties', 'uuid'],
      dtype='object')


In [3]:
# Code checks which columns may have nested JSON data
print("Columns with potential JSON data:")
for col in df.columns:
    if df[col].dtype == 'object':
        try:
            sample = df[col].dropna().iloc[0]
            if sample.startswith('{') or sample.startswith('['):
                print(f"- {col} (Possible JSON)")
        except Exception:
            pass

Columns with potential JSON data:
- data (Possible JSON)
- event_properties (Possible JSON)
- user_properties (Possible JSON)


In [4]:
# Ensure each value is a dictionary
# Determine the union of all keys
# Create new columns for each key
# Drop the original nested JSON columns
def ensure_dict(x):
    if isinstance(x, dict):
        return x
    try:
        # String representation of a dict turned into dictionary object
        return literal_eval(x)
    except Exception:
        # If parsing fails, return an empty dict
        return {}

In [5]:
# List of nested columns to process
nested_columns = ['data', 'event_properties', 'user_properties']

# Process each nested column
# ONLY RUN THIS ONCE!
for col in nested_columns:
    # Ensure each value in the column is a dictionary
    df[col] = df[col].apply(ensure_dict)
    
    # Determine the union of all keys in the column
    all_keys = set()
    df[col].dropna().apply(lambda d: all_keys.update(d.keys()) if isinstance(d, dict) else None)
    
    # For each key, create a new column only if it has at least one valid (non-empty) value
    for key in all_keys:
        new_col_name = f"{col}_{key}"
        # Create a temporary Series for this key
        series = df[col].apply(lambda d: d.get(key, None) if isinstance(d, dict) else None)
        # Check if there's at least one valid value (not None, {}, [], or 'EMPTY')
        if series.apply(lambda v: v not in (None, {}, [], 'EMPTY')).any():
            df[new_col_name] = series
    
    # After processing, drop the original nested JSON column.
    df.drop(columns=[col], inplace=True)

# Convert specified timestamp columns to datetime objects
time_cols = ['client_event_time', 'event_time', 'server_received_time']
for col in time_cols:
    df[col] = pd.to_datetime(df[col], errors='coerce')

In [6]:
# Printing the columns (new additional ones added from flattening nested columns)
print("Dataset columns:")
print(df.columns)
print("# of columns:", len(df.columns))

Dataset columns:
Index(['$insert_id', 'amplitude_id', 'app', 'city', 'client_event_time',
       'client_upload_time', 'country', 'data_type', 'device_family',
       'device_id', 'device_type', 'dma', 'event_id', 'event_time',
       'event_type', 'language', 'library', 'os_name', 'os_version',
       'platform', 'processed_time', 'region', 'server_received_time',
       'server_upload_time', 'session_id', 'user_id', 'uuid',
       'data_user_properties_updated', 'data_path', 'event_properties_formId',
       'event_properties_status',
       'event_properties_[Amplitude] Session Replay ID',
       'event_properties_line-of-business', 'event_properties_type',
       'event_properties_version', 'event_properties_accountId',
       'event_properties_hasAssignees', 'event_properties_policyId',
       'event_properties_id', 'event_properties_messageId',
       'event_properties_displayName', 'event_properties_variant',
       'event_properties_attachmentId', 'event_properties_templateName

In [7]:
# Single example looks like:
# Display entire column content without truncation
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
print(df.iloc[101].to_string())

$insert_id                                                     63573c4a-c77c-4cdd-91ce-1c074c33d224
amplitude_id                                                                          1090135225326
app                                                                                          591532
city                                                                                            NaN
client_event_time                                                        2025-01-13 20:45:10.077000
client_upload_time                                                       2025-01-13 20:45:10.818000
country                                                                               United States
data_type                                                                                     event
device_family                                                                               Windows
device_id                                                      76c54070-44c1-461f-854c-8c3082635877


In [8]:
# What does atleast a single valid entry look like?
# Dictionary to store a valid value for each column
valid_entries = {}
# Iterate over each column in the DataFrame
for col in df.columns:
    # Iterate over each row value for this column using .items()
    for idx, value in df[col].items():
        # Check if the value is not empty (None, empty dict, empty list, or 'EMPTY')
        if value not in (None, {}, [], 'EMPTY'):
            # Once a valid value is found, record it and its type, then break out of the loop for this column
            valid_entries[col] = (value, type(value))
            break

# Print the results
for col, (value, dtype) in valid_entries.items():
    print(f"Column: {col}\n  Value: {value}\n  Type: {dtype}\n")
print(len(valid_entries))

Column: $insert_id
  Value: 74413f76-5071-4c44-bd3b-5ded989dec39
  Type: <class 'str'>

Column: amplitude_id
  Value: 1090135225326
  Type: <class 'int'>

Column: app
  Value: 591532
  Type: <class 'int'>

Column: city
  Value: nan
  Type: <class 'float'>

Column: client_event_time
  Value: 2025-01-13 20:32:46.571000
  Type: <class 'pandas._libs.tslibs.timestamps.Timestamp'>

Column: client_upload_time
  Value: 2025-01-13 20:32:47.664000
  Type: <class 'str'>

Column: country
  Value: United States
  Type: <class 'str'>

Column: data_type
  Value: event
  Type: <class 'str'>

Column: device_family
  Value: Windows
  Type: <class 'str'>

Column: device_id
  Value: 76c54070-44c1-461f-854c-8c3082635877
  Type: <class 'str'>

Column: device_type
  Value: Windows
  Type: <class 'str'>

Column: dma
  Value: nan
  Type: <class 'float'>

Column: event_id
  Value: 6023
  Type: <class 'int'>

Column: event_time
  Value: 2025-01-13 20:32:46.571000
  Type: <class 'pandas._libs.tslibs.timestamps.Ti

In [9]:
# Check that we dont have columns with zero valid pieces of data
# First, determine the columns with valid data (as already stored in valid_entries)
valid_columns = set(valid_entries.keys())
# Get all DataFrame columns
all_columns = set(df.columns)
# Determine columns that did not yield any valid entry
invalid_columns = all_columns - valid_columns
print("Total number of columns in df:", len(df.columns))
print("Number of columns with at least one valid entry:", len(valid_columns))
print("Number of columns with NO valid data:", len(invalid_columns))
print("Columns with no valid data:")
for col in invalid_columns:
    print("  -", col)


Total number of columns in df: 67
Number of columns with at least one valid entry: 67
Number of columns with NO valid data: 0
Columns with no valid data:


In [10]:
# Convert event_time to datetime and sort
df['event_time'] = pd.to_datetime(df['event_time'])
df.sort_values(by=['user_id', 'event_time'], inplace=True)
# Group by user_id
user_groups = df.groupby('user_id')

In [11]:
# Calculate maximum number of timesteps over all users
max_timesteps = user_groups.size().max()
print("Max timesteps in any user group:", max_timesteps)

Max timesteps in any user group: 4900


In [12]:
features = [
    "city", "client_event_time", "client_upload_time", "country", "data_type",
    "device_family", "device_type", "language", "event_type", "os_name", "os_version",
    "platform", "region", "event_properties_type", "event_properties_line-of-business",
    "event_properties_accountId", "event_properties_status", "event_properties_lineOfBusiness",
    "event_properties_action", "event_properties_error", "event_properties_templateName",
    "event_properties_hasAssignees", "event_properties_hasAccounts", "event_properties_displayName",
    "event_properties_variant", "event_properties_tableId", "event_properties_menu",
    "event_properties_filename", "event_properties_attachmentId", "user_properties_isInternalUser",
    "user_properties_roles", "user_properties_referring_domain", "user_properties_businessUnit",
    "user_properties_hostname"
]

In [13]:
# Each mapping: value (as string) -> learnable tensor of shape (128,)
embedding_dim = 128
feature_embeddings = {}
default_embeddings = {}

In [14]:
# For each feature, extract unique values (convert lists to tuple for hashability)
for feat in features:
    if feat == 'user_properties_roles':
        # Convert lists to tuple so that they're hashable
        unique_vals = (
            df[feat]
            .dropna()
            .apply(lambda x: tuple(x) if isinstance(x, list) else x)
            .unique()
            .tolist()
        )
    else:
        unique_vals = df[feat].dropna().astype(str).unique().tolist()
    
    mapping = {}
    # For each unique value, assign a learnable tensor (initialized randomly)
    for val in unique_vals:
        # If the value is not already a string, convert it to a string for consistency.
        key = str(val)
        mapping[key] = torch.randn(embedding_dim, requires_grad=True)
    feature_embeddings[feat] = mapping

    # Create a default embedding for missing or unseen values
    default_embeddings[feat] = torch.randn(embedding_dim, requires_grad=True)


In [15]:
feature_embeddings['city']['Tomball'].shape

torch.Size([128])

In [16]:
default_embeddings['city'].shape

torch.Size([128])

In [17]:
# We will need an integer mapping for the event_properties_action values to be used as labels.
action_feat = "event_properties_action"
action_vals = df[action_feat].dropna().astype(str).unique().tolist()
action_to_idx = {val: idx for idx, val in enumerate(action_vals)}
default_action_idx = -1

In [18]:
str(action_to_idx)

"{'task': 0, 'accountTask': 1, 'urgentNote': 2, 'underwriterReferral': 3, 'generateQuoteDocument': 4, 'modelRequest': 5, 'endorsementRequest': 6, 'updateSubmissionStatus': 7, 'quoteRequest': 8, 'referral': 9, 'addNote': 10, 'markAsQuoted': 11, 'bindRequest': 12, 'markAsQuotedLost': 13, 'calculateCommercialCreditTier': 14, 'finalUnderwriting': 15, 'accountReview': 16, 'verifyIssuance': 17, 'referralNote': 18, 'quoteAction': 19, 'recalculateFmcsaModel': 20, 'rmsEarthquakeModel': 21, 'generateForms': 22, 'generatePolicyDocument': 23, 'initialSubmissionResponse': 24, 'processDriverAndVehicleList': 25, 'cancelRequest': 26, 'processLossRun': 27, 'bookingPrep': 28, 'renewPolicy': 29, 'cancelPolicyAction': 30, 'endorsePolicy': 31, 'generateBinderDocument': 32, 'quotePolicy': 33, 'renewalTask': 34, 'markAsErroneous': 35}"

In [19]:
# For each event (timestep), we will build a vector by concatenating the embedding for each feature.
# The resulting vector length = len(features) * embedding_dim

# key: user_id, value: list of event vectors (torch tensors)
concatenated_sequences = {}

# Over all users
for user_id, group in user_groups:
    # Sort group by event_time ascending
    group_sorted = group.sort_values(by='event_time')
    
    event_vectors = []
    # Iterate over all events for this user
    for _, row in group_sorted.iterrows():
        
        # All event vectors stored in a list for now
        event_vecs = []

        # Over every feature
        for feat in features:
            # Get the corresponding value of that feature
            val = row[feat]

            # Two cases to handle -> list entry or NULL
            # If the value is a list or np.ndarray, handle it accordingly
            if isinstance(val, (list, np.ndarray)):
                if len(val) == 0:
                    emb = default_embeddings[feat]
                else:
                    # Convert list to tuple (or a string representation) for consistent lookup
                    val_str = str(tuple(val))
                    emb = feature_embeddings[feat].get(val_str, default_embeddings[feat])

            else:
                # For non-array values, check for null normally.
                if pd.isnull(val):
                    emb = default_embeddings[feat]
                else:
                    val_str = str(val)
                    emb = feature_embeddings[feat].get(val_str, default_embeddings[feat])

            # Append to evnet list
            event_vecs.append(emb)

        # Concatenate all feature embeddings into one vector (dim: len(features)*embedding_dim)
        # Likely automatically converts from list of lists to a vector
        event_vector = torch.cat(event_vecs, dim=0)
        event_vectors.append(event_vector)
    
    # Save the sequence for the user if there is at least one event (corresponding to the user_id)
    if len(event_vectors) > 0:
        concatenated_sequences[user_id] = event_vectors

In [20]:
print(len(concatenated_sequences['00edf9a8-949a-415d-8ed2-ae04b8f4d326']))

76


In [21]:
concatenated_sequences['00edf9a8-949a-415d-8ed2-ae04b8f4d326'][0].shape

torch.Size([4352])

In [22]:
# Set window size (number of timesteps) for each example
time_steps = 128
X_examples = []  # list of tensors of shape (time_steps, hidden_size)
y_examples = []  # list of labels (integer indices from action mapping)
max_examples = 200  # Limit to 10 examples for speed/demonstration

# For each user sequence, slide a window of size 128
for user_id, events in concatenated_sequences.items():
    
    # Ensure that the sequence is long enough to form at least one example
    if len(events) <= time_steps:
        continue

    # Get the sorted rows for this user (we assume these are already sorted by event_time)
    user_df = df[df['user_id'] == user_id].sort_values(by='event_time')
    
    # For each possible sliding window
    for i in range(len(events) - time_steps):
        # Input window: events i to i+time_steps-1
        window = events[i:i + time_steps]
        
        # The label is the event_properties_action of the event at position (i+time_steps)
        target_row = user_df.iloc[i + time_steps]
        
        # Check if event_properties_action exists; if not, skip this example
        if pd.isnull(target_row[action_feat]):
            continue
        
        target_val = str(target_row[action_feat])
        if target_val not in action_to_idx:
            continue
        
        label_idx = action_to_idx[target_val]
        
        # Concatenate the window into a tensor of shape (time_steps, hidden_size)
        example_tensor = torch.stack(window, dim=0)
        X_examples.append(example_tensor)
        y_examples.append(label_idx)
        
        # Stop once we've reached the maximum desired examples
        if len(X_examples) >= max_examples:
            break
    if len(X_examples) >= max_examples:
        break

print("Generated examples:", len(X_examples))

Generated examples: 200


In [23]:
# Convert lists to tensors
if len(X_examples) == 0:
    raise ValueError("No valid examples found!")
X_all = torch.stack(X_examples, dim=0)  # shape: (num_examples, time_steps, hidden_size)
y_all = torch.tensor(y_examples, dtype=torch.long)  # shape: (num_examples,)

print("Total examples:", X_all.shape[0])
print("Example shape (time_steps, hidden_size):", X_all.shape[1], X_all.shape[2])

num_examples = X_all.shape[0]
indices = np.arange(num_examples)
np.random.shuffle(indices)

train_end = int(0.8 * num_examples)
test_end = int(0.9 * num_examples)

train_idx = indices[:train_end]
test_idx = indices[train_end:test_end]
val_idx = indices[test_end:]

X_train = X_all[train_idx]
y_train = y_all[train_idx]
X_test = X_all[test_idx]
y_test = y_all[test_idx]
X_val = X_all[val_idx]
y_val = y_all[val_idx]

print("Train examples:", X_train.shape[0])
print("Test examples:", X_test.shape[0])
print("Validation examples:", X_val.shape[0])

Total examples: 200
Example shape (time_steps, hidden_size): 128 4352
Train examples: 160
Test examples: 20
Validation examples: 20
Number of training batches: 5
Shape of one training batch (example): torch.Size([32, 128, 4352])
Shape of one label batch (example): torch.Size([32])


In [24]:
batch_size = 32
def create_batches(X, y, batch_size):
    num_batches = int(np.ceil(X.shape[0] / batch_size))
    X_batches = []
    y_batches = []
    for i in range(num_batches):
        start = i * batch_size
        end = start + batch_size
        X_batches.append(X[start:end])
        y_batches.append(y[start:end])
    return X_batches, y_batches

X_train_batches, y_train_batches = create_batches(X_train, y_train, batch_size)
X_test_batches, y_test_batches = create_batches(X_test, y_test, batch_size)
X_val_batches, y_val_batches = create_batches(X_val, y_val, batch_size)

print("Number of training batches:", len(X_train_batches))
print("Shape of one training batch (example):", X_train_batches[0].shape)  # (batch_size, time_steps, hidden_size)
print("Shape of one label batch (example):", y_train_batches[0].shape)     # (batch_size,)
# - X_train_batches: list of training tensors of shape (batch, 128, num_features*128)
# - y_train_batches: corresponding labels for next event_properties_action (as indices)

Number of training batches: 5
Shape of one training batch (example): torch.Size([32, 128, 4352])
Shape of one label batch (example): torch.Size([32])
