In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import pandas as pd
import json
import numpy as np
import glob
from ast import literal_eval
import plotly.express as px
import matplotlib.pyplot as plt
from datetime import datetime

In [2]:
# Find all chunk files (assuming they follow a consistent naming pattern)
file_paths = glob.glob('../datasets/2025_csv/_chunk_*.csv')
# Read each file (limiting rows if necessary) and concatenate
dfs = [pd.read_csv(fp, nrows=10000) for fp in file_paths]
df = pd.concat(dfs, ignore_index=True)

# Printing the original columns
print("Dataset columns:")
print(df.columns)
print(df.head)

Dataset columns:
Index(['$insert_id', 'amplitude_id', 'app', 'city', 'client_event_time',
       'client_upload_time', 'country', 'data', 'data_type', 'device_family',
       'device_id', 'device_type', 'dma', 'event_id', 'event_properties',
       'event_time', 'event_type', 'language', 'library', 'os_name',
       'os_version', 'platform', 'processed_time', 'region',
       'server_received_time', 'server_upload_time', 'session_id', 'user_id',
       'user_properties', 'uuid'],
      dtype='object')
<bound method NDFrame.head of                                   $insert_id  amplitude_id     app  \
0       538ab3f8-09df-4750-90e0-abe8ad8f6a0c  857540493886  591532   
1       2d9442b9-a8fe-442a-9761-f63fe0171ac5  857540493886  591532   
2       b39d7ac2-76af-48a6-a367-9e0050359961  857540493886  591532   
3       57a434dd-d570-4ba2-a77c-de2a4f6be9cf  857540493886  591532   
4       fa8af5ee-f45f-4d1a-b1e6-5ea7d293a628  857540493886  591532   
...                                      ..

In [3]:
# Code checks which columns may have nested JSON data
print("Columns with potential JSON data:")
for col in df.columns:
    if df[col].dtype == 'object':
        try:
            sample = df[col].dropna().iloc[0]
            if sample.startswith('{') or sample.startswith('['):
                print(f"- {col} (Possible JSON)")
        except Exception:
            pass

Columns with potential JSON data:
- data (Possible JSON)
- event_properties (Possible JSON)
- user_properties (Possible JSON)


In [4]:
# Ensure each value is a dictionary
# Determine the union of all keys
# Create new columns for each key
# Drop the original nested JSON columns
def ensure_dict(x):
    if isinstance(x, dict):
        return x
    try:
        # String representation of a dict turned into dictionary object
        return literal_eval(x)
    except Exception:
        # If parsing fails, return an empty dict
        return {}

In [5]:
# List of nested columns to process
nested_columns = ['data', 'event_properties', 'user_properties']

# Process each nested column
# ONLY RUN THIS ONCE!
for col in nested_columns:
    # Ensure each value in the column is a dictionary
    df[col] = df[col].apply(ensure_dict)
    
    # Determine the union of all keys in the column
    all_keys = set()
    df[col].dropna().apply(lambda d: all_keys.update(d.keys()) if isinstance(d, dict) else None)
    
    # For each key, create a new column only if it has at least one valid (non-empty) value
    for key in all_keys:
        new_col_name = f"{col}_{key}"
        # Create a temporary Series for this key
        series = df[col].apply(lambda d: d.get(key, None) if isinstance(d, dict) else None)
        # Check if there's at least one valid value (not None, {}, [], or 'EMPTY')
        if series.apply(lambda v: v not in (None, {}, [], 'EMPTY')).any():
            df[new_col_name] = series
    
    # After processing, drop the original nested JSON column.
    df.drop(columns=[col], inplace=True)

# Convert specified timestamp columns to datetime objects
time_cols = ['client_event_time', 'event_time', 'server_received_time']
for col in time_cols:
    df[col] = pd.to_datetime(df[col], errors='coerce')

In [6]:
# Printing the columns (new additional ones added from flattening nested columns)
print("Dataset columns:")
print(df.columns)
print("# of columns:", len(df.columns))

Dataset columns:
Index(['$insert_id', 'amplitude_id', 'app', 'city', 'client_event_time',
       'client_upload_time', 'country', 'data_type', 'device_family',
       'device_id', 'device_type', 'dma', 'event_id', 'event_time',
       'event_type', 'language', 'library', 'os_name', 'os_version',
       'platform', 'processed_time', 'region', 'server_received_time',
       'server_upload_time', 'session_id', 'user_id', 'uuid', 'data_path',
       'data_user_properties_updated', 'event_properties_formId',
       'event_properties_[Amplitude] Session Replay ID',
       'event_properties_policy-id', 'event_properties_messageId',
       'event_properties_internalEmailId', 'event_properties_displayName',
       'event_properties_quoteName', 'event_properties_version',
       'event_properties_policyId', 'event_properties_hasAccounts',
       'event_properties_id', 'event_properties_account-id',
       'event_properties_templateName', 'event_properties_filename',
       'event_properties_line

In [7]:
# Single example looks like:
# Display entire column content without truncation
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
print(df.iloc[140].to_string())

$insert_id                                        c1d05ff4-d1cb-48ee-b4a4-d15c42906116
amplitude_id                                                              857540771367
app                                                                             591532
city                                                                           Detroit
client_event_time                                           2025-01-16 13:06:26.842000
client_upload_time                                          2025-01-16 13:06:27.495000
country                                                                  United States
data_type                                                                        event
device_family                                                                  Windows
device_id                                         7448ddbc-a304-4f4c-9c1e-0e7b0adf93ff
device_type                                                                    Windows
dma                                        

In [8]:
# What does atleast a single valid entry look like?
# Dictionary to store a valid value for each column
valid_entries = {}
# Iterate over each column in the DataFrame
for col in df.columns:
    # Iterate over each row value for this column using .items()
    for idx, value in df[col].items():
        # Check if the value is not empty (None, empty dict, empty list, or 'EMPTY')
        if value not in (None, {}, [], 'EMPTY'):
            # Once a valid value is found, record it and its type, then break out of the loop for this column
            valid_entries[col] = (value, type(value))
            break

# Print the results
for col, (value, dtype) in valid_entries.items():
    print(f"Column: {col}\n  Value: {value}\n  Type: {dtype}\n")
print(len(valid_entries))

Column: $insert_id
  Value: 538ab3f8-09df-4750-90e0-abe8ad8f6a0c
  Type: <class 'str'>

Column: amplitude_id
  Value: 857540493886
  Type: <class 'int'>

Column: app
  Value: 591532
  Type: <class 'int'>

Column: city
  Value: Mumbai
  Type: <class 'str'>

Column: client_event_time
  Value: 2025-01-16 13:28:02.427000
  Type: <class 'pandas._libs.tslibs.timestamps.Timestamp'>

Column: client_upload_time
  Value: 2025-01-16 13:28:04.133000
  Type: <class 'str'>

Column: country
  Value: India
  Type: <class 'str'>

Column: data_type
  Value: event
  Type: <class 'str'>

Column: device_family
  Value: Windows
  Type: <class 'str'>

Column: device_id
  Value: a798db5f-d66a-468c-8e51-4f1412853e4d
  Type: <class 'str'>

Column: device_type
  Value: Windows
  Type: <class 'str'>

Column: dma
  Value: nan
  Type: <class 'float'>

Column: event_id
  Value: 6156
  Type: <class 'int'>

Column: event_time
  Value: 2025-01-16 13:28:02.427000
  Type: <class 'pandas._libs.tslibs.timestamps.Timestamp'

In [10]:
# Check that we dont have columns with zero valid pieces of data
# First, determine the columns with valid data (as already stored in valid_entries)
valid_columns = set(valid_entries.keys())
# Get all DataFrame columns
all_columns = set(df.columns)
# Determine columns that did not yield any valid entry
invalid_columns = all_columns - valid_columns
print("Total number of columns in df:", len(df.columns))
print("Number of columns with at least one valid entry:", len(valid_columns))
print("Number of columns with NO valid data:", len(invalid_columns))
print("Columns with no valid data:")
for col in invalid_columns:
    print("  -", col)


Total number of columns in df: 67
Number of columns with at least one valid entry: 67
Number of columns with NO valid data: 0
Columns with no valid data:


### Prediction Modelling

In [11]:
# Convert event_time to datetime and sort
df['event_time'] = pd.to_datetime(df['event_time'])
df.sort_values(by=['session_id', 'event_time'], inplace=True)  # Changed to session_id

In [12]:
# Group by session_id instead of user_id
session_groups = df.groupby('session_id')

In [13]:
features = [
    "city", "client_event_time", "client_upload_time", "country", "data_type",
    "device_family", "device_type", "language", "event_type", "os_name", "os_version",
    "platform", "region", "event_properties_type", "event_properties_line-of-business",
    "event_properties_accountId", "event_properties_status", "event_properties_lineOfBusiness",
    "event_properties_action", "event_properties_error", "event_properties_templateName",
    "event_properties_hasAssignees", "event_properties_hasAccounts", "event_properties_displayName",
    "event_properties_variant", "event_properties_tableId", "event_properties_menu",
    "event_properties_filename", "event_properties_attachmentId", "user_properties_isInternalUser",
    "user_properties_roles", "user_properties_referring_domain", "user_properties_businessUnit",
    "user_properties_hostname"
]
print(len(features))

34


In [14]:
# Create index mappings for each feature (modified for list handling)
feature_to_idx = {}

for feat in features:
    if feat == 'user_properties_roles':
        # Handle list values by taking first role
        unique_vals = (
            df[feat]
            .dropna()
            .apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else np.nan)
            .dropna()
            .astype(str)
            .unique()
            .tolist()
        )
    else:
        unique_vals = df[feat].dropna().astype(str).unique().tolist()
    
    # Create mapping with default index 0
    idx_map = {'__default__': 0}
    current_idx = 1
    for val in unique_vals:
        key = str(val)
        if key not in idx_map:
            idx_map[key] = current_idx
            current_idx += 1
    feature_to_idx[feat] = idx_map

In [16]:
# Process data into indices (with error handling)
concatenated_sequences = {}

for session_id, group in session_groups:  # Now using session_id
    group_sorted = group.sort_values(by='event_time')
    event_indices = []
    
    for _, row in group_sorted.iterrows():
        indices = []
        for feat in features:
            val = row[feat]
            
            # Handle null/empty values
            if isinstance(val, (list, np.ndarray)):
                
                # Handle empty lists/arrays
                if len(val) == 0:
                    indices.append(0)
                    continue
                
                # Special handling for user_properties_roles
                if feat == 'user_properties_roles':
                    # Take first role if available
                    val = val[0] if len(val) > 0 else np.nan
                else:
                    # Convert list to string representation
                    val = str(val)
            
            elif pd.isnull(val):
                indices.append(0)
                continue
            else:
                val = str(val)
            
            # Get index from mapping
            idx = feature_to_idx[feat].get(val, 0)
            indices.append(idx)
        
        event_indices.append(torch.tensor(indices, dtype=torch.long))
    
    if event_indices:
        concatenated_sequences[session_id] = event_indices
print(len(concatenated_sequences))

8931


In [17]:
# Set the window size and maximum examples
time_steps = 8
max_examples = 133713371337

# Create action to index mapping
action_feat = "event_type"
action_vals = df[action_feat].dropna().astype(str).unique().tolist()
action_to_idx = {val: idx for idx, val in enumerate(action_vals)}

# Initialize lists to store the examples and labels
X_examples = []
y_examples = []

too_short_count = 0
invalid_label_count = 0
total_sessions = len(concatenated_sequences)

for session_id, events in concatenated_sequences.items():
    if len(events) <= time_steps:
        too_short_count += 1
        continue  # Skip too short sessions
    
    session_df = df[df['session_id'] == session_id].sort_values(by='event_time')
    
    for i in range(len(events) - time_steps):
        window = events[i:i + time_steps]
        example = torch.stack(window)  # shape: (time_steps, hidden_size)
        
        target_row = session_df.iloc[i + time_steps]
        if pd.isnull(target_row[action_feat]):
            invalid_label_count += 1
            continue  # Skip missing labels
        
        target_val = str(target_row[action_feat])
        if target_val not in action_to_idx:
            invalid_label_count += 1
            continue  # Skip unknown labels
        
        label_idx = action_to_idx[target_val]
        
        X_examples.append(example)
        y_examples.append(label_idx)
        
        if len(X_examples) >= max_examples:
            break
    if len(X_examples) >= max_examples:
        break

# Print debugging info
print(f"Total sessions: {total_sessions}")
print(f"Too short sessions (≤{time_steps}): {too_short_count}")
print(f"Invalid or missing labels: {invalid_label_count}")
print(f"Valid examples collected: {len(X_examples)}")

Total sessions: 8931
Too short sessions (≤8): 5373
Invalid or missing labels: 0
Valid examples collected: 151527


In [18]:
str(action_vals)

"['session_end', 'application-window-opened', 'session_start', 'agency-dashboard::layout:render', 'agency-dashboard:::view', 'agency-dashboard::widget:render', 'agency-dashboard::configurable-table:render', '::nav-header:user-signed-out', 'dashboard:my-book:configurable-table:render', 'dashboard:my-book:widget:render', 'triaged-submission-list:my-book:configurable-table:render', 'triaged-submission-list:my-book::view', 'dashboard:my-book:layout:render', 'dashboard:my-book::view', '::nav-header:action-center-click', 'action-center:::view', 'account:::view', 'account-lines:::view', 'account-lines::layout:render', 'account-lines::widget:render', 'account-lines::configurable-table:render', ':all-accounts:configurable-table:render', ':all-accounts:widget:render', ':all-accounts:layout:render', ':all-accounts::view', 'submissions:policy-definition::submit-click', 'submissions:all-policy:configurable-table:render', 'submissions:all-policy::view', 'submissions:triaged_submissions-definition::v

In [49]:
len(action_vals)

168

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_all = torch.stack(X_examples) # (num_examples, time_steps, 34)
y_all = torch.tensor(y_examples, dtype=torch.long)

In [20]:
print(X_all.shape)
print(y_all.shape)

torch.Size([151527, 8, 34])
torch.Size([151527])


In [21]:
# Train/val/test split
num_examples = len(X_examples)
indices = torch.randperm(num_examples)

train_idx = indices[:int(0.8*num_examples)]
val_idx = indices[int(0.8*num_examples):int(0.9*num_examples)]
test_idx = indices[int(0.9*num_examples):]

X_train, y_train = X_all[train_idx], y_all[train_idx]
X_val, y_val = X_all[val_idx], y_all[val_idx]
X_test, y_test = X_all[test_idx], y_all[test_idx]

print(f"Training set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")
print(f"Test set: {X_test.shape}")

Training set: torch.Size([121221, 8, 34])
Validation set: torch.Size([15153, 8, 34])
Test set: torch.Size([15153, 8, 34])


In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class LSTM(nn.Module):
    def __init__(self, feature_sizes, embedding_dim=64, hidden_size=128, dropout=0.5):
        super().__init__()
        # Create an embedding for each feature
        self.embeddings = nn.ModuleList([
            nn.Embedding(num_embeddings=size, embedding_dim=embedding_dim)
            for size in feature_sizes
        ])
        
        # Dropout module for regularization
        self.dropout = nn.Dropout(dropout)
        
        # First LSTM layer: input is concatenated embeddings
        self.lstm1 = nn.LSTM(
            input_size=embedding_dim * len(feature_sizes),
            hidden_size=hidden_size,
            batch_first=True
        )
        # A linear projection to match the dimensions for the first residual connection
        self.residual_proj1 = nn.Linear(embedding_dim * len(feature_sizes), hidden_size)
        
        # Second LSTM layer: input and output are both hidden_size
        self.lstm2 = nn.LSTM(
            input_size=hidden_size,
            hidden_size=hidden_size,
            batch_first=True
        )
        
        # Activation function
        self.activation = nn.ReLU()
        
        # Final fully-connected layer to produce logits over actions
        self.fc = nn.Linear(hidden_size, len(action_to_idx))
        
    def forward(self, x):
        # x shape: (batch_size, time_steps, num_features)
        batch_size, seq_len, num_feats = x.size()
        
        # Process each feature through its embedding
        embedded = []
        for i in range(num_feats):
            emb = self.embeddings[i](x[:, :, i])  # (batch_size, seq_len, embedding_dim)
            embedded.append(emb)
        x_emb = torch.cat(embedded, dim=-1)  # (batch_size, seq_len, embedding_dim*num_feats)
        x_emb = self.dropout(x_emb)
        
        # First LSTM layer
        out1, _ = self.lstm1(x_emb)  # (batch_size, seq_len, hidden_size)
        # Residual: project input embeddings and add to LSTM output
        res1 = self.residual_proj1(x_emb)  # (batch_size, seq_len, hidden_size)
        out1 = self.activation(out1 + res1)
        out1 = self.dropout(out1)
        
        # Second LSTM layer
        out2, _ = self.lstm2(out1)  # (batch_size, seq_len, hidden_size)
        # Residual: add the output of the first LSTM layer (out1) to the output of the second
        out2 = self.activation(out2 + out1)
        out2 = self.dropout(out2)
        
        # Use the output of the final time step for prediction
        logits = self.fc(out2[:, -1, :])  # (batch_size, num_actions)
        return logits


In [23]:
# Initialize model
feature_sizes = [len(feature_to_idx[feat]) for feat in features]
model = LSTM(feature_sizes).to(device)

In [24]:
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params / 1e6:.2f}M")

Total parameters: 16.13M


In [25]:
# Create dataloaders
batch_size = 128
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataset = TensorDataset(X_val, y_val, )
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [26]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
num_epochs = 10

In [27]:
# Training loop
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item() * inputs.size(0)
    
    # Validation
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * inputs.size(0)
            
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Train Loss: {train_loss/len(train_loader.dataset):.4f}")
    print(f"Val Loss: {val_loss/len(val_loader.dataset):.4f}")
    print(f"Val Acc: {100*correct/total:.2f}%")
    print("-------------------")

Epoch 1/10
Train Loss: 2.0229
Val Loss: 1.3255
Val Acc: 60.52%
-------------------
Epoch 2/10
Train Loss: 1.4323
Val Loss: 1.0468
Val Acc: 69.48%
-------------------
Epoch 3/10
Train Loss: 1.2462
Val Loss: 0.9221
Val Acc: 73.54%
-------------------
Epoch 4/10
Train Loss: 1.1455
Val Loss: 0.8475
Val Acc: 75.64%
-------------------
Epoch 5/10
Train Loss: 1.0809
Val Loss: 0.8063
Val Acc: 77.14%
-------------------
Epoch 6/10
Train Loss: 1.0367
Val Loss: 0.7720
Val Acc: 77.74%
-------------------
Epoch 7/10
Train Loss: 1.0031
Val Loss: 0.7355
Val Acc: 78.77%
-------------------
Epoch 8/10
Train Loss: 0.9717
Val Loss: 0.7257
Val Acc: 78.97%
-------------------
Epoch 9/10
Train Loss: 0.9496
Val Loss: 0.7074
Val Acc: 79.56%
-------------------
Epoch 10/10
Train Loss: 0.9311
Val Loss: 0.6999
Val Acc: 79.86%
-------------------


In [28]:
# Test evaluation
model.eval()
test_loss = 0.0
correct = 0
total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        test_loss += loss.item() * inputs.size(0)
        
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        
print(f"Test Loss: {test_loss/len(test_loader.dataset):.4f}")
print(f"Test Acc: {100*correct/total:.2f}%")

Test Loss: 0.7038
Test Acc: 79.34%


In [31]:
type(model.state_dict())

collections.OrderedDict

In [33]:
type(feature_to_idx)

dict

In [34]:
type(action_to_idx)

dict

In [35]:
type([len(v) for v in feature_to_idx.values()])

list

In [38]:
type(features)

list

In [41]:
# Save model, mappings, and metadata
import torch

save_path = "model.pth"
torch.save({
    'model_state_dict': model.state_dict(),
    'feature_to_idx': feature_to_idx,
    'action_to_idx': action_to_idx,
    'feature_sizes': [len(v) for v in feature_to_idx.values()],
    'features_order': features 
}, save_path)

In [45]:
import pickle

In [46]:
# Save the dictionary to a file using pickle
with open('action_to_idx.pkl', 'wb') as f:
    pickle.dump(action_to_idx, f)

In [48]:
event_actions

NameError: name 'event_actions' is not defined