In [1]:
import json
from ast import literal_eval
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Read the raw event export (adjust the path to your checkout).
# For a quick experiment, pass nrows=5000 to read_csv to load only a sample.
df = pd.read_csv('../datasets/2025_csv/_chunk_0_100000.csv')

# Show which columns were loaded and the overall (rows, columns) shape.
print("Dataset columns:", df.columns, df.shape, sep="\n")

Dataset columns:
Index(['$insert_id', 'amplitude_id', 'app', 'city', 'client_event_time',
       'client_upload_time', 'country', 'data', 'data_type', 'device_family',
       'device_id', 'device_type', 'dma', 'event_id', 'event_properties',
       'event_time', 'event_type', 'language', 'library', 'os_name',
       'os_version', 'platform', 'processed_time', 'region',
       'server_received_time', 'server_upload_time', 'session_id', 'user_id',
       'user_properties', 'uuid'],
      dtype='object')
(100000, 30)


In [None]:
# Goals for the parsing step below:
# - Every nested JSON column is fully parsed.
# - All relevant fields are extracted and flattened.
# - No data is lost or incorrectly parsed along the way.

In [3]:
# Parse nested JSON columns
json_cols = ['data', 'event_properties', 'user_properties']


def _parse_nested(value):
    """Decode one raw cell of a nested column into a Python object.

    - A dict is returned untouched, so re-running this cell is a no-op
      instead of a str() -> literal_eval round trip.
    - A missing value (NaN/None) becomes an empty dict.
    - Anything else is stringified and parsed as a Python literal first
      (the behaviour this notebook has always relied on); if that fails the
      text is parsed as JSON, which additionally accepts true/false/null.
    - A payload that decodes to None is treated as "no properties" ({}), so
      the .get() calls below keep working.

    Raises:
        json.JSONDecodeError: if the text is neither a Python literal nor JSON.
    """
    if isinstance(value, dict):
        return value
    if pd.isna(value):
        return {}
    text = str(value)
    try:
        parsed = literal_eval(text)
    except (ValueError, SyntaxError):
        # literal_eval rejects JSON-only tokens such as true/false/null.
        parsed = json.loads(text)
    return {} if parsed is None else parsed


for col in json_cols:
    df[col] = df[col].map(_parse_nested)

# Flatten nested columns
df['event_slug'] = df['event_properties'].map(lambda props: props.get('slug', 'unknown'))
df['user_roles'] = df['user_properties'].map(lambda props: props.get('roles', []))
df['utm_source'] = df['user_properties'].map(lambda props: props.get('initial_utm_source', 'EMPTY'))

# Convert timestamps to a parsable time value; unparseable entries become NaT
time_cols = ['client_event_time', 'event_time', 'server_received_time']
for col in time_cols:
    df[col] = pd.to_datetime(df[col], errors='coerce')

In [4]:
# List the columns again, now including the flattened fields added above.
print("Dataset columns:", df.columns, sep="\n")

Dataset columns:
Index(['$insert_id', 'amplitude_id', 'app', 'city', 'client_event_time',
       'client_upload_time', 'country', 'data', 'data_type', 'device_family',
       'device_id', 'device_type', 'dma', 'event_id', 'event_properties',
       'event_time', 'event_type', 'language', 'library', 'os_name',
       'os_version', 'platform', 'processed_time', 'region',
       'server_received_time', 'server_upload_time', 'session_id', 'user_id',
       'user_properties', 'uuid', 'event_slug', 'user_roles', 'utm_source'],
      dtype='object')


# EVENT TYPES
- Which event types are the most popular?
- After splitting on the colon (`:`) delimiter to get "event type levels", can we find relationships between the levels?
- Make a Sankey diagram

In [5]:
# Event-type metadata, before and after splitting on the ':' delimiter
print("Before splitting by : delimiter")
event_types = df['event_type'].unique()
num_event_types = df['event_type'].nunique()
print(f"Number of unique event types: {num_event_types}")

print("\nAfter splitting by : delimiter")

# Drop any leading colons first (so they do not produce an empty first part),
# then split on runs of one or more colons into one column per part.
split_event_types = (
    df['event_type']
    .str.lstrip(':')
    .str.split(r':+', expand=True)
)

# Report how many parts the split produced
n_levels = split_event_types.shape[1]
print(f"Split event_types into {n_levels} parts")

# Name the parts event_type_lvl_0, event_type_lvl_1, ... and mark every
# empty or missing part with the string placeholder 'None'.
split_event_types.columns = ['event_type_lvl_' + str(level) for level in range(n_levels)]
split_event_types = split_event_types.replace('', 'None').fillna('None')


Before splitting by : delimiter
Number of unique event types: 174

After splitting by : delimiter
Split event_types into 4 parts


In [6]:
# Aside: do the four event-type levels use disjoint vocabularies, or does the
# same label show up at more than one level?
level_names = ['lvl_0', 'lvl_1', 'lvl_2', 'lvl_3']

# Distinct labels seen at each level
distinct_by_level = {
    name: set(split_event_types[f'event_type_{name}'].unique())
    for name in level_names
}

# Labels shared by every unordered pair of levels, keyed by "lvl_a & lvl_b"
intersections = {
    f"{first} & {second}": distinct_by_level[first] & distinct_by_level[second]
    for position, first in enumerate(level_names)
    for second in level_names[position + 1:]
}

# The levels are mutually exclusive only if every pairwise intersection is empty
mutually_exclusive = all(not shared for shared in intersections.values())

# Report the verdict, listing each non-empty overlap
if not mutually_exclusive:
    print("The columns are NOT mutually exclusive (there is overlap of distinct values).")
    for pair, shared in intersections.items():
        if not shared:
            continue
        print(f"Intersection between {pair}: {shared}")
else:
    print("The columns are mutually exclusive (no overlap of distinct values).")


The columns are NOT mutually exclusive (there is overlap of distinct values).
Intersection between lvl_0 & lvl_1: {'configurable-table', 'widget', 'layout', 'duplicate-policy-modal'}
Intersection between lvl_0 & lvl_2: {'configurable-table', 'widget', 'layout'}
Intersection between lvl_1 & lvl_2: {'view', 'configurable-table', 'widget', 'submit-click', 'close-click', 'duplicate-rating', 'None', 'layout', 'render'}
Intersection between lvl_1 & lvl_3: {'None', 'submit-click', 'render'}
Intersection between lvl_2 & lvl_3: {'submit-click', 'None', 'save-click', 'delete-click', 'action-click', 'render'}


In [7]:
# Treemap of the event-type hierarchy, nesting level 0 -> level 3
hierarchy_path = [f'event_type_lvl_{level}' for level in range(4)]
device_fig = px.treemap(
    split_event_types,
    path=hierarchy_path,
    title='Event Types Hierarchy (NOT A GOOD REPRESENTATION)',
)
# Grey out the root tile so it does not compete with the real categories
device_fig.update_traces(root_color="lightgrey")
device_fig.show()

In [8]:
# Sankey diagram of the event-type hierarchy.
# Nodes: one per distinct label (a label that occurs at several levels is a
# single shared node). Links: one per (parent label, child label) pair of
# adjacent levels, weighted by the number of rows containing that pair.
NONE_LABEL = 'None'  # placeholder used for missing/empty levels

# Level columns in hierarchy order. They are read from the frame so that any
# number of split parts works, and the *_idx helper columns added below are
# ignored when this cell is re-run.
level_cols = [
    col for col in split_event_types.columns
    if str(col).startswith('event_type_lvl_') and not str(col).endswith('_idx')
]

# Create a list of unique nodes from the event type levels
nodes = list(pd.concat([split_event_types[col] for col in level_cols]).unique())

# label -> node position; a dict lookup replaces a linear list search per row
node_index = {label: position for position, label in enumerate(nodes)}
NONE_IDX = node_index.get(NONE_LABEL)  # None when no placeholder is present

# Map the event type levels to node indices
for col in level_cols:
    split_event_types[f'{col}_idx'] = split_event_types[col].map(node_index)

# Count the occurrences of every parent -> child pair between adjacent levels.
# Pairs touching the placeholder are dropped: they mark the end of a shorter
# event type, not a real transition.
links = []
for parent_col, child_col in zip(level_cols, level_cols[1:]):
    pairs = split_event_types[[parent_col, child_col]]
    real_pairs = pairs[(pairs[parent_col] != NONE_LABEL) & (pairs[child_col] != NONE_LABEL)]
    pair_counts = real_pairs.groupby([parent_col, child_col], sort=False).size()
    links.extend(
        {'source': node_index[parent], 'target': node_index[child], 'value': int(n_rows)}
        for (parent, child), n_rows in pair_counts.items()
    )

# Create the Sankey diagram
fig = go.Figure(go.Sankey(
    node=dict(
        pad=10,
        thickness=20,
        line=dict(color="black", width=1),
        label=nodes,
        x=[0.9 for _ in nodes],
    ),
    link=dict(
        source=[link['source'] for link in links],
        target=[link['target'] for link in links],
        value=[link['value'] for link in links]
    )
))

fig.update_layout(title_text="Event Types Hierarchy (APPROPRIATE REPRESENTATION)", font_size=12, height=800)
fig.show()

# EVENT PROPERTIES

In [9]:
# Normalize the event_properties JSON data into separate columns, marking
# every missing property with the string placeholder 'None'
split_event_properties = pd.json_normalize(df['event_properties']).fillna('None')

# Slug-style rendering of displayName (spaces -> hyphens, lower-cased).
# NOTE(review): not used by any other cell shown here; presumably kept for
# comparing against 'slug' - confirm before removing.
split_event_properties['test'] = split_event_properties['displayName'].str.replace(' ', '-').str.lower()

slugs = split_event_properties['slug'].unique()
displayNames = split_event_properties['displayName'].unique()
print("slugs:", len(slugs), "displayNames:", len(displayNames))

# Combine event types and properties into one dataframe (rows are matched on
# the index of the two frames)
df_event = pd.concat([split_event_types, split_event_properties], axis=1)
# .copy() makes df_event an independent frame, so adding columns to it later
# does not raise SettingWithCopyWarning.
df_event = df_event[
    ['displayName', 'slug', 'event_type_lvl_0', 'event_type_lvl_1', 'event_type_lvl_2', 'event_type_lvl_3']
].copy()

df_event.head(5)

slugs: 268 displayNames: 286


Unnamed: 0,displayName,slug,event_type_lvl_0,event_type_lvl_1,event_type_lvl_2,event_type_lvl_3
0,,,session_end,,,
1,,,application-window-opened,,,
2,,,dashboard,my-book,view,
3,,my-book,dashboard,my-book,layout,render
4,Actions,actions,dashboard,my-book,widget,render


In [10]:
# Determines how thick a link has to be to show up in the Sankey diagram (set to 0 to show all data)
LINK_COUNT_CUTOFF = 100

NONE_LABEL = 'None'                 # placeholder used for missing values
DISPLAYNAME_PREFIX = 'DISPLAYNAME_'  # keeps displayName labels apart from event-type labels

# Prefixed displayName column: the first (left-most) stage of the diagram
df_event['/displayName'] = DISPLAYNAME_PREFIX + df_event['displayName'].astype(str)

# Event-type level columns in hierarchy order. They are read from the frame so
# that any number of levels works, and the *_idx helper columns added below
# are ignored when this cell is re-run.
level_cols = [
    col for col in df_event.columns
    if str(col).startswith('event_type_lvl_') and not str(col).endswith('_idx')
]
stage_cols = ['/displayName'] + level_cols

# Create a list of unique nodes from the displayName and event type levels
nodes = list(pd.concat([df_event[col] for col in stage_cols]).unique())

# label -> node position; a dict lookup replaces a linear list search per row
node_index = {label: position for position, label in enumerate(nodes)}
NONE_IDX = node_index.get(NONE_LABEL)  # None when no placeholder is present

# Labels that stand for "no value". The prefixed form is listed as well:
# a missing displayName becomes 'DISPLAYNAME_None', which a comparison against
# 'None' alone would never match, letting it through as a real node.
placeholder_labels = [NONE_LABEL, DISPLAYNAME_PREFIX + NONE_LABEL]

print(f"nodes_prop {len(df_event['/displayName'].unique())}")
for col in level_cols:
    print(f"nodes_{col.replace('event_type_', '', 1)} {len(df_event[col].unique())}")

# Map the displayName and event type levels to node indices
df_event['prop_idx'] = df_event['/displayName'].map(node_index)
for col in level_cols:
    df_event[f'{col}_idx'] = df_event[col].map(node_index)

# Count the occurrences of every source -> target pair between adjacent
# stages, skipping placeholder endpoints and links at or below the cutoff.
links = []
for source_col, target_col in zip(stage_cols, stage_cols[1:]):
    pairs = df_event[[source_col, target_col]]
    real_pairs = pairs[
        ~pairs[source_col].isin(placeholder_labels) & ~pairs[target_col].isin(placeholder_labels)
    ]
    pair_counts = real_pairs.groupby([source_col, target_col], sort=False).size()
    pair_counts = pair_counts[pair_counts > LINK_COUNT_CUTOFF]
    links.extend(
        {'source': node_index[source], 'target': node_index[target], 'value': int(n_rows)}
        for (source, target), n_rows in pair_counts.items()
    )

# Create the Sankey diagram
fig = go.Figure(go.Sankey(
    node=dict(
        pad=3,
        thickness=20,
        line=dict(color="black", width=1),
        label=nodes,
        x=[0.9 for _ in nodes],
    ),
    link=dict(
        source=[link['source'] for link in links],
        target=[link['target'] for link in links],
        value=[link['value'] for link in links]
    )
))

fig.update_layout(title_text="Event Properties > Event Types Hierarchy (APPROPRIATE REPRESENTATION)", font_size=10, height=1000)
fig.show()

nodes_prop 286
nodes_lvl_0 34
nodes_lvl_1 62
nodes_lvl_2 26
nodes_lvl_3 9
