First, let's pull the data in a format that can be used for clustering and propensity modeling.

In [None]:
# Config
import cjapy
# Load the cjapy configuration file and create the CJA API client
cjapy.importConfigFile("python_config.json")
cja = cjapy.CJA()

# Data view to query, and the start/end interval applied as a global filter
data_view = "dv_62ba17d5a5d7845496f5fb4d"
dateRange = "2024-01-01T00:00:00.000/2024-01-17T00:00:00.000"

# Build the report request: person id as the dimension, plus the
# important/interesting/differentiating metrics we want per person
myRequest = cjapy.RequestCreator()
myRequest.setDataViewId(data_view)
myRequest.setDimension("variables/adobe_personid")
for metric_id in (
    "metrics/orders",
    "metrics/revenue",
    "metrics/visits",
    "metrics/occurrences",
    "metrics/adobe_timespent",
):
    myRequest.addMetric(metric_id)
myRequest.addGlobalFilter(dateRange)

# Run the report against CJA; the trailing expression displays the result
myReport = cja.getReport(myRequest)
myReport.dataframe

Now let's feed this data into a clustering model. There are many types of clustering approaches and we don't have time to go through all of them in this lab, but there are some excellent resources here:
https://sites.northwestern.edu/researchcomputing/2022/03/14/online-learning-resources-clustering/

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE
import pandas as pd
import plotly.express as px

# `df` keeps the person id next to the metrics (it is reused by later cells for
# cluster labels and plotting); only the report's "itemId" column is removed.
df = myReport.dataframe.drop(columns=["itemId"])

# `X` is the feature matrix handed to t-SNE: the person id is dropped as well.
# NOTE(review): assumes every remaining column is numeric - confirm.
X = myReport.dataframe.drop(columns=['variables/adobe_personid', "itemId"])

# Create t-SNE instance.
# t-SNE is stochastic: without a fixed random_state the embedding (and every
# cluster derived from it) changes on each run, so pin the seed.
tsne = TSNE(n_components=2, random_state=42)

# Apply t-SNE: project the metric columns down to two dimensions
X_tsne = tsne.fit_transform(X)

# Convert the t-SNE results to a DataFrame.
# Reuse X's index so the embedding rows stay aligned with the rows of `df`
# whenever the two frames are joined on index.
df_tsne = pd.DataFrame(X_tsne, columns=['tsne_1', 'tsne_2'], index=X.index)

Let's visualize what these transformed columns look like. t-SNE was designed to make multi-dimensional vectors easy to visualize in two dimensions.

In [None]:
# Build the 2-D scatter of the t-SNE embedding. Each axis is anchored to the
# other with a 1:1 scale ratio, and the figure height is fixed at 600.
fig = (
    px.scatter(df_tsne, x='tsne_1', y='tsne_2', title='t-SNE Visualization')
    .update_xaxes(scaleanchor="y", scaleratio=1)
    .update_yaxes(scaleanchor="x", scaleratio=1)
    .update_layout(height=600)
)

# Render the figure
fig.show()

With the transformed data, we can now cluster the points more easily.

In [None]:
# Create DBSCAN instance (you can customize parameters like eps and min_samples)
dbscan = DBSCAN(eps=4.0, min_samples=20)

# Fit DBSCAN to the t-SNE results
dbscan.fit(df_tsne)

# Attach the cluster labels (as a categorical column, so plotly colors them
# discretely) and the t-SNE coordinates to the per-person DataFrame.
# `assign` overwrites columns that already exist, so re-running this cell does
# not append a second pair of tsne_1/tsne_2 columns the way pd.concat would.
# Values are attached positionally (row i of the embedding belongs to row i of
# `df`), so the result does not depend on the two frames sharing an index.
df = df.assign(
    cluster=pd.Series(dbscan.labels_, index=df.index).astype('category'),
    tsne_1=df_tsne['tsne_1'].to_numpy(),
    tsne_2=df_tsne['tsne_2'].to_numpy(),
)

# Create the scatter plot, colored by cluster, with a 1:1 axis scale
fig = px.scatter(df, x='tsne_1', y='tsne_2', color='cluster', title='t-SNE Visualization with DBSCAN Clusters')
fig.update_xaxes(scaleanchor="y", scaleratio=1)
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_layout(height=600)

# Show the plot
fig.show()

Now we can visualize how each of these clusters performs on a specific metric, such as revenue.

In [None]:
# 3-D view: t-SNE coordinates on x/y, revenue on z, points colored by cluster.
# The figure height is fixed at 700.
fig = px.scatter_3d(
    df,
    x='tsne_1',
    y='tsne_2',
    z='metrics/revenue',
    color='cluster',
    title='3D t-SNE Visualization with DBSCAN Clusters',
).update_layout(height=700)

# Render the figure
fig.show()