In [184]:
import pandas as pd

In [185]:
#load in all the sessions, and sessions where people product clicked
df_filter = pd.read_csv('export-chat-generated-unique-sessions-by-events-topic-no-filter.csv')
df_clickers = pd.read_csv('export-chat-generated-unique-sessions-by-events-topic-only-clickers.csv')

In [186]:
df_both = df_filter.join(df_clickers, lsuffix="all", rsuffix="clicker").drop("seriesclicker", axis=1).fillna(0)
df_both['ratio'] = df_both['total countclicker'] / df_both['total countall']

In [187]:
df_both

Unnamed: 0,seriesall,total countall,total countclicker,ratio
0,chat generated - none,439,47.0,0.107062
1,chat generated - laptops,108,15.0,0.138889
2,chat generated - snowboards,63,8.0,0.126984
3,chat generated - guitars,51,4.0,0.078431
4,chat generated - laptop,23,4.0,0.173913
5,chat generated - Laptop,19,4.0,0.210526
6,chat generated - phone,10,3.0,0.3
7,chat generated - smartphone,10,3.0,0.3
8,chat generated - car,9,2.0,0.222222
9,chat generated - mobile,9,2.0,0.222222


In [188]:
def parse(x):
    return x.split('-')[-1].strip()

df_both['seriesall'] = df_both['seriesall'].apply(parse)

In [191]:
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
df_both['embeds'] = df_both['seriesall'].apply(lambda x: embedder.encode(x))


In [197]:
"""Tools to visualize embeddings."""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans


def cluster(embeddings, labels, cluster_count=10):
    """Cluster the embeddings and return embeddings and cluster labels."""
    if not isinstance(embeddings, np.ndarray):
        embeddings = np.array(embeddings)
    kmeans = KMeans(n_clusters=cluster_count, random_state=42)
    labels = kmeans.fit_predict(embeddings)
    tsne = TSNE(n_components=2, random_state=42)
    embeddings_2d = tsne.fit_transform(embeddings)
    return embeddings_2d, labels


def plot_ploty(embeddings_2d, labels, labels_dict):
    """Plotly."""
    df = pd.DataFrame(
        {"x": embeddings_2d[:, 0], "y": embeddings_2d[:, 1], "label": labels})
    df["col"] = df["label"].map(labels_dict)
    fig = px.scatter(df, x="x", y="y", color="col", hover_name="label")
    fig.show()


def plot_ploty_third_axis(embeddings_2d, third, labels, labels_dict):
    """Same as above, but show a third axis"""
    df = pd.DataFrame(
        {"x": embeddings_2d[:, 0], "y": embeddings_2d[:, 1], "z": third, "label": labels})
    df["col"] = df["label"].map(labels_dict)
    fig = px.scatter_3d(df, x="x", y="y", z="z", color="col", hover_name="label")
    fig.show()  
    
    
def plot_3d_embeddings(embeddings, labels, labels_dict):
    """Project into three dimensions.
    Unlike prior functions, this one takes the embeddings straight up.
    """
    tsne = TSNE(n_components=3, random_state=42)
    embeddings_3d = tsne.fit_transform(embeddings)
    df = pd.DataFrame(
        {"x": embeddings_3d[:, 0], "y": embeddings_3d[:, 1], "z": embeddings_3d[:, 2], "label": labels})
    df["col"] = df["label"].map(labels_dict)
    fig = px.scatter_3d(df, x="x", y="y", z="z", text='label', color="col")
    fig.show()

In [193]:
embeds = np.concatenate(df_both['embeds'].to_numpy()).reshape(1425, 768)
twod, labels = cluster(embeds, df_both['seriesall'].to_numpy())
label_map = dict(zip(df_both['seriesall'], labels))
plot_ploty(twod, df_both['seriesall'].to_numpy(), label_map)






In [210]:
embeds = np.concatenate(df_both['embeds'].to_numpy()).reshape(1425, 768)
twod, labels = cluster(embeds, df_both['seriesall'].to_numpy())
label_map = dict(zip(df_both['seriesall'][1:], labels[1:]))
plot_ploty_third_axis(twod[1:], df_both['total countall'][1:], df_both['seriesall'][1:].to_numpy(), label_map)





In [211]:
embeds = np.concatenate(df_both['embeds'].to_numpy()).reshape(1425, 768)
twod, labels = cluster(embeds, df_both['seriesall'].to_numpy())
label_map = dict(zip(df_both['seriesall'][1:], labels[1:]))
plot_ploty_third_axis(twod[1:], df_both['total countall'][1:], df_both['seriesall'][1:].to_numpy(), label_map)





In [207]:
#analysis by cluster, not very helpful
embeds = np.concatenate(df_both['embeds'].to_numpy()).reshape(1425, 768)
twod, labels = cluster(embeds, df_both['seriesall'].to_numpy())
df_both['cluster'] = labels
df_cluster = df_both.groupby('cluster').sum()
df_cluster['ratio'] = df_cluster['total countclicker'] / df_cluster['total countall']
df_cluster




The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



Unnamed: 0_level_0,total countall,total countclicker,ratio
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,83,16.0,0.192771
1,181,17.0,0.093923
2,141,23.0,0.163121
3,33,33.0,1.0
4,103,21.0,0.203883
5,303,37.0,0.122112
6,306,34.0,0.111111
7,138,36.0,0.26087
8,185,33.0,0.178378
9,879,115.0,0.13083
