In [None]:
import os

import numpy as np
import polars as pl
from dotenv import load_dotenv
from langfuse.decorators import langfuse_context, observe

# from openai import AzureOpenAI
# from langfuse.openai import openai
from langfuse.openai import AzureOpenAI


# https://cookbook.openai.com/examples/get_embeddings_from_dataset
# https://learn.microsoft.com/en-us/azure/ai-services/openai/tutorials/embeddings?tabs=python-new%2Ccommand-line&pivots=programming-language-python
# from utils.third_party.embeddings_utils import get_embedding  # requires OPEN_API_KEY in env

In [None]:
# Set up the environment
# load_dotenv() comes from python-dotenv; it is expected to load variables from a
# local .env file so the os.getenv() lookups in this notebook can see them.
load_dotenv()
# openai.api_key = os.environ["AZURE_OPENAI_API_KEY"]

# Azure OpenAI connection settings read from the environment.
# NOTE(review): none of these three names is referenced again in the visible part
# of the notebook, and the client cell reads AZURE_OPENAI_ENDPOINT rather than
# ENDPOINT_URL — confirm which variable is the intended one.
endpoint = os.getenv("ENDPOINT_URL", None)
deployment = os.getenv("DEPLOYMENT_NAME", "gpt-35-turbo")
subscription_key = os.getenv("AZURE_OPENAI_API_KEY")

In [2]:
def break_down(convo, cid):
    """
    Flatten one conversation into a polars DataFrame with one row per message.

    Parameters
    ----------
    convo : sequence of dict
        The messages of the conversation, in order. Every message must provide
        the keys 'role', 'content' and 'turn'.
    cid : any
        Conversation ID; it is repeated unchanged on every row.

    Returns
    -------
    pl.DataFrame
        Columns 'cid', 'role' (str), 'content' (str) and 'turn' (int), one row
        per message. An empty conversation yields a zero-row frame that still
        carries these four columns, so it can be stacked with the frames of
        non-empty conversations.

    Raises
    ------
    KeyError
        If a message lacks one of the required keys.
    ValueError
        If a 'turn' value cannot be converted to int.
    """
    roles, contents, turns = [], [], []
    for message in convo:
        roles.append(str(message["role"]))
        contents.append(str(message["content"]))
        turns.append(int(message["turn"]))

    if not roles:
        # Build a one-row template so every column gets its usual dtype
        # (including whatever dtype `cid` maps to), then drop the row.
        return pl.DataFrame(
            {"cid": [cid], "role": [""], "content": [""], "turn": [0]}
        ).head(0)

    # Construct the frame once instead of stacking one single-row frame per
    # message, which is slow and leaves one memory chunk per row.
    return pl.DataFrame(
        {
            "cid": [cid] * len(roles),
            "role": roles,
            "content": contents,
            "turn": turns,
        }
    )

In [None]:
import json

# NOTE(review): the path ends in ".csv" but the file is parsed with pl.read_json
# below; presumably the file actually contains JSON — confirm, or rename the file.
MERGED_DATA_PATH = "../data/dataset/dataset-2.csv"
# Load the dataset; the aggregation cell further down reads its 'conversation'
# and 'conversation_id' columns.
df = pl.read_json(MERGED_DATA_PATH)
# Quick inspection helpers (disabled)
# print(df.head())
# df.head()

In [4]:
# Flatten every conversation into one row per message.
# The per-conversation frames are collected and concatenated once, which gives a
# single contiguous frame instead of one extra chunk per vstack call.
convo_frames = []
for row in df.iter_rows(named=True):
    convo = row["conversation"]
    cid = row["conversation_id"]

    # The column may hold either a JSON string or an already-parsed list of dicts.
    structured_convo = json.loads(convo) if isinstance(convo, str) else convo

    cdf = break_down(structured_convo, cid)
    # Column-less frames (e.g. from a conversation with no messages) cannot be
    # concatenated with the four-column frames, so they are skipped.
    if cdf.width > 0:
        convo_frames.append(cdf)

# pl.concat rejects an empty list, so fall back to an empty frame in that case.
agg_df = pl.concat(convo_frames, rechunk=True) if convo_frames else pl.DataFrame()
print(agg_df.head())

shape: (5, 4)
┌─────┬───────────┬─────────────────────────────────┬──────┐
│ cid ┆ role      ┆ content                         ┆ turn │
│ --- ┆ ---       ┆ ---                             ┆ ---  │
│ str ┆ str       ┆ str                             ┆ i64  │
╞═════╪═══════════╪═════════════════════════════════╪══════╡
│ c0  ┆ clinician ┆ I'm feeling extremely low and … ┆ 0    │
│ c0  ┆ companion ┆ I'm really glad you reached ou… ┆ 0    │
│ c0  ┆ clinician ┆ Have you ever taken any action… ┆ 1    │
│ c0  ┆ companion ┆ I don't have personal experien… ┆ 1    │
│ c0  ┆ clinician ┆ Is there anything causing you … ┆ 2    │
└─────┴───────────┴─────────────────────────────────┴──────┘


In [5]:
# Embedding configuration constants.
embedding_model = "text-embedding-3-small"  # name of the embedding model
embedding_encoding = "cl100k_base"  # tokenizer encoding name (see the disabled tiktoken lines below)
max_tokens = 8000  # the maximum for text-embedding-3-small is 8191

# Disabled: token counting with tiktoken.
# tokenizer = tiktoken.get_encoding("cl100k_base")
# df['n_tokens'] = df["final_text"].apply(lambda x: len(tokenizer.encode(x)))

In [6]:
# Azure OpenAI client (the AzureOpenAI class imported from langfuse.openai at the
# top of the notebook). It is the module-level client used by the embedding
# helpers defined below.
# NOTE(review): the endpoint is read from AZURE_OPENAI_ENDPOINT here, while the
# setup cell reads ENDPOINT_URL — confirm which variable is the intended one.
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2024-02-01",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
)


@observe(as_type="generation")
def generate_embeddings(text, model="text-embedding-3-small"):
    """
    Return the embedding vector for a single text.

    Parameters
    ----------
    text : str
        The text to embed; it is sent as a one-element batch.
    model : str
        Model (deployment) name passed to the embeddings endpoint.

    Returns
    -------
    The ``embedding`` attribute of the first item in the response data.
    """
    embeddings = client.embeddings.create(input=[text], model=model).data[0].embedding
    # Record the model that was actually requested; the previous hard-coded
    # literal mislabelled the observation whenever a different model was passed.
    langfuse_context.update_current_observation(model=model)
    return embeddings


# Embed every message and attach the vectors as a new 'emb' column.
# generate_embeddings sends one request per row, so this cell is slow on the
# full dataset. The model name comes from the `embedding_model` constant defined
# in the configuration cell instead of repeating the literal.
agg_df = agg_df.with_columns(
    pl.Series(
        "emb",
        [
            generate_embeddings(content, model=embedding_model)
            for content in agg_df["content"].to_list()
        ],
    )
)

API error occurred: Internal server error occurred. For help, please contact support: https://langfuse.com/support
API error occurred: Internal server error occurred. For help, please contact support: https://langfuse.com/support
API error occurred: Internal server error occurred. For help, please contact support: https://langfuse.com/support
API error occurred: Internal server error occurred. For help, please contact support: https://langfuse.com/support
API error occurred: Internal server error occurred. For help, please contact support: https://langfuse.com/support
API error occurred: Internal server error occurred. For help, please contact support: https://langfuse.com/support
API error occurred: Internal server error occurred. For help, please contact support: https://langfuse.com/support
API error occurred: Internal server error occurred. For help, please contact support: https://langfuse.com/support
API error occurred: Internal server error occurred. For help, please contact sup

In [12]:
# Number of message rows in the aggregated frame
agg_df.height

4957

In [None]:
# Attach a running row number (0..n-1) as an 'index' column
row_numbers = pl.Series("index", range(agg_df.height))
agg_df = agg_df.with_columns(row_numbers)

# Persist the frame, embeddings included, as JSON
agg_df.write_json("../data/dataset/gpt4o_guided_embeddings.json")

In [15]:
# Load the JSON file written by the previous cell into a separate frame
mdf = pl.read_json("../data/dataset/gpt4o_guided_embeddings.json")
mdf.head()

cid,role,content,turn,emb,index
str,str,str,i64,list[f64],i64
"""c0""","""clinician""","""I'm feeling extremely low and …",0,"[-0.007844, -0.002921, … 0.028327]",0
"""c0""","""companion""","""I'm really glad you reached ou…",0,"[0.018988, -0.03178, … 0.044438]",1
"""c0""","""clinician""","""Have you ever taken any action…",1,"[0.022273, -0.011298, … -0.010838]",2
"""c0""","""companion""","""I don't have personal experien…",1,"[0.03026, -0.022518, … 0.025142]",3
"""c0""","""clinician""","""Is there anything causing you …",2,"[0.010532, -0.022993, … 0.003161]",4


In [None]:
def cosine_similarity(a, b):
    """
    Cosine similarity between two vectors.

    Parameters
    ----------
    a, b : array-like
        Vectors of equal length.

    Returns
    -------
    float
        dot(a, b) / (||a|| * ||b||). If either vector has zero norm the ratio
        is undefined; 0.0 is returned in that case instead of NaN so that
        results stay sortable.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # Avoid 0/0 (NaN plus a RuntimeWarning) for all-zero vectors.
        return 0.0
    return np.dot(a, b) / norm_product


def get_embedding(text, model="text-embedding-ada-002"):  # model = "deployment_name"
    """Embed a single text with the module-level client and return its vector."""
    response = client.embeddings.create(input=[text], model=model)
    first_item = response.data[0]
    return first_item.embedding


def search_docs(
    df,
    user_query,
    top_n=4,
    to_print=True,
    embedding_col="ada_v2",
    model="text-embedding-ada-002",
):
    """
    Rank the rows of a DataFrame by cosine similarity to a query.

    Parameters
    ----------
    df : DataFrame
        Must support column assignment, ``Series.apply`` and ``sort_values``
        (i.e. the pandas API). NOTE(review): a polars frame such as the ones
        built in this notebook would presumably need converting first — confirm.
    user_query : str
        Text to embed and compare against every row.
    top_n : int
        Number of best-matching rows to return.
    to_print : bool
        When true, the result is shown with ``display`` (available inside
        IPython/Jupyter).
    embedding_col : str
        Name of the column holding one embedding vector per row. Defaults to
        the previously hard-coded 'ada_v2'.
    model : str
        Model (deployment) name used to embed the query; it should be the same
        model that produced the vectors in ``embedding_col``.

    Returns
    -------
    DataFrame
        The ``top_n`` rows with the highest similarity, best first.

    Side effects
    ------------
    A 'similarities' column is added to (or overwritten on) ``df`` itself.
    """
    embedding = get_embedding(user_query, model=model)
    df["similarities"] = df[embedding_col].apply(
        lambda x: cosine_similarity(x, embedding)
    )

    res = df.sort_values("similarities", ascending=False).head(top_n)
    if to_print:
        display(res)
    return res

In [None]:
# cluster each sentence in the embedding space using the polars dataframe 'emb' column
def cluster_embeddings(df, n_clusters=4):
    """
    Assign a KMeans cluster label to every row based on its embedding.

    Parameters
    ----------
    df : DataFrame
        Frame with an 'emb' column holding one embedding vector per row.
    n_clusters : int
        Number of KMeans clusters.

    Returns
    -------
    DataFrame
        For a polars frame, a new frame with an added 'cluster' column (polars
        frames do not support ``df["col"] = ...`` assignment). Any other frame
        type is updated in place and returned, as before.
    """
    from sklearn.cluster import KMeans

    # Convert the embeddings to a 2-D numpy array (rows x embedding dimension)
    embeddings = np.array(df["emb"].to_list())

    # Fixed random_state keeps the labels reproducible between runs
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    labels = kmeans.fit_predict(embeddings)

    if isinstance(df, pl.DataFrame):
        return df.with_columns(pl.Series("cluster", labels))

    # Frames that support item assignment keep the original in-place behaviour.
    df["cluster"] = labels
    return df


# Label every message with one of 4 KMeans clusters
agg_df = cluster_embeddings(agg_df, n_clusters=4)
# Store the cluster labels as plain 64-bit integers
agg_df = agg_df.with_columns(pl.col("cluster").cast(pl.Int64))

In [None]:
# save the dataframe (including the 'cluster' column) to a JSON file
agg_df.write_json("sample_convo_embeddings.json")

In [None]:
# cluster each sentences in the embedding space
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE


def plot_clusters(df, n_clusters=2):
    """
    Draw a static 2-D t-SNE scatter plot of the embeddings, coloured by cluster.

    The 'emb' column of ``df`` is projected to two dimensions with t-SNE and
    clustered with KMeans (both with random_state=0). Every point is annotated
    with the text from the 'content' column. The figure is shown; nothing is
    returned.
    """
    # Polars list column -> nested Python list -> 2-D numpy array
    vectors = np.array(df["emb"].to_list())

    # Perplexity is capped at len(df) - 1 so small frames can still be projected
    projector = TSNE(n_components=2, random_state=0, perplexity=min(30, len(df) - 1))
    coords = projector.fit_transform(vectors)

    # Cluster in the original embedding space, not in the 2-D projection
    labels = KMeans(n_clusters=n_clusters, random_state=0).fit_predict(vectors)
    df = df.with_columns(pl.Series("cluster", labels))

    plt.figure(figsize=(10, 8))
    sns.scatterplot(
        x=coords[:, 0],
        y=coords[:, 1],
        hue=df["cluster"].to_list(),
        palette=sns.color_palette("hsv", n_clusters),
        legend="full",
        alpha=0.3,
    )

    # Write each message's text next to its point
    for (x_pos, y_pos), content in zip(coords, df["content"].to_list()):
        plt.text(x_pos, y_pos, content, fontsize=8, alpha=0.7, ha="right")

    plt.show()


# Static t-SNE scatter of all messages, coloured by 4 KMeans clusters
plot_clusters(agg_df, n_clusters=4)

In [None]:
# Draw an interactive plot using plotly which visualizes the clusters
import plotly.express as px
import plotly.io as pio

# Global plotly setting: it applies to every later fig.show() call in this kernel.
pio.renderers.default = "browser"  # Set the default renderer to 'browser'


def plot_clusters_interactive(df, n_clusters=2):
    """
    Show an interactive t-SNE scatter plot of the embeddings, coloured by cluster.

    Parameters
    ----------
    df : pl.DataFrame
        Frame with an 'emb' column (one embedding vector per row) and a
        'content' column used as the hover label.
    n_clusters : int
        Number of KMeans clusters.

    Returns
    -------
    The plotly figure that was shown, so the caller can export it
    (e.g. with ``write_html``). Previously nothing was returned.
    """
    tsne = TSNE(n_components=2, random_state=0, perplexity=min(30, len(df) - 1))
    embeddings = np.array(
        df["emb"].to_list()
    )  # Convert Polars Series to a list and then to a numpy array
    X_2d = tsne.fit_transform(embeddings)

    # Perform KMeans clustering on the original embeddings, not the projection
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    clusters = kmeans.fit_predict(embeddings)
    df = df.with_columns(pl.Series("cluster", clusters))

    fig = px.scatter(
        x=X_2d[:, 0],
        y=X_2d[:, 1],
        color=df["cluster"].to_list(),  # Ensure 'cluster' is converted to a list
        hover_name=df["content"].to_list(),
        title="t-SNE visualization of clustered embeddings",
        labels={"x": "t-SNE Component 1", "y": "t-SNE Component 2"},
    )

    fig.show()
    return fig


# Render once and keep the returned figure: `fig` inside the function is local,
# so it has to be captured here before it can be saved.
fig = plot_clusters_interactive(agg_df, n_clusters=4)
# save the plot to a file
fig.write_html("clusters.html")