In [None]:
import numpy as np
import lancedb
import umap
import plotly.express as px
import pyarrow as pa 
import pandas as pd
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
# Name or path handed to HuggingFaceEmbeddings. 'MODEL_PATH' looks like a
# placeholder — replace it with the real model identifier before running.
model_name = 'MODEL_PATH' 
# Shared embedding object: later cells call embedding.embed_query(...) both to
# embed the dataset sentences and to embed the search query.
embedding = HuggingFaceEmbeddings(model_name=model_name)

In [None]:
# Load the dataset from CSV. 'DATA_PATH' looks like a placeholder — point it
# at the real file. Later cells read the 'Sentence' column from this frame and
# the 'Topic' column after the round-trip through LanceDB.
data = pd.read_csv('DATA_PATH')
# Bare expression so the notebook renders the loaded frame as the cell output.
data

In [None]:
def embed(text):
    """Embed one piece of text with the notebook-level ``embedding`` object.

    Thin wrapper around ``embedding.embed_query`` so that it can be handed to
    ``Series.apply``; the return value is passed through unchanged.
    """
    vector = embedding.embed_query(text)
    return vector

# Add a vector column to the DataFrame.
# All sentences are embedded in one batched embed_documents() call instead of
# one model invocation per row via .apply(); the sentences are the stored
# documents, so this is also the matching side of the embeddings API.
# Wrapping the result in a Series keyed by data.index keeps every vector
# attached to its own row regardless of the frame's index.
data['vector'] = pd.Series(
    embedding.embed_documents(data['Sentence'].tolist()),
    index=data.index,
)

# Convert the DataFrame (now including the 'vector' column) to a PyArrow Table
table = pa.Table.from_pandas(data)

# Connect to the local LanceDB directory and (re)create the table;
# mode="overwrite" replaces any existing "my_table" so the cell can be re-run.
db = lancedb.connect("./lancedb")
lance_table = db.create_table(
    "my_table",
    data=table,
    mode="overwrite"
)

In [None]:
# Read the stored table back into pandas; the next cell uses its 'vector' and
# 'Topic' columns for the UMAP projection.
df = lance_table.to_pandas()

In [None]:
# Stack the per-row embedding lists into a single NumPy array
vectors = np.array(df['vector'].tolist())

# Reduce the embeddings to three UMAP components (fixed seed)
reducer = umap.UMAP(random_state=42, n_components=3)
umap_embeddings = reducer.fit_transform(vectors)

# Coordinates frame, with each row's topic attached as the label column
umap_df = pd.DataFrame(umap_embeddings, columns=list('xyz')).assign(text=df['Topic'])

# 3-D scatter of the projection, labelled and coloured by topic
fig = px.scatter_3d(
    umap_df, x='x', y='y', z='z',
    color='text', text='text', hover_name='text',
    color_continuous_scale='Plasma',
)

# Small, semi-transparent markers; hover shows the text label
fig.update_traces(marker={'opacity': 0.5, 'size': 3}, hoverinfo='text')

# Fixed 1600x1000 canvas
fig.update_layout(height=1000, width=1600)
fig.show()

In [None]:
# Embed the query string 'Gaming' with the same embedding object used for the
# dataset and search the table with it, capped at 100 rows. The result is
# converted to a DataFrame in a later cell.
result = lance_table.search(embedding.embed_query('Gaming')).limit(100)

In [None]:
# Materialise the search result as a DataFrame; later cells read its
# '_distance', 'vector' and 'Topic' columns.
query_df = result.to_pandas()

In [None]:
# Row count of the search result (the query above was limited to 100).
len(query_df)

In [None]:
# Preview the first rows of the search result.
query_df.head()

In [None]:
# Marker-size range used for the query plot: the closest hit gets max_size,
# the farthest gets min_size.
min_size, max_size = 0.1, 20

# Min-max scale the distances to [0, 1] and invert them so that a smaller
# '_distance' yields a larger marker. The previous reciprocal form divided by
# (distance - min distance), which is zero for the nearest row and therefore
# produced an infinite size; it also ignored min_size / max_size.
distances = query_df['_distance']
distance_span = distances.max() - distances.min()
if distance_span > 0:
    scaled_distance = (distances - distances.min()) / distance_span
else:
    # Empty result or all rows equally distant: avoid 0/0 and give every row
    # the same (maximum) size.
    scaled_distance = distances * 0.0
query_df['normalized_size'] = max_size - scaled_distance * (max_size - min_size)


In [None]:
# Stack the embedding lists of the search hits into a single NumPy array
vectors = np.array(query_df['vector'].tolist())

# Reduce the hit embeddings to three UMAP components (fixed seed)
reducer = umap.UMAP(random_state=42, n_components=3)
umap_embeddings = reducer.fit_transform(vectors)

# Coordinates frame plus the per-hit topic, marker size and search distance
umap_df = pd.DataFrame(umap_embeddings, columns=list('xyz')).assign(
    text=query_df['Topic'],
    size=query_df['normalized_size'],
    _distance=query_df['_distance'],
)

# 3-D scatter of the hits: coloured and labelled by topic, sized by the
# normalized size column, with topic and distance shown on hover
fig = px.scatter_3d(
    umap_df, x='x', y='y', z='z',
    color='text', text='text',
    size='size',
    hover_data=['text', '_distance'],
    color_continuous_scale='Plasma',
)

# Semi-transparent markers
fig.update_traces(marker={'opacity': 0.5})

# Fixed 1600x1000 canvas
fig.update_layout(height=1000, width=1600)

fig.show()