# Embedding

In [20]:
# imports
import pandas as pd
import tiktoken
import openai

from openai.embeddings_utils import get_embedding

In [23]:
# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this is the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191

In [24]:
# Load the paper metadata (the first CSV column becomes the index) and build
# the text that is embedded for every paper.
input_data = "papers.csv"
df = pd.read_csv(input_data, index_col=0)

# Concatenating a string with a missing value yields NaN, and a NaN text cannot
# be embedded. Missing titles/abstracts are therefore treated as empty text so
# that every paper gets a usable `combined` string.
df["combined"] = (
    "Title: "
    + df.title.fillna("").str.strip()
    + "; Abstract: "
    + df.abstract.fillna("").str.strip()
)
df.head(2)

Unnamed: 0_level_0,title,author,venue,year,citationCount,url,abstract,combined
paperId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
162e2e9ac70702c146c0aa8432e4a6806bb8c42e,Coupling Large Language Models with Logic Prog...,Zhun Yang,Annual Meeting of the Association for Computat...,2023,1,https://arxiv.org/abs/2307.07696,"While large language models (LLMs), such as GP...",Title: Coupling Large Language Models with Log...
38fe8f324d2162e63a967a9ac6648974fc4c66f3,PaLM-E: An Embodied Multimodal Language Model,Danny Driess,International Conference on Machine Learning,2023,274,https://arxiv.org/abs/2303.03378,Large language models excel at a wide range of...,Title: PaLM-E: An Embodied Multimodal Language...


In [25]:
encoding = tiktoken.get_encoding(embedding_encoding)


def _truncate_to_token_limit(text, limit=max_tokens):
    """Return `text` cut down to at most `limit` tokens of `encoding`.

    Texts that already fit are returned unchanged, so short inputs are
    embedded exactly as before; only over-long inputs are shortened so the
    request stays below the model's input limit.
    """
    tokens = encoding.encode(text)
    if len(tokens) <= limit:
        return text
    return encoding.decode(tokens[:limit])


# This may take a few minutes
df["embedding"] = df.combined.apply(
    lambda x: get_embedding(_truncate_to_token_limit(x), engine=embedding_model)
)
df.to_csv("papers_embedded.csv")

# Data Visualization

In [87]:
# Marker shapes and colors shared by the cluster plots below. The plotting
# functions iterate over zip(markers, colors), so cluster k is drawn with
# markers[k] and colors[k]; both lists hold 16 entries, matching the upper
# bound of the cluster-count sliders.
# NOTE(review): the last color ('#f7f7f7') is very light and may be hard to see
# on a white background — confirm it is intended.
markers = ['o', 's', '^', 'v', '<', '>', 'p', '*', 'D', 'H', '+', 'x', '|', '_', '1', '2']
colors = [
    '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', 
    '#e377c2', '#7f7f7f', '#bcbd22', '#17becf', '#40004b', '#762a83', 
    '#9970ab', '#c2a5cf', '#e7d4e8', '#f7f7f7'
]


## OpenAI Embedding + KMeans

In [88]:
from sklearn.manifold import TSNE
import matplotlib
import matplotlib.pyplot as plt
from scipy.spatial import ConvexHull
import numpy as np
import ipywidgets as widgets
from IPython.display import display, clear_output
from sklearn.metrics import silhouette_score
import pandas as pd
from ast import literal_eval
from sklearn.cluster import KMeans

# load data
# The file is read without index_col, so df gets pandas' default 0..n-1 index.
datafile_path = "papers_embedded.csv"
df = pd.read_csv(datafile_path)
df["embedding"] = df.embedding.apply(literal_eval).apply(np.array)  # convert string to numpy array
matrix = np.vstack(df.embedding.values)

# project the embeddings to 2D with t-SNE (fixed random_state)
tsne = TSNE(n_components=2, perplexity=15, random_state=42, init="random", learning_rate=200)
vis_dims2 = tsne.fit_transform(matrix)

# 2D data points
x = [x for x, y in vis_dims2]
y = [y for x, y in vis_dims2]

# compute the centroid of the projection
avg_x = np.mean(x)
avg_y = np.mean(y)

# distance between each data point and the centroid
distances = np.sqrt((x - avg_x)**2 + (y - avg_y)**2)

# set the threshold (e.g. twice the mean distance)
threshold = 2 * np.mean(distances)

# find the outlier indices (row positions in the projection)
outliers = np.where(distances > threshold)[0]

# data frame without the outliers
# `outliers` holds row positions while drop() expects index labels; the two
# coincide here because df uses the default integer index.
filtered_df = df.drop(outliers)
filtered_matrix = np.vstack(filtered_df.embedding.values)

# t-SNE is fitted again on the remaining rows; these coordinates are the ones
# plotted by visualize_clusters below
vis_dims2 = tsne.fit_transform(filtered_matrix)
filtered_x = [x for x, y in vis_dims2]
filtered_y = [y for x, y in vis_dims2]

def visualize_clusters(n_clusters):
    """Cluster the outlier-filtered embeddings with KMeans and plot them.

    The labels are stored in filtered_df['Cluster']. The silhouette score and
    the cluster sizes are printed first, then every non-empty cluster is drawn
    on the precomputed 2D coordinates (filtered_x / filtered_y) with its own
    marker and color, a convex hull when it has more than two points, and an
    "x" at its mean position.
    """
    model = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    filtered_df['Cluster'] = model.fit_predict(filtered_matrix)

    score = silhouette_score(filtered_matrix, filtered_df['Cluster'])
    print(f"Silhouette Score for {n_clusters} clusters: {score:.4f}")

    print("\nNumber of data points in each cluster:")
    print(filtered_df['Cluster'].value_counts())

    plt.figure(figsize=(10, 8))
    for cluster_id, (marker, color) in enumerate(zip(markers, colors)):
        members = filtered_df.Cluster == cluster_id
        xs = np.array(filtered_x)[members]
        ys = np.array(filtered_y)[members]

        # Nothing to draw for cluster ids that received no points
        if len(xs) == 0:
            continue

        plt.scatter(xs, ys, color=color, alpha=0.3, marker=marker, label=f'Cluster {cluster_id}')

        # A hull is only built for clusters with more than two points
        if len(xs) > 2:
            outline = ConvexHull(np.column_stack((xs, ys))).vertices
            plt.fill(xs[outline], ys[outline], color=color, alpha=0.1)

        # Mark the mean position of the cluster
        plt.scatter(xs.mean(), ys.mean(), marker="x", color=color, s=100)

    plt.title(f"Clusters identified with {n_clusters} clusters")
    plt.legend()
    plt.show()


widgets.interactive(
    visualize_clusters,
    n_clusters=widgets.IntSlider(min=2, max=16, step=1, value=4),
)


interactive(children=(IntSlider(value=4, description='n_clusters', max=16, min=2), Output()), _dom_classes=('w…

## Semantic Scholar Embedding + KMeans

In [112]:
# imports
import numpy as np
import pandas as pd
from ast import literal_eval

# load data
# NOTE(review): this expects "papers.csv" to already contain an `embedding`
# column holding list literals (the recorded output shows a 201 x 768 matrix) —
# confirm how that column is produced.
datafile_path = "papers.csv"

df = pd.read_csv(datafile_path)
df["embedding"] = df.embedding.apply(literal_eval).apply(np.array)  # convert string to numpy array
matrix = np.vstack(df.embedding.values)
matrix.shape

(201, 768)

In [142]:
# Show the columns currently present in df.
# NOTE(review): the recorded output lists 'Cluster' and 'category', which are
# only added by later cells, so this was presumably run out of order.
df.keys()

Index(['paperId', 'title', 'author', 'venue', 'year', 'citationCount', 'url',
       'abstract', 'embedding', 'Cluster', 'category'],
      dtype='object')

In [113]:
from sklearn.metrics import silhouette_score

def visualize_clusters(n_clusters):
    """Cluster the embeddings in `matrix` with KMeans and plot them in 2D.

    The labels are written to df['Cluster']. A t-SNE projection of `matrix`
    is drawn with one marker/color per non-empty cluster, a convex hull for
    clusters with more than two points and an "x" at each cluster's mean
    position. Cluster sizes and the silhouette score are printed afterwards.
    """
    df['Cluster'] = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit_predict(matrix)

    # 2D data points
    projector = TSNE(n_components=2, perplexity=15, random_state=42, init="random", learning_rate=200)
    projection = projector.fit_transform(matrix)
    all_x = np.array([point[0] for point in projection])
    all_y = np.array([point[1] for point in projection])

    plt.figure(figsize=(10, 8))
    for cluster_id, (marker, color) in enumerate(zip(markers, colors)):
        members = df.Cluster == cluster_id
        xs = all_x[members]
        ys = all_y[members]

        # Only draw and average clusters that actually contain data points
        if len(xs) == 0:
            continue

        plt.scatter(xs, ys, color=color, alpha=0.3, marker=marker, label=f'Cluster {cluster_id}')
        if len(xs) > 2:
            outline = ConvexHull(np.column_stack((xs, ys))).vertices
            plt.fill(xs[outline], ys[outline], color=color, alpha=0.1)
        plt.scatter(xs.mean(), ys.mean(), marker="x", color=color, s=100)

    plt.title(f"Clusters identified with {n_clusters} clusters")
    plt.legend()
    plt.show()

    # Number of data points in each cluster
    print("Number of data points in each cluster:")
    print(df['Cluster'].value_counts())

    # Silhouette score
    score = silhouette_score(matrix, df['Cluster'])
    print(f"Silhouette Score for {n_clusters} clusters: {score:.4f}")


widgets.interactive(
    visualize_clusters,
    n_clusters=widgets.IntSlider(min=2, max=16, step=1, value=4),
)

interactive(children=(IntSlider(value=4, description='n_clusters', max=16, min=2), Output()), _dom_classes=('w…

In [129]:
import openai

# Number of top-level clusters. Defined in this cell so that the KMeans model
# and the loop below share one value instead of relying on an `n_clusters`
# variable left behind by another cell.
n_clusters = 4

kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(matrix)

# Maximum number of abstracts sampled per cluster to build the prompt.
abstracts_per_cluster = 10

for i in range(n_clusters):
    print(f"Cluster {i} Theme:", end=" ")

    # Keep only papers that have an abstract, then sample whole rows once so
    # that the titles printed below belong to the abstracts sent to the model.
    # The sample size is capped by the number of available rows, so small
    # clusters no longer make sample() raise.
    valid_abstracts = df[(df.Cluster == i) & (df.abstract.notna())]
    sample_cluster_rows = valid_abstracts.sample(
        min(abstracts_per_cluster, len(valid_abstracts)), random_state=42
    )

    abstracts = "\n".join(sample_cluster_rows.abstract.values)

    # NOTE(review): this uses the Completion endpoint with text-davinci-003;
    # confirm both are still available before re-running.
    response = openai.Completion.create(
        engine="text-davinci-003",
        prompt=f'What\'s a common topic regarding robotics in these researches?\n\nAbstracts:\n"""\n{abstracts}\n"""\n\nCategory:',
        temperature=0,
        max_tokens=64,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    print(response["choices"][0]["text"].replace("\n", ""))

    # Titles of the papers whose abstracts were used in the prompt
    for title in sample_cluster_rows.title.values:
        print(title, end='\n\n')

    print("-" * 100)


Cluster 0 Theme:  Human-Robot Interaction
BabyAI: A Platform to Study the Sample Efficiency of Grounded Language Learning

Correcting Robot Plans with Natural Language Feedback

Coupling Large Language Models with Logic Programming for Robust and General Reasoning from Text

Language Instructed Reinforcement Learning for Human-AI Coordination

Grounding Large Language Models in Interactive Environments with Online Reinforcement Learning

Reshaping Robot Trajectories Using Natural Language Commands: A Study of Multi-Modal Data Alignment Using Transformers

Generalized Planning in PDDL Domains with Pretrained Large Language Models

Chat with the Environment: Interactive Multimodal Perception using Large Language Models

RMM: A Recursive Mental Model for Dialog Navigation

Visually-Grounded Planning without Vision: Language Models Infer Detailed Plans from High-level Instructions

----------------------------------------------------------------------------------------------------
Cluster 

In [135]:
# Define a mapping of cluster numbers to themes
# NOTE(review): the names were presumably transcribed from the themes printed
# by the previous cell; they are only valid for that exact 4-cluster KMeans
# labelling — confirm after any re-clustering.
cluster_to_theme = {
    0: "Human-Robot Interaction",
    1: "Robot Manipulation",
    2: "Robot Task and Motion Planning",
    3: "Vision-Language Robotics"
}

# Add the 'category' column to the DataFrame
# (cluster numbers missing from the mapping become NaN)
df['category'] = df['Cluster'].map(cluster_to_theme)

In [136]:
from sklearn.metrics import silhouette_score

def visualize_clusters_by_category(category, n_clusters):
    """Sub-cluster the papers of one category and plot the result in 2D.

    Rows of `df` whose 'category' equals `category` are copied and projected
    with t-SNE. Points lying farther than twice the mean distance from the
    centroid of the projection are discarded, and the remaining embeddings
    are clustered with KMeans. The plot is shown first, followed by the
    cluster sizes and the silhouette score.
    """
    papers = df[df['category'] == category].copy()

    # Compute the 2D representation using t-SNE
    projector = TSNE(n_components=2, perplexity=15, random_state=42, init="random", learning_rate=200)
    projection = projector.fit_transform(np.vstack(papers.embedding.values))

    # Distance of every projected point to the centroid of the projection
    proj_x = np.array([point[0] for point in projection])
    proj_y = np.array([point[1] for point in projection])
    center_x = np.mean(proj_x)
    center_y = np.mean(proj_y)
    spread = np.sqrt((proj_x - center_x)**2 + (proj_y - center_y)**2)
    outlier_positions = np.where(spread > 2 * np.mean(spread))[0]

    # The positions refer to rows of `papers`; translate them to index labels
    # before dropping, and remove the same rows from the projection
    papers.drop(papers.iloc[outlier_positions].index, inplace=True)
    projection = np.delete(projection, outlier_positions, axis=0)
    features = np.vstack(papers.embedding.values)

    # Perform clustering on the remaining embeddings
    papers['Cluster'] = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit_predict(features)

    # 2D data points that survived the outlier removal
    kept_x = np.array([point[0] for point in projection])
    kept_y = np.array([point[1] for point in projection])

    plt.figure(figsize=(10, 8))
    for cluster, (marker, color) in enumerate(zip(markers, colors)):
        members = papers.Cluster == cluster
        xs = kept_x[members]
        ys = kept_y[members]

        # Skip cluster ids without any data points
        if len(xs) == 0:
            continue

        plt.scatter(xs, ys, color=color, alpha=0.3, marker=marker, label=f'Cluster {cluster}')
        if len(xs) > 2:
            outline = ConvexHull(np.column_stack((xs, ys))).vertices
            plt.fill(xs[outline], ys[outline], color=color, alpha=0.1)
        plt.scatter(xs.mean(), ys.mean(), marker="x", color=color, s=100)

    plt.title(f"Clusters for {category} with {n_clusters} clusters")
    plt.legend()
    plt.show()

    # Print the number of data points in each cluster
    print("Number of data points in each cluster:")
    print(papers['Cluster'].value_counts())

    # Print the silhouette score
    score = silhouette_score(features, papers['Cluster'])
    print(f"Silhouette Score for {n_clusters} clusters: {score:.4f}")


# Use ipywidgets to create an interactive visualization
categories = df['category'].unique()
widgets.interactive(
    visualize_clusters_by_category,
    category=widgets.Dropdown(options=categories),
    n_clusters=widgets.IntSlider(min=2, max=16, step=1, value=4),
)


interactive(children=(Dropdown(description='category', options=('Human-Robot Interaction', 'Vision-Language Ro…

In [147]:
import openai

# Define the number of clusters (sub-topics) for each category.
# The keys must match the theme names stored in df['category'], because
# determine_subcategories selects rows by comparing against them.
# NOTE(review): the counts were presumably picked by inspecting the interactive
# per-category plot above — confirm.
category_cluster_map = {
    "Human-Robot Interaction": 3,
    "Robot Manipulation": 6,
    "Robot Task and Motion Planning": 4,
    "Vision-Language Robotics": 2
}

# Function to determine subcategories within each main category
def determine_subcategories(category, n_clusters):
    """Split one category into sub-clusters and ask the model to name each.

    Args:
        category: Value of df['category'] selecting the papers to process.
        n_clusters: Number of KMeans sub-clusters to form within the category.

    Side effects:
        Prints, for every sub-cluster, the theme returned by the completion
        model followed by the titles of the papers whose abstracts were sent.
        Writes the sub-cluster labels to df['Subcluster'] for the rows of
        this category.
    """
    # Filter the data for the selected category
    category_data = df[df['category'] == category].copy()
    category_matrix = np.vstack(category_data.embedding.values)

    # Perform clustering
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    category_data['Subcluster'] = kmeans.fit_predict(category_matrix)

    # Determine the theme for each subcluster
    abstracts_per_cluster = 5
    for i in range(n_clusters):
        print(f"Subcluster {i} Theme:", end=" ")

        # Keep only papers that have an abstract, then sample whole rows once.
        # Sampling the rows (instead of abstracts and titles separately from
        # different populations) guarantees that the titles printed below
        # belong to the abstracts used in the prompt.
        valid_abstracts = category_data[(category_data.Subcluster == i) & (category_data.abstract.notna())]
        sampled_rows = valid_abstracts.sample(
            min(abstracts_per_cluster, len(valid_abstracts)), random_state=42
        )

        abstracts = "\n".join(sampled_rows.abstract.values)

        # NOTE(review): this uses the Completion endpoint with
        # text-davinci-003; confirm both are still available before re-running.
        response = openai.Completion.create(
            engine="text-davinci-003",
            prompt=f'What\'s a single common subtopic of {category} that encompasses these researches?\n\nAbstracts:\n"""\n{abstracts}\n"""\n\nSubTopic:',
            temperature=0,
            max_tokens=64,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
        )
        print(response["choices"][0]["text"].replace("\n", ""))

        for title in sampled_rows.title.values:
            print(title, end='\n\n')

        print("-" * 100)

    # Update the original df with the Subcluster values for the current category
    df.loc[category_data.index, 'Subcluster'] = category_data['Subcluster']

# Iterate over each main category and determine its subcategories.
# `category` and `n_clusters` are loop variables at notebook scope, so they
# remain defined (holding the last entry of the map) after this cell has run.
for category, n_clusters in category_cluster_map.items():
    print(f"Category: {category}")
    determine_subcategories(category, n_clusters)


Category: Human-Robot Interaction
Subcluster 0 Theme:  Foundation Models for Decision Making
Grounding Large Language Models in Interactive Environments with Online Reinforcement Learning

Do Embodied Agents Dream of Pixelated Sheep?: Embodied Decision Making using Language Guided World Modelling

Towards A Unified Agent with Foundation Models

Keep CALM and Explore: Language Models for Action Generation in Text-based Games

Foundation Models for Decision Making: Problems, Methods, and Opportunities

----------------------------------------------------------------------------------------------------
Subcluster 1 Theme:  Human-Robot Interaction with Large Language Models (LLMs)
Describe, Explain, Plan and Select: Interactive Planning with Large Language Models Enables Open-World Multi-Task Agents

FILM: Following Instructions in Language with Modular Methods

LLM as A Robotic Brain: Unifying Egocentric Memory and Control

Planning with Large Language Models via Corrective Re-prompting



In [148]:
import pandas as pd

# Sub-topic names for every main category, keyed by the Subcluster number
subcategory_mapping = {
    'Human-Robot Interaction': {
        0: 'Foundation Models for Decision Making',
        1: 'LLM-based Interaction',
        2: 'Natural Language Interfaces',
    },
    'Robot Manipulation': {
        0: 'Generalization',
        1: 'Data Augmentation',
        2: 'Language Conditioned Robot Manipulation',
        3: 'Multi-Task Learning',
        4: 'Model-Free Learning',
        5: 'Real-Robot Manipulation',
    },
    'Robot Task and Motion Planning': {
        0: 'Simulation Environments',
        1: 'Bilevel Planning with Abstractions',
        2: 'Robot Motion Planning',
        3: 'Dexterous Robotic Manipulation',
    },
    'Vision-Language Robotics': {
        0: 'Object Manipulation and Navigation',
        1: 'Embodied Language Models',
    },
}


def _subcategory_for(row):
    """Look up the sub-topic name for one paper row via its category and Subcluster."""
    names_for_category = subcategory_mapping[row['category']]
    return names_for_category[row['Subcluster']]


# Label every paper with the sub-topic of its (category, Subcluster) pair
df['subcategory'] = df.apply(_subcategory_for, axis=1)

# Keep only the export columns, with category and subcategory first
df = df[[
    'category', 'subcategory',
    'paperId', 'title', 'author', 'venue', 'year', 'citationCount', 'url',
    'abstract', 'embedding',
]]

# Write the categorized papers to disk without the index column
df.to_csv('paper_categorized.csv', index=False)


## Semantic Scholar Embedding + 계층적 클러스터링

In [91]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

def visualize_hierarchical_clusters(n_clusters):
    """Cluster `matrix` with agglomerative clustering and plot the result in 2D.

    The labels are written to df['Cluster']. A t-SNE projection of `matrix`
    is drawn with one marker/color per non-empty cluster, a convex hull for
    clusters with more than two points and an "x" at each cluster's mean
    position. Cluster sizes and the silhouette score are printed afterwards.
    """
    df['Cluster'] = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(matrix)

    # 2D data points
    projector = TSNE(n_components=2, perplexity=15, random_state=42, init="random", learning_rate=200)
    projection = projector.fit_transform(matrix)
    all_x = np.array([point[0] for point in projection])
    all_y = np.array([point[1] for point in projection])

    plt.figure(figsize=(10, 8))
    for cluster_id, (marker, color) in enumerate(zip(markers, colors)):
        members = df.Cluster == cluster_id
        xs = all_x[members]
        ys = all_y[members]

        # Only draw and average clusters that actually contain data points
        if len(xs) == 0:
            continue

        plt.scatter(xs, ys, color=color, alpha=0.3, marker=marker, label=f'Cluster {cluster_id}')
        if len(xs) > 2:
            outline = ConvexHull(np.column_stack((xs, ys))).vertices
            plt.fill(xs[outline], ys[outline], color=color, alpha=0.1)
        plt.scatter(xs.mean(), ys.mean(), marker="x", color=color, s=100)

    plt.title(f"Clusters identified with {n_clusters} clusters (Hierarchical)")
    plt.legend()
    plt.show()

    # Number of data points in each cluster
    print("Number of data points in each cluster:")
    print(df['Cluster'].value_counts())

    # Silhouette score
    score = silhouette_score(matrix, df['Cluster'])
    print(f"Silhouette Score for {n_clusters} clusters: {score:.4f}")


widgets.interactive(
    visualize_hierarchical_clusters,
    n_clusters=widgets.IntSlider(min=2, max=16, step=1, value=4),
)


interactive(children=(IntSlider(value=4, description='n_clusters', max=16, min=2), Output()), _dom_classes=('w…

## Semantic Scholar Embedding + DBSCAN

In [104]:
from sklearn.cluster import DBSCAN

def visualize_dbscan(eps=0.5, min_samples=5):
    """Cluster `matrix` with DBSCAN and plot the result on a t-SNE projection.

    Args:
        eps: Neighborhood radius passed to DBSCAN.
        min_samples: Minimum neighborhood size passed to DBSCAN.

    Side effects:
        Writes the labels to df['Cluster'] (-1 marks noise), shows a scatter
        plot and prints the number of points per label. Real clusters get one
        rainbow color each and an "x" at their mean position; noise points are
        drawn in gray without a mean marker, since they do not form a group.
    """
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    df['Cluster'] = dbscan.fit_predict(matrix)

    tsne = TSNE(n_components=2, perplexity=15, random_state=42, init="random", learning_rate=200)
    vis_dims2 = tsne.fit_transform(matrix)

    # 2D data points
    x = vis_dims2[:, 0]
    y = vis_dims2[:, 1]

    # Work on the label values: `-1 in series` would test the index of the
    # Series rather than its values and therefore never detect noise.
    labels = df['Cluster'].to_numpy()
    noise_mask = labels == -1  # -1 denotes noise
    cluster_ids = sorted(set(labels.tolist()) - {-1})
    n_clusters = len(cluster_ids)

    # One color per real cluster (named `palette` to avoid shadowing the
    # notebook-level `colors` list)
    palette = plt.cm.rainbow(np.linspace(0, 1, max(n_clusters, 1)))

    plt.figure(figsize=(10, 8))
    if noise_mask.any():
        plt.scatter(x[noise_mask], y[noise_mask], color="gray", alpha=0.3, label='Noise')
    for category, color in zip(cluster_ids, palette):
        members = labels == category
        xs = x[members]
        ys = y[members]

        plt.scatter(xs, ys, color=color, alpha=0.3, label=f'Cluster {category}')
        plt.scatter(xs.mean(), ys.mean(), marker="x", color=color, s=100)

    # Every point is either noise or a cluster member, so there is something
    # to put in the legend whenever there is data
    if len(labels) > 0:
        plt.legend()
    plt.title(f"Clusters identified with DBSCAN (eps={eps}, min_samples={min_samples})")
    plt.show()

    # Number of data points in each cluster
    print("Number of data points in each cluster:")
    print(df['Cluster'].value_counts())


widgets.interactive(
    visualize_dbscan,
    eps=widgets.FloatSlider(min=0.1, max=20, step=0.1, value=0.5),
    min_samples=widgets.IntSlider(min=2, max=20, step=1, value=5),
)


interactive(children=(FloatSlider(value=0.5, description='eps', max=20.0, min=0.1), IntSlider(value=5, descrip…