In [None]:
# Install required libraries
!pip install datasets transformers

# Import necessary libraries
from datasets import load_dataset
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import requests
from io import BytesIO

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [None]:
class ShareGPT4VLoader:
    """
    A class for loading and interacting with the ShareGPT4V dataset.

    The dataset is fetched once in ``__init__`` through ``load_dataset`` and kept
    on ``self.dataset`` (a mapping of split name -> rows). Every other method is a
    read-only helper over that mapping: inspecting structure, printing a sample,
    keyword search, and flattening conversations into a pandas DataFrame.

    Messages are accepted in two layouts: ``{"from": ..., "value": ...}`` and
    ``{"role": ..., "content": ...}``.
    """

    # Hub coordinates of the dataset (repository id and configuration name).
    _REPO_ID = "Lin-Chen/ShareGPT4V"
    _CONFIG_NAME = "ShareGPT4V"
    # Seconds to wait on the network when downloading an image from a URL, so a
    # stalled server cannot hang the notebook cell indefinitely.
    _HTTP_TIMEOUT = 30
    # Column order of the frame produced by convert_to_dataframe().
    _FRAME_COLUMNS = ["id", "human_messages", "assistant_messages", "has_image", "image_path"]

    def __init__(self, cache_dir=None):
        """
        Initialize the dataset loader

        Args:
            cache_dir (str, optional): Directory to cache the dataset. Defaults to None.
        """
        self.dataset = load_dataset(self._REPO_ID, self._CONFIG_NAME, cache_dir=cache_dir)
        self.current_split = "train"

    # ------------------------------------------------------------------ helpers

    def _has_split(self, split):
        """Return True if `split` exists; otherwise print the available splits and return False."""
        if split in self.dataset:
            return True
        print(f"Split '{split}' not found. Available splits: {list(self.dataset.keys())}")
        return False

    @staticmethod
    def _speaker_and_text(message):
        """Return (speaker, text) for a message in either supported layout, or None if unrecognised."""
        if "from" in message and "value" in message:
            return message["from"], message["value"]
        if "role" in message and "content" in message:
            return message["role"], message["content"]
        return None

    def _load_image(self, image_ref):
        """
        Turn the `image` field of a sample into something matplotlib can show.

        Returns the image object, or None (after printing the reason) when the
        image cannot be obtained. Network/decoding errors propagate to the caller.
        """
        if isinstance(image_ref, str):
            if image_ref.startswith("http"):
                response = requests.get(image_ref, timeout=self._HTTP_TIMEOUT)
                # Fail loudly on 4xx/5xx instead of handing an error page to PIL.
                response.raise_for_status()
                return Image.open(BytesIO(response.content))

            # Otherwise treat the string as a path relative to the dataset repository.
            try:
                from huggingface_hub import HfFileSystem
            except ImportError:
                print("Cannot access image directly. Install huggingface_hub for direct access.")
                return None

            fs = HfFileSystem()
            hub_path = f"datasets/{self._REPO_ID}/{image_ref}"
            if not fs.exists(hub_path):
                print(f"Image path exists but file not found in HF cache: {image_ref}")
                return None
            with fs.open(hub_path, "rb") as f:
                return Image.open(BytesIO(f.read()))

        # Already an image-like object (e.g. a PIL Image exposes .show()).
        if hasattr(image_ref, "show"):
            return image_ref

        print(f"Unknown image format: {type(image_ref)}")
        return None

    @staticmethod
    def _show_image(img):
        """Render `img` in a square, axis-free matplotlib figure."""
        plt.figure(figsize=(10, 10))
        plt.imshow(img)
        plt.axis('off')
        plt.title("Image in conversation")
        plt.show()

    # --------------------------------------------------------------- public API

    def get_info(self):
        """
        Get basic information about the dataset

        Returns:
            dict: "splits", "train_size" and "features"; plus
            "conversation_structure" (the keys of the first message of the first
            train sample, or "Empty") when that sample has a "conversations" field.
        """
        train = self.dataset["train"]
        info = {
            "splits": list(self.dataset.keys()),
            "train_size": len(train),
            "features": list(train.features.keys())
        }

        # Peek at the first sample to report the message layout; skip the peek
        # on an empty split instead of raising IndexError.
        if len(train) > 0:
            sample = train[0]
            if "conversations" in sample:
                conversations = sample["conversations"]
                first_message = conversations[0] if conversations else None
                info["conversation_structure"] = list(first_message.keys()) if first_message else "Empty"

        return info

    def display_sample(self, idx=0, split="train", max_text_len=100):
        """
        Display a sample from the dataset

        Prints the sample id, image path and each message (truncated to
        `max_text_len` characters), then tries to show the image. Image problems
        are reported with a printed message and never raise.

        Args:
            idx (int): Index of the sample to display
            split (str): Dataset split to use
            max_text_len (int): Maximum length of text to display

        Returns:
            dict: Sample data, or None if `split` does not exist
        """
        if not self._has_split(split):
            return None

        sample = self.dataset[split][idx]

        # Display basic sample info
        print(f"Sample ID: {sample.get('id', 'N/A')}")
        if "image" in sample:
            print(f"Image path: {sample['image']}")

        # Display conversation
        print("\nConversation:")
        if "conversations" in sample:
            # `or []` tolerates a present-but-None conversations field.
            for message in sample["conversations"] or []:
                parts = self._speaker_and_text(message)
                if parts is None:
                    print(f"Unknown conversation structure: {message.keys()}")
                    continue

                speaker, content = parts
                # A None text would break len()/slicing below; show it as empty.
                content = "" if content is None else str(content)
                if len(content) > max_text_len:
                    content = f"{content[:max_text_len]}..."
                print(f"{speaker}: {content}")
        else:
            print("No conversations found in this sample")

        # Display image if available
        image_ref = sample.get("image")
        if image_ref is not None:
            try:
                img = self._load_image(image_ref)
                if img is not None:
                    self._show_image(img)
            except Exception as e:
                # Best effort: a missing or corrupt image must not hide the text output.
                print(f"Failed to display image: {e}")

        return sample

    def get_sample(self, idx=0, split="train"):
        """
        Get a sample without displaying it

        Args:
            idx (int): Index of the sample to get
            split (str): Dataset split to use

        Returns:
            dict: Sample data, or None if `split` does not exist
        """
        if not self._has_split(split):
            return None

        return self.dataset[split][idx]

    def search_by_keyword(self, keyword, split="train", max_results=5):
        """
        Search for samples containing a specific keyword

        The match is a case-insensitive substring test against every message of a
        sample; the scan stops as soon as `max_results` samples have matched.

        Args:
            keyword (str): Keyword to search for
            split (str): Dataset split to search in
            max_results (int): Maximum number of results to return

        Returns:
            list: List of sample indices containing the keyword (empty if the
            split does not exist or `max_results` is not positive)
        """
        if not self._has_split(split):
            return []
        if max_results <= 0:
            # Without this guard the loop below would still collect one hit.
            return []

        needle = keyword.lower()  # hoisted: invariant across the whole scan
        results = []
        for i, sample in enumerate(self.dataset[split]):
            if "conversations" not in sample:
                continue

            for conv in sample["conversations"] or []:
                # Check in both possible structures
                content = conv["value"] if "value" in conv else conv.get("content")
                if content and needle in content.lower():
                    results.append(i)
                    break

            if len(results) >= max_results:
                break

        return results

    def convert_to_dataframe(self, split="train", max_samples=None):
        """
        Convert dataset to pandas DataFrame

        Each row holds the sample id, the list of human messages, the list of
        assistant messages, whether an image is attached, and the image path.
        Samples without a "conversations" field are skipped.

        Args:
            split (str): Dataset split to convert
            max_samples (int, optional): Maximum number of samples to include.
                None means the whole split; 0 yields an empty frame.

        Returns:
            pandas.DataFrame: DataFrame containing the dataset, or None if
            `split` does not exist
        """
        if not self._has_split(split):
            return None

        rows = self.dataset[split]
        total = len(rows)
        # Compare against None explicitly so that max_samples=0 means "no rows"
        # rather than silently falling back to the full split.
        limit = total if max_samples is None else max(0, min(total, max_samples))

        data = []
        for i in range(limit):
            sample = rows[i]

            if "conversations" not in sample:
                continue

            human_messages = []
            assistant_messages = []

            for conv in sample["conversations"] or []:
                # Handle different conversation structures
                if "from" in conv and "value" in conv:
                    if conv["from"] == "human":
                        human_messages.append(conv["value"])
                    elif conv["from"] == "gpt":
                        assistant_messages.append(conv["value"])
                elif "role" in conv and "content" in conv:
                    if conv["role"] in ("human", "user"):
                        human_messages.append(conv["content"])
                    elif conv["role"] in ("assistant", "gpt"):
                        assistant_messages.append(conv["content"])

            image_path = sample.get("image", None)
            data.append({
                "id": sample.get("id", str(i)),
                "human_messages": human_messages,
                "assistant_messages": assistant_messages,
                "has_image": image_path is not None,
                "image_path": image_path
            })

        # Passing the columns keeps the schema stable even when `data` is empty.
        return pd.DataFrame(data, columns=self._FRAME_COLUMNS)

    def examine_data_structure(self, num_samples=5, split="train"):
        """
        Examine the structure of the dataset to understand its format

        Prints the feature names, the first two messages of the first sample, how
        often each message key layout occurs among the examined samples, and up to
        three example image paths.

        Args:
            num_samples (int): Number of samples to examine
            split (str): Dataset split to examine

        Returns:
            dict: "features", "conversation_structures", "structure_counts" and
            "image_path_examples"; None if `split` does not exist
        """
        if not self._has_split(split):
            return None

        rows = self.dataset[split]

        # Check available features
        features = list(rows.features.keys())
        print(f"Available features: {features}")

        # One pass collects both the message layouts and the image paths, so
        # each sample is fetched from the dataset only once.
        conversation_structures = []
        image_path_examples = []
        for i in range(min(num_samples, len(rows))):
            sample = rows[i]

            conversations = sample.get("conversations")
            if conversations:
                # Get keys in conversation objects
                conversation_structures.append(list(conversations[0].keys()))

                # Show an example conversation (first two messages only)
                if i == 0:
                    print("\nExample conversation structure:")
                    for j, conv in enumerate(conversations[:2]):
                        print(f"  Message {j+1}: {conv}")

            if sample.get("image"):
                image_path_examples.append(sample["image"])

        # Count different structures
        structure_counts = {}
        for struct in conversation_structures:
            struct_key = ", ".join(struct)
            structure_counts[struct_key] = structure_counts.get(struct_key, 0) + 1

        print("\nConversation structure frequencies:")
        for struct, count in structure_counts.items():
            print(f"  Structure [{struct}]: {count} samples")

        # Show at most three image paths, with an ellipsis if more were collected
        print("\nImage path examples:")
        for path in image_path_examples[:3]:
            print(f"  {path}")
        if len(image_path_examples) > 3:
            print("  ...")

        return {
            "features": features,
            "conversation_structures": conversation_structures,
            "structure_counts": structure_counts,
            "image_path_examples": image_path_examples
        }


In [None]:
# Build the loader with the default Hugging Face cache location; the
# constructor fetches the dataset.
loader = ShareGPT4VLoader(cache_dir=None)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/2.22k [00:00<?, ?B/s]

(…)egpt4v_instruct_gpt4-vision_cap100k.json:   0%|          | 0.00/134M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/102025 [00:00<?, ? examples/s]

In [None]:
# Summarise the dataset: split names, size of the train split, feature names.
info = loader.get_info()
print("Dataset Information:")
for label, key in (
    ("Available splits", "splits"),
    ("Train size", "train_size"),
    ("Features", "features"),
):
    print(f"{label}: {info[key]}")


Dataset Information:
Available splits: ['train']
Train size: 102025
Features: ['id', 'image', 'conversations']


In [None]:
# Print the first train sample; the returned dict is the cell's last expression
# and is therefore echoed below the printed text.
print("\nDisplaying sample 0:")
loader.display_sample(idx=0)


Displaying sample 0:
Sample ID: 000000000009
Image path: coco/train2017/000000000009.jpg

Conversation:
human: What do you see happening in this image?
<image>
gpt: In the center of the image, a vibrant blue lunch tray holds four containers, each brimming with a va...
Image path exists but file not found in HF cache: coco/train2017/000000000009.jpg


{'id': '000000000009',
 'image': 'coco/train2017/000000000009.jpg',
 'conversations': [{'from': 'human',
   'value': 'What do you see happening in this image?\n<image>'},
  {'from': 'gpt',
   'value': "In the center of the image, a vibrant blue lunch tray holds four containers, each brimming with a variety of food items. The containers, two in pink and two in yellow, are arranged in a 2x2 grid.\n\nIn the top left pink container, a slice of bread rests, lightly spread with butter and sprinkled with a handful of almonds. The bread is cut into a rectangle, and the almonds are scattered across its buttery surface.\n\nAdjacent to it in the top right corner, another pink container houses a mix of fruit. Sliced apples with their fresh white interiors exposed share the space with juicy chunks of pineapple. The colors of the apple slices and pineapple chunks contrast beautifully against the pink container.\n\nBelow these, in the bottom left corner of the tray, a yellow container holds a single 

In [None]:
# Case-insensitive keyword search over the train split (default cap: 5 hits).
print("\nSearching for samples with 'image':")
image_samples = loader.search_by_keyword(keyword="image")
print(f"Found {len(image_samples)} samples")


Searching for samples with 'image':
Found 5 samples


In [None]:
# Display the first search result
if image_samples:
    print("\nDisplaying first search result:")
    loader.display_sample(image_samples[0])

# Convert the whole train split to a DataFrame. No max_samples cap is applied
# because the clustering cells further down operate on every row; the message
# below states that (it previously claimed only the first 10 samples).
print("\nConverting to DataFrame (all samples):")
df = loader.convert_to_dataframe()
print(df.head())


Displaying first search result:
Sample ID: 000000000009
Image path: coco/train2017/000000000009.jpg

Conversation:
human: What do you see happening in this image?
<image>
gpt: In the center of the image, a vibrant blue lunch tray holds four containers, each brimming with a va...
Image path exists but file not found in HF cache: coco/train2017/000000000009.jpg

Converting to DataFrame (first 10 samples):
             id                                     human_messages  \
0  000000000009  [What do you see happening in this image?\n<im...   
1  000000000025  [Explain the visual content of the image in gr...   
2  000000000030  [What do you see happening in this image?\n<im...   
3  000000000034  [<image>\nExplain the visual content of the im...   
4  000000000036  [Can you elaborate on the elements of the pict...   

                                  assistant_messages  has_image  \
0  [In the center of the image, a vibrant blue lu...       True   
1  [This image captures a serene mome

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import requests
import io

class ClusterSampler:
    """
    Class for clustering dataset samples and sampling from the clusters

    Typical flow: provide embeddings (`embed_texts` or `set_embeddings`), run
    `cluster`, optionally `evaluate_clustering`, then `sample_from_clusters` and
    `export_to_parquet` / `upload_to_catbox`.

    The DataFrame passed to the constructor is never modified: `cluster` stores
    the labels on a new frame held in `self.df`.
    """

    def __init__(self, df, embeddings=None):
        """
        Initialize with a DataFrame, optionally with precomputed embeddings

        Args:
            df (pd.DataFrame): DataFrame containing conversation samples
            embeddings (np.ndarray or None): Precomputed embeddings (optional)
        """
        self.df = df
        self.embeddings = embeddings
        self.labels = None   # cluster label per row, set by cluster()
        self.kmeans = None   # fitted KMeans estimator, set by cluster()

    def set_embeddings(self, embeddings):
        """
        Set custom embeddings manually

        Args:
            embeddings (np.ndarray): Embedding array
        """
        self.embeddings = embeddings
        print(f"Set embeddings with shape: {self.embeddings.shape}")

    def embed_texts(self, field="human_messages", encoder=None):
        """
        Generate embeddings for the text data

        List-valued cells are joined with single spaces; anything else is passed
        through `str()`.

        Args:
            field (str): Field name to extract texts
            encoder (callable): Function to encode a list of texts into embeddings

        Raises:
            ValueError: If no encoder is given.
        """
        if encoder is None:
            raise ValueError("You must provide an encoder function to embed the texts.")

        texts = self.df[field].apply(lambda x: " ".join(x) if isinstance(x, list) else str(x)).tolist()
        self.embeddings = encoder(texts)
        print(f"Generated embeddings for {len(texts)} samples with shape: {self.embeddings.shape}")

    def cluster(self, n_clusters=5, random_state=42):
        """
        Perform clustering using KMeans

        Args:
            n_clusters (int): Number of clusters
            random_state (int): Random state for reproducibility

        Raises:
            ValueError: If embeddings are missing or their row count differs from
                the DataFrame's.
        """
        if self.embeddings is None:
            raise ValueError("Embeddings not found. Please run .embed_texts() or .set_embeddings() first.")
        if len(self.embeddings) != len(self.df):
            raise ValueError(
                f"Embeddings have {len(self.embeddings)} rows but the DataFrame has {len(self.df)}."
            )

        self.kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
        self.labels = self.kmeans.fit_predict(self.embeddings)
        # .assign() builds a new frame, so the caller's DataFrame is left
        # untouched and pandas raises no SettingWithCopyWarning when the input
        # was a filtered slice of another frame.
        self.df = self.df.assign(cluster=self.labels)
        print(f"Assigned cluster labels to samples")

    def evaluate_clustering(self, sample_size=None, random_state=42):
        """
        Evaluate clustering using silhouette score

        Args:
            sample_size (int, optional): If given, the score is computed on a
                random subset of this many samples, which is much cheaper on
                large inputs. None (default) uses every sample.
            random_state (int): Seed for the subsampling; unused when
                `sample_size` is None.

        Returns:
            float: Silhouette score

        Raises:
            ValueError: If `cluster()` has not been run yet.
        """
        if self.labels is None or self.embeddings is None:
            raise ValueError("You must run .cluster() first before evaluating.")

        score = silhouette_score(
            self.embeddings, self.labels, sample_size=sample_size, random_state=random_state
        )
        print(f"Silhouette Score: {score:.4f}")
        return score

    def sample_from_clusters(self, samples_per_cluster=10, random_state=42):
        """
        Sample a specified number of items from each cluster

        Clusters smaller than `samples_per_cluster` contribute all their rows.

        Args:
            samples_per_cluster (int): Number of samples to pick per cluster
            random_state (int): Seed used for the per-cluster sampling

        Returns:
            pd.DataFrame: Sampled DataFrame with a fresh 0..n-1 index

        Raises:
            ValueError: If `cluster()` has not been run yet.
        """
        if self.labels is None:
            raise ValueError("You must run .cluster() first before sampling.")

        sampled_frames = []
        for cluster_id in np.unique(self.labels):
            cluster_df = self.df[self.df['cluster'] == cluster_id]
            take = min(samples_per_cluster, len(cluster_df))
            sampled = cluster_df.sample(n=take, random_state=random_state)
            sampled_frames.append(sampled)
            print(f"Sampled {len(sampled)} items from cluster {cluster_id}")
        return pd.concat(sampled_frames).reset_index(drop=True)

    def export_to_parquet(self, sampled_df, file_path="sampled_data.parquet"):
        """
        Save sampled data to Parquet file

        Args:
            sampled_df (pd.DataFrame): DataFrame to export
            file_path (str): Output file path

        Returns:
            str: The path that was written
        """
        table = pa.Table.from_pandas(sampled_df)
        pq.write_table(table, file_path)
        print(f"Exported sampled data to {file_path}")
        return file_path

    def upload_to_catbox(self, file_path, timeout=120):
        """
        Upload the parquet file to catbox.moe

        NOTE: this sends the file to a third-party public host; make sure the
        data is meant to be shared before calling it.

        Args:
            file_path (str): Path to the Parquet file
            timeout (float): Seconds to wait for the server before giving up, so
                a stalled connection cannot hang the notebook

        Returns:
            str or None: URL of the uploaded file, or None if the upload failed
        """
        try:
            with open(file_path, 'rb') as f:
                files = {'fileToUpload': (file_path, f)}
                response = requests.post(
                    "https://catbox.moe/user/api.php",
                    data={"reqtype": "fileupload"},
                    files=files,
                    timeout=timeout,
                )
        except requests.RequestException as exc:
            # Network-level failures follow the same "print and return None"
            # contract as a non-200 response.
            print(f"Upload failed: {exc}")
            return None

        if response.status_code == 200:
            url = response.text.strip()
            print(f"Upload success: {url}")
            return url
        else:
            print(f"Upload failed: {response.status_code}")
            return None


In [None]:
# Rich-display the converted DataFrame (pandas elides the middle rows).
df

Unnamed: 0,id,human_messages,assistant_messages,has_image,image_path
0,000000000009,[What do you see happening in this image?\n<im...,"[In the center of the image, a vibrant blue lu...",True,coco/train2017/000000000009.jpg
1,000000000025,[Explain the visual content of the image in gr...,[This image captures a serene moment in a zoo ...,True,coco/train2017/000000000025.jpg
2,000000000030,[What do you see happening in this image?\n<im...,"[The image presents a serene garden scene, cen...",True,coco/train2017/000000000030.jpg
3,000000000034,[<image>\nExplain the visual content of the im...,[This is a detailed description of the image:\...,True,coco/train2017/000000000034.jpg
4,000000000036,[Can you elaborate on the elements of the pict...,"[In the image, there is a woman standing in fr...",True,coco/train2017/000000000036.jpg
...,...,...,...,...,...
102020,White_Cliffs_of_Dover2,[Explain the visual content of the image in gr...,[The image presents a breathtaking view of the...,True,web-landmark/images/White_Cliffs_of_Dover2.jpg
102021,Ziggurat_of_Ur,[Analyze the image in a comprehensive and deta...,[The image captures the imposing structure of ...,True,web-landmark/images/Ziggurat_of_Ur.jpg
102022,Ziggurat_of_Ur2,[Can you elaborate on the elements of the pict...,"[The image captures the Ziggurat of Ur, a sign...",True,web-landmark/images/Ziggurat_of_Ur2.jpg
102023,Þingvellir_National_Park,[<image>\nWhat is this photo about'?],[This image captures the breathtaking view of ...,True,web-landmark/images/Þingvellir_National_Park.jpg


In [None]:
# Preview only the rows whose image path mentions "coco" (case-insensitive).
is_coco = df["image_path"].str.contains('coco', case=False)
df[is_coco]

Unnamed: 0,id,human_messages,assistant_messages,has_image,image_path
0,000000000009,[What do you see happening in this image?\n<im...,"[In the center of the image, a vibrant blue lu...",True,coco/train2017/000000000009.jpg
1,000000000025,[Explain the visual content of the image in gr...,[This image captures a serene moment in a zoo ...,True,coco/train2017/000000000025.jpg
2,000000000030,[What do you see happening in this image?\n<im...,"[The image presents a serene garden scene, cen...",True,coco/train2017/000000000030.jpg
3,000000000034,[<image>\nExplain the visual content of the im...,[This is a detailed description of the image:\...,True,coco/train2017/000000000034.jpg
4,000000000036,[Can you elaborate on the elements of the pict...,"[In the image, there is a woman standing in fr...",True,coco/train2017/000000000036.jpg
...,...,...,...,...,...
50022,000000248910,[Describe the following image.\n<image>],"[In this black and white photo, two individual...",True,coco/train2017/000000248910.jpg
50023,000000248911,[<image>\nDescribe the following image.],[In the heart of a medieval jousting tournamen...,True,coco/train2017/000000248911.jpg
50024,000000248912,[Explain the visual content of the image in gr...,"[In the center of the image, a delicious sandw...",True,coco/train2017/000000248912.jpg
50025,000000248918,[What is this photo about'?\n<image>],[The image captures a serene and cozy bedroom ...,True,coco/train2017/000000248918.jpg


In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')


# Initialisation: work on an independent copy of the COCO rows. Passing the
# filtered slice directly makes pandas emit SettingWithCopyWarning when the
# sampler adds its 'cluster' column. na=False keeps rows with a missing image
# path out of the mask instead of breaking the boolean indexing.
coco_df = df[df["image_path"].str.contains('coco', case=False, na=False)].copy()
sampler = ClusterSampler(coco_df)

# Generate embeddings from the human side of each conversation
sampler.embed_texts(field="human_messages", encoder=model.encode)

# Clustering
sampler.cluster(n_clusters=10)

# Evaluation
sampler.evaluate_clustering()

# Sampling: up to 1000 rows from each of the 10 clusters
sampled_df = sampler.sample_from_clusters(samples_per_cluster=1000)

# Export, then upload. NOTE: the upload publishes the file on a public host.
sampler.export_to_parquet(sampled_df, "sharegptv4_10k.parquet")
sampler.upload_to_catbox("sharegptv4_10k.parquet")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generated embeddings for 50027 samples with shape: (50027, 384)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['cluster'] = self.labels


Assigned cluster labels to samples
Silhouette Score: 0.7246
Sampled 1000 items from cluster 0
Sampled 1000 items from cluster 1
Sampled 1000 items from cluster 2
Sampled 1000 items from cluster 3
Sampled 1000 items from cluster 4
Sampled 1000 items from cluster 5
Sampled 1000 items from cluster 6
Sampled 1000 items from cluster 7
Sampled 1000 items from cluster 8
Sampled 1000 items from cluster 9
Exported sampled data to sharegptv4_10k.parquet
Upload success: https://files.catbox.moe/yo19wi.parquet


'https://files.catbox.moe/yo19wi.parquet'