Description:
- This notebook shows some exploratory data analysis about the datasets used for the rumex1 paper.
- The EDA3-Preparation.ipynb notebook should be run first, because it produces the image patches needed to display sample pictures on the embedding plots.
- Output:
    - CSV file: the embedding file of the dataset from the lightly application (lightly api)
    - Figure: the embeddings laid out in a 2D PCA projection, with visualizations of some sample pictures.

Where to run:
- Runs anywhere

In [None]:
import json
from sklearn.decomposition import PCA
import pandas as pd
import os
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
import lightly.utils.io as io
from lightly.api import ApiWorkflowClient


In [None]:
# Parse the local keys file into `data`.
# NOTE(review): `data` is not referenced by any other cell visible in this notebook.
with open("../keys.json", mode="r") as keys_file:
    data = json.loads(keys_file.read())

In [None]:
# Lightly API configuration.
# The API token is a credential and must not be stored in the notebook: it is
# read from the LIGHTLY_TOKEN environment variable instead. Any token that was
# previously committed in this cell should be considered leaked and be revoked.
lightly_key = os.environ.get("LIGHTLY_TOKEN")
if not lightly_key:
    print("Warning: LIGHTLY_TOKEN is not set; the Lightly client cannot authenticate.")

dataset_id = "64a879435841bac2dfd98fbc"
embedding_name = "default_20230708_06h00m22s"

# Network diagnostics. `.get` is used so that an unset variable is reported
# instead of aborting the cell with a KeyError.
# If certificate verification fails, LIGHTLY_CA_CERTS can be pointed at a CA
# bundle, e.g. os.environ['LIGHTLY_CA_CERTS'] = '/etc/ssl/certs/ca-certificates.crt'
print(os.environ.get('LIGHTLY_CA_CERTS', '<LIGHTLY_CA_CERTS not set>'))
print(os.environ.get('HTTPS_PROXY', '<HTTPS_PROXY not set>'))


In [None]:

# Create the Lightly API client for the configured dataset, then download the
# dataset's embeddings as a CSV file into the assets folder.
client = ApiWorkflowClient(dataset_id=dataset_id, token=lightly_key)
print('Client configured')
client.download_embeddings_csv(
    output_path="../assets/embedding_lightly.csv",
)

In [None]:
# Read the embeddings CSV written by the download cell and display it.
df = pd.read_csv(filepath_or_buffer="../assets/embedding_lightly.csv")
df

In [None]:
def compute_2d_pca_from_embeddings(df, embedding_prefix='embedding_', label_col='labels', random_state=None):
    """Project the embedding columns of ``df`` onto two principal components.

    Args:
        df: DataFrame with a ``'filenames'`` column and one column per
            embedding dimension, each named with ``embedding_prefix``.
        embedding_prefix: Prefix that identifies the embedding columns.
        label_col: Name of an optional label column; it is copied to the
            result when it exists in ``df``.
        random_state: Optional seed forwarded to ``PCA``. It only matters for
            PCA solvers that use randomness; the default ``None`` keeps
            scikit-learn's default behaviour.

    Returns:
        DataFrame with the columns ``'filename'``, ``'x'`` (first component),
        ``'y'`` (second component) and, if present in ``df``, ``label_col``.

    Raises:
        KeyError: If ``df`` has no ``'filenames'`` column.
        ValueError: If fewer than two columns start with ``embedding_prefix``
            (a 2-component PCA needs at least two input dimensions).
    """
    # Validate the input up front so that problems surface with a clear
    # message before the (potentially expensive) PCA fit.
    if 'filenames' not in df.columns:
        raise KeyError("'filenames' column not found in the embeddings DataFrame")

    # str() guards against non-string column labels (e.g. integer headers).
    embedding_cols = [col for col in df.columns if str(col).startswith(embedding_prefix)]
    if len(embedding_cols) < 2:
        raise ValueError(
            f"Expected at least 2 columns starting with {embedding_prefix!r}, "
            f"found {len(embedding_cols)}"
        )

    # Extract the embedding matrix and reduce it to two dimensions
    embeddings = df[embedding_cols].to_numpy()
    pca = PCA(n_components=2, random_state=random_state)
    pca_result = pca.fit_transform(embeddings)

    # Build output DataFrame
    output_df = pd.DataFrame({
        'filename': df['filenames'],
        'x': pca_result[:, 0],
        'y': pca_result[:, 1]
    })

    # Add labels if present
    if label_col in df.columns:
        output_df[label_col] = df[label_col]

    return output_df


In [15]:
# Reduce the loaded embeddings to a 2D PCA projection and display the result.
df_pca = compute_2d_pca_from_embeddings(
    df,
    embedding_prefix='embedding_',
    label_col='labels',
)
df_pca

Unnamed: 0,filename,x,y,labels
0,20230504_HoratRitaUnterviertrlNord_S_20_F_50_O...,-0.300175,0.160513,0
1,20230504_HoratRitaUnterviertrlNord_S_20_F_50_O...,-0.346518,-0.180672,0
2,20230504_HoratRitaUnterviertrlNord_S_20_F_50_O...,-0.301432,-0.315442,0
3,20230421_Kuster_S_20_F_50_O_sama_ID1/20230421_...,0.289641,0.081476,0
4,20230426_SchneiderWallenwil_S_20_F_50_O_krma_I...,-0.192226,-0.443259,0
...,...,...,...,...
49995,20230614_SchneiderWallenwilZisterwis_S_20_F_60...,0.382468,-0.134772,0
49996,20230426_SchneiderWallenwil_S_20_F_50_O_krma_I...,-0.072733,0.119468,0
49997,20230620_HerrenpuentSuedwestStreifen_S_25_F_70...,-0.045495,0.009691,0
49998,20230609_HerrenpuentSuedwest_S_20_F_60_H_12_O_...,-0.084904,-0.082914,0


In [None]:
def plot_2d_pca_embeddings(df_pca, label_col='labels', figsize=(10, 8), save_path=None, title='2D PCA Projection'):
    """
    Plots 2D PCA embeddings with optional color coding by label.

    The first component (column 'x') is drawn on the horizontal axis and the
    second component (column 'y') on the vertical axis, in both the labelled
    and the unlabelled case.

    Args:
        df_pca: DataFrame with columns ['x', 'y', 'filename'] and optionally a label column.
        label_col: Name of the label column to color by.
        figsize: Size of the plot.
        save_path: If provided, will save the figure to this path (missing
            parent directories are created); otherwise the figure is shown.
        title: Title of the plot.
    """
    # The seaborn theme is applied through matplotlib's rcParams, so it has to
    # be set before the figure and axes are created to take effect on them.
    sns.set(style="whitegrid", font_scale=1.2)
    fig, ax = plt.subplots(figsize=figsize)

    if label_col in df_pca.columns:
        unique_labels = sorted(df_pca[label_col].unique())
        palette = sns.color_palette("hsv", len(unique_labels))
        sns.scatterplot(
            data=df_pca,
            x='x', y='y',  # must match the 'PCA 1' / 'PCA 2' axis labels below
            hue=label_col,
            palette=palette,
            s=40,
            alpha=0.8,
            ax=ax
        )
        ax.legend(title=label_col, bbox_to_anchor=(1.05, 1), loc='upper left')
    else:
        ax.scatter(df_pca['x'], df_pca['y'], s=30, alpha=0.8)

    ax.set_xlabel('PCA 1')
    ax.set_ylabel('PCA 2')
    ax.set_title(title)
    fig.tight_layout()

    if save_path:
        # Create the target directory for path-like destinations so that
        # savefig does not fail on a fresh checkout.
        if isinstance(save_path, (str, os.PathLike)):
            out_dir = os.path.dirname(save_path)
            if out_dir:
                os.makedirs(out_dir, exist_ok=True)
        fig.savefig(save_path, dpi=300)
        print(f"Plot saved to: {save_path}")
    else:
        plt.show()


In [None]:
# Draw the labelled PCA scatter plot and write it to the figures folder.
plot_2d_pca_embeddings(
    df_pca,
    label_col='labels',
    save_path='./figures/pca_plot.png',
)

In [13]:
def plot_pca_with_selection_and_images(
    df_pca,
    selected_paths_txt,
    image_dir,
    output_path='pca_bw_overlay.png',
    random_state=None
):
    """Plot the PCA projection, highlight selected samples and overlay example images.

    Points are split into selected / non-selected by comparing the base name of
    each ``'filename'`` with the base names listed in ``selected_paths_txt``.
    For each quadrant around the median of the projection, one randomly chosen
    selected sample is overlaid with its image loaded from ``image_dir``.

    Args:
        df_pca: DataFrame with the columns ``'filename'``, ``'x'`` and ``'y'``.
            It is not modified; the function works on a copy.
        selected_paths_txt: Text file with one selected image path per line.
        image_dir: Directory in which the overlay images are looked up by
            base name.
        output_path: Destination of the saved figure.
        random_state: Optional seed for picking the overlaid sample of each
            quadrant. ``None`` (default) keeps the pick random.
    """
    # Work on a private copy so neither the argument nor any notebook-level
    # DataFrame is mutated by the helper column added below.
    df = df_pca.copy()

    # Load selected image paths and reduce them to base names (blank lines skipped)
    with open(selected_paths_txt, 'r') as f:
        selected_filenames = {
            os.path.basename(line.strip()) for line in f if line.strip()
        }

    # Add boolean column for selected images
    df['is_selected'] = df['filename'].apply(
        lambda name: os.path.basename(name) in selected_filenames
    )

    # Split data
    df_selected = df[df['is_selected']]
    df_non_selected = df[~df['is_selected']]

    # Apply the theme before creating the figure; it has no effect on axes
    # that already exist.
    sns.set(style="white", font_scale=1.2)
    fig, ax = plt.subplots(figsize=(12, 10))

    # Plot non-selected points (drawn in yellow, '#FFFF00')
    ax.scatter(
        df_non_selected['x'],
        df_non_selected['y'],
        c='#FFFF00',
        s=1,
        label='Non-Selected',
        alpha=0.9
    )

    # Plot selected in near-black
    ax.scatter(
        df_selected['x'],
        df_selected['y'],
        c='#1f1f1f',
        s=1,
        label='Selected',
        alpha=1.0
    )

    # Define PCA space midpoints (median over all points, not only selected ones)
    x_mid = df['x'].median()
    y_mid = df['y'].median()

    # Define quadrants using only selected points
    quadrants = {
        'top_left': df_selected[(df_selected['x'] < x_mid) & (df_selected['y'] > y_mid)],
        'top_right': df_selected[(df_selected['x'] > x_mid) & (df_selected['y'] > y_mid)],
        'bottom_left': df_selected[(df_selected['x'] < x_mid) & (df_selected['y'] < y_mid)],
        'bottom_right': df_selected[(df_selected['x'] > x_mid) & (df_selected['y'] < y_mid)],
    }

    # Overlay one image per quadrant
    for qname, qdata in quadrants.items():
        if qdata.empty:
            continue
        row = qdata.sample(1, random_state=random_state).iloc[0]
        img_name = os.path.basename(row['filename'])
        img_path = os.path.join(image_dir, img_name)
        if not os.path.exists(img_path):
            # Report the skip instead of silently producing a plot without images.
            print(f"Image not found for quadrant {qname}: {img_path}")
            continue
        try:
            img = mpimg.imread(img_path)
            imbox = OffsetImage(img, zoom=0.2)
            ab = AnnotationBbox(imbox, (row['x'], row['y']), frameon=False)
            ax.add_artist(ab)
        except Exception as e:
            # Best effort: a single unreadable image must not abort the plot.
            print(f"Failed to load image {img_path}: {e}")

    ax.legend(loc='upper right', fontsize=12)
    ax.set_xlabel('PCA 1')
    ax.set_ylabel('PCA 2')
    ax.set_title('2D PCA Projection with Selected Samples', fontsize=14)
    ax.grid(False)
    ax.set_facecolor('white')
    sns.despine()

    fig.tight_layout()
    fig.savefig(output_path, dpi=300)
    plt.close(fig)
    print(f"Plot saved to: {output_path}")


In [14]:
# Directory in which the overlay images are looked up.
images_path = '../assets/digital-production/lightly/images'
# The projection must be passed as `df_pca`: the function has no `df`
# parameter, so `df=...` would raise a TypeError.
plot_pca_with_selection_and_images(
    df_pca=df_pca,
    selected_paths_txt="../assets/filenames-rumex_4by_4_50k-coreset-samples_20000-1688802987585.txt",
    image_dir=images_path
)

Plot saved to: pca_bw_overlay.png


In [36]:
import os
import random
import cv2
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
import seaborn as sns
from glob import glob

def plot_pca_with_native_random_crops(
    df_pca,
    selected_paths_txt,
    dataset_csv,
    im_dir,
    output_path='pca_native_crops.png',
    crop_height=678,
    crop_width=678,
    seed=None
):
    """Plot the PCA projection with callouts showing random native-resolution crops.

    Selected samples (base name listed in ``selected_paths_txt``) are drawn in
    near-black, all others in light gray. Up to ten datasets are picked at
    random among the selected samples; for each, one selected sample is
    chosen as the callout origin, and a random crop of a randomly chosen image
    of that dataset found in ``im_dir`` is drawn next to it. Note that the
    crop comes from an image of the same dataset, not necessarily from the
    sample the callout points at.

    Args:
        df_pca: DataFrame with the columns ``'filename'``, ``'x'`` and ``'y'``.
            It is not modified; the function works on a copy.
        selected_paths_txt: Text file with one selected image path per line.
        dataset_csv: CSV file with a ``'dataset'`` column listing dataset names.
        im_dir: Directory searched for ``<dataset>*.jpg`` / ``<dataset>*.png``.
        output_path: Destination of the saved figure.
        crop_height: Crop height in pixels; smaller images are skipped.
        crop_width: Crop width in pixels; smaller images are skipped.
        seed: Optional seed making dataset, sample, image and crop choices
            reproducible. ``None`` (default) uses the global random state.
    """
    # With a seed, use a private generator; without one, fall back to the
    # module-level functions so an externally seeded global state still applies.
    rng = random if seed is None else random.Random(seed)

    # Callout positions relative to the sampled point, as multiples of the
    # axis padding. The number of entries is also the maximum number of callouts.
    offset_factors = [
        (-0.9, 0.9),
        (0.9, 0.9),
        (-0.9, -0.9),
        (0.9, -0.9),
        (0, 1.1),
        (-0.7, 0.3),
        (0.7, 0.3),
        (-0.7, -0.3),
        (0.7, -0.3),
        (0, -1.1)
    ]

    df = df_pca.copy()

    # Load selected filenames (base names only, blank lines skipped)
    with open(selected_paths_txt, 'r') as f:
        selected_filenames = {
            os.path.basename(line.strip()) for line in f if line.strip()
        }

    # Mark selected
    df['is_selected'] = df['filename'].apply(
        lambda name: os.path.basename(name) in selected_filenames
    )

    # Load dataset names, longest first, so that a name which is a prefix of
    # another one cannot win the prefix match below.
    dataset_df = pd.read_csv(dataset_csv)
    dataset_names = sorted(
        dataset_df['dataset'].dropna().astype(str).unique(), key=len, reverse=True
    )

    # Match dataset name from filename (extension stripped after last '.')
    def match_dataset(filename):
        base_trimmed = os.path.basename(filename).rsplit('.', 1)[0]
        for name in dataset_names:
            if base_trimmed.startswith(name):
                return name
        return None

    df['dataset'] = df['filename'].apply(match_dataset)

    # Every selected point is plotted; only those with a known dataset can
    # serve as callout origins.
    df_selected = df[df['is_selected']]
    df_non_selected = df[~df['is_selected']]
    df_candidates = df_selected[df_selected['dataset'].notnull()]

    # Pick unique datasets (at most one per callout slot) and one row from each
    candidate_datasets = list(df_candidates['dataset'].unique())
    chosen_datasets = rng.sample(
        candidate_datasets, min(len(offset_factors), len(candidate_datasets))
    )
    sample_rows = []
    for name in chosen_datasets:
        row_seed = None if seed is None else rng.randrange(2 ** 32)
        rows = df_candidates[df_candidates['dataset'] == name]
        sample_rows.append(rows.sample(1, random_state=row_seed).iloc[0])

    # Plot setup. The theme is applied before the figure is created because it
    # has no effect on axes that already exist.
    sns.set(style="white", font_scale=1.2)
    fig, ax = plt.subplots(figsize=(14, 12))

    ax.scatter(df_non_selected['x'], df_non_selected['y'], c='#d3d3d3', s=2, label='Non-Selected', alpha=0.9)
    ax.scatter(df_selected['x'], df_selected['y'], c='#1f1f1f', s=4, label='Selected', alpha=1.0)
    ax.set_xlabel('PCA 1')
    ax.set_ylabel('PCA 2')
    ax.set_title('2D PCA with Native Random Crop Callouts')
    ax.legend(loc='upper right')

    # Extend limits by 30% on each side to make room for the callouts
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()
    x_buffer = (xlim[1] - xlim[0]) * 0.3
    y_buffer = (ylim[1] - ylim[0]) * 0.3
    ax.set_xlim(xlim[0] - x_buffer, xlim[1] + x_buffer)
    ax.set_ylim(ylim[0] - y_buffer, ylim[1] + y_buffer)

    for (fx, fy), row in zip(offset_factors, sample_rows):
        dataset = row['dataset']
        # Sorted so that a seeded run does not depend on directory listing order.
        img_paths = sorted(
            glob(os.path.join(im_dir, f"{dataset}*.jpg"))
            + glob(os.path.join(im_dir, f"{dataset}*.png"))
        )
        if not img_paths:
            print(f"No image found for dataset {dataset}")
            continue

        img_path = rng.choice(img_paths)
        image = cv2.imread(img_path)
        if image is None:
            print(f"Could not read image: {img_path}")
            continue

        h, w = image.shape[:2]
        if h < crop_height or w < crop_width:
            print(f"Image too small for crop: {img_path}")
            continue

        # Native random crop (no resizing), converted from OpenCV's BGR to RGB
        y1 = rng.randint(0, h - crop_height)
        x1 = rng.randint(0, w - crop_width)
        cropped_img = image[y1:y1 + crop_height, x1:x1 + crop_width]
        cropped_img = cv2.cvtColor(cropped_img, cv2.COLOR_BGR2RGB)

        try:
            imbox = OffsetImage(cropped_img, zoom=0.25)
            anchor_x = row['x'] + fx * x_buffer
            anchor_y = row['y'] + fy * y_buffer
            ax.plot([row['x'], anchor_x], [row['y'], anchor_y], color='gray', linestyle='--', linewidth=1)
            ab = AnnotationBbox(imbox, (anchor_x, anchor_y), frameon=True, pad=0.3, bboxprops=dict(edgecolor='black'))
            ax.add_artist(ab)
        except Exception as e:
            # Best effort: one failing overlay must not abort the figure. The
            # exception type is included because some errors have an empty message.
            print(f"Overlay failed for {img_path}: {type(e).__name__}: {e}")

    ax.grid(False)
    ax.set_facecolor('white')
    sns.despine()
    fig.tight_layout()
    fig.savefig(output_path, dpi=300)
    plt.close(fig)
    print(f"Plot saved to: {output_path}")


In [37]:

# Render the PCA plot with native random crop callouts and save it next to the notebook.
plot_pca_with_native_random_crops(
    df_pca=df_pca,
    im_dir='../assets/digital-production/lightly/images',
    dataset_csv='../assets/lightly_totalimages_selectedimages.csv',
    selected_paths_txt="../assets/filenames-rumex_4by_4_50k-coreset-samples_20000-1688802987585.txt",
    output_path='pca_native_crops.png',
)

Overlay failed for ../assets/digital-production/lightly/images/20230614_SchneiderWallenwilLangwisSued_S_20_F_60_H_12_O_krma_ID1_DJI_20230614164512_0226.3_3.png: 'ellipsis' object is not subscriptable
No image found for dataset 20230414_Waldegg_S_10_F_10_O_stra_ID1
Overlay failed for ../assets/digital-production/lightly/images/20230609_RuedimoosTaenikon_S_20_F_60_H_12_O_krma_ID1_DJI_20230609113024_0634.1_0.png: list index out of range
Overlay failed for ../assets/digital-production/lightly/images/20230504_HoratRitaUnterviertrlNord_S_20_F_50_O_sama_ID3_DJI_20230504165029_0138.0_0.png: list index out of range
No image found for dataset 20230414_Waldegg_S_20_F_50_O_stra_ID1
Overlay failed for ../assets/digital-production/lightly/images/20230615_SchildOberhofenBuel3_S_20_F_60_H_12_O_krma_ID3_DJI_20230615113515_0163.2_1.png: list index out of range
Overlay failed for ../assets/digital-production/lightly/images/20230403_HerrenpuentSuedOst_S_70_F_80_O_sama_ID1_DJI_20230403155610_0315.3_0.png: 