In [None]:
import os
import pandas as pd
import numpy as np

In [None]:
# Load the Lightly selection into a list

In [None]:
lightly_file_name = '../assets/filenames-rumex_4by_4_50k-coreset-samples_20000-1688802987585.txt'

# Read the Lightly selection file, one entry per line.
with open(lightly_file_name, 'r') as file:
    # Strip surrounding whitespace/newlines and drop blank lines: a blank entry
    # would otherwise turn into an empty dataset name ('') below.
    lines = [line.strip() for line in file if line.strip()]

# The first '/'-separated component of every entry is used as its dataset name.
datasets = [line.split('/')[0] for line in lines]

In [None]:
# Number of distinct entries in the Lightly selection
unique_entries = set(lines)
len(unique_entries)

In [None]:
# Count the number of images in each dataset before and after lightly selection

In [None]:
# For every dataset present in the Lightly selection, collect:
#   - total_nb_images:        number of '.JPG' files in its '1_images' folder
#   - total_selected_lightly: number of selection entries belonging to it
#   - size_gb:                size of its .jpg/.jpeg files in GiB (bytes / 1024**3)
root = '/mnt/Foto-Work-RE/26_Agricultural_Engineering-RE/263_DP/Fenaco_Blackenprojekt_2021-2023/_CURATED_DATA_SETS/'

# Exact number of selected entries per dataset. `datasets` holds the first path
# component of every entry, so counting its values avoids the substring test
# (`d in item`), which over-counts when one dataset name occurs inside another entry.
selected_per_dataset = pd.Series(datasets, dtype='object').value_counts()

records = []
# sorted() keeps the row order (and the CSV written from df) reproducible;
# iterating a bare set of strings can change order between kernel sessions.
for d in sorted(set(datasets)):
    d_path = os.path.join(root, d, '1_images')

    # List the directory once and reuse the listing for both the count and the size.
    image_names = os.listdir(d_path)

    # Total number of images in the original dataset before Lightly filtering.
    # NOTE(review): this counts only names ending in upper-case '.JPG', while the
    # size below also includes '.jpg'/'.jpeg' in any case — confirm which is intended.
    jpg_count = sum(1 for filename in image_names if filename.endswith('.JPG'))

    # Number of images of this dataset selected by Lightly
    total_selected_lightly = int(selected_per_dataset[d])

    # Accumulated size in bytes of all JPEG files (case-insensitive extension)
    total_size = sum(
        os.path.getsize(os.path.join(d_path, image))
        for image in image_names
        if image.lower().endswith(('.jpg', '.jpeg'))
    )
    total_gb = total_size / (1024**3)

    records.append({
        'dataset': d,
        'total_nb_images': jpg_count,
        'total_selected_lightly': total_selected_lightly,
        'size_gb': total_gb,
    })

# Build the frame in one step: columns get proper numeric dtypes instead of the
# object dtype produced by appending rows to an empty DataFrame.
df = pd.DataFrame(records, columns=['dataset', 'total_nb_images', 'total_selected_lightly', 'size_gb'])

In [None]:
# Display the per-dataset summary table
df

In [None]:
# Persist the per-dataset summary without the index column
summary_csv = '../assets/lightly_totalimages_selectedimages.csv'
df.to_csv(summary_csv, index=False)

In [None]:
# 20220823_HaldenSued_S_10_F_50_O_stra_ID1	haldensued08
# 20220901_HaldenNord_S_10_F_50_O_sama_ID1	haldennord09
# 20221010_HaldenNord_S_25_F_60_O_sama_ID1	haldennord10
# 20221010_HaldenSued_S_25_F_60_sama_ID1	haldensued10


In [None]:
# Display the summary table again for reference
df

In [None]:
# Total size in GB of all datasets in the Lightly selection

In [None]:
# Sum of the size_gb column over all rows of df
df['size_gb'].sum()

In [None]:
# Total size in GB once the full dataset 20221010_HaldenSued_S_25_F_60_sama_ID1 (all of its images) is added on top of the df total

In [None]:
# Size in GB added on top of the df total; presumably the size of dataset
# 20221010_HaldenSued_S_25_F_60_sama_ID1 — TODO confirm the value and its source
extra_size_gb = 21
df['size_gb'].sum() + extra_size_gb

In [None]:
# Summary row of dataset 20220823_HaldenSued_S_10_F_50_O_stra_ID1
df.loc[df['dataset'].eq('20220823_HaldenSued_S_10_F_50_O_stra_ID1')]

In [None]:
# Summary row of dataset 20220901_HaldenNord_S_10_F_50_O_sama_ID1
df.loc[df['dataset'].eq('20220901_HaldenNord_S_10_F_50_O_sama_ID1')]

In [None]:
# Summary row of dataset 20221010_HaldenNord_S_25_F_60_O_sama_ID1
df.loc[df['dataset'].eq('20221010_HaldenNord_S_25_F_60_O_sama_ID1')]

In [None]:
# Summary row of dataset 20221010_HaldenSued_S_25_F_60_sama_ID1
df.loc[df['dataset'].eq('20221010_HaldenSued_S_25_F_60_sama_ID1')]

In [None]:
# Total number of images treated

In [None]:
# Image count of dataset 20221010_HaldenSued_S_25_F_60_sama_ID1, entered by hand
nb_images_20221010_HaldenSued_S_25_F_60_sama_ID1 = 999
# Images of every dataset listed in df
nb_images_all_others = df['total_nb_images'].sum()
# Grand total: hand-entered dataset plus everything in df
nb_all_images = nb_images_20221010_HaldenSued_S_25_F_60_sama_ID1 + nb_images_all_others
nb_all_images

In [None]:
# Total number of images selected by Lightly over all datasets in df
df['total_selected_lightly'].sum()

In [None]:
# Populating the CSV file with the total number of images with annotations

In [None]:
# Reload the per-dataset summary from the CSV file written earlier in this notebook
summary_csv = '../assets/lightly_totalimages_selectedimages.csv'
df = pd.read_csv(summary_csv)

In [None]:
# Print the number of annotation files found for each of the listed datasets
for dataset in ['haldennord09', 'haldensued08', 'haldensued10', 'haldennord10']:
    # Annotations directory under releases/latest for this dataset
    ann_path = f'/data/images/darwin/digital-production/{dataset}/releases/latest/annotations'

    # Count the JSON files in the annotations directory
    total_annotated = sum(1 for name in os.listdir(ann_path) if name.endswith('.json'))

    print(total_annotated)

In [None]:
# Populate df with the number of annotated images of each dataset inside the Lightly dataset
ann_path = '/data/images/darwin/digital-production/lightly/releases/latest/annotations'

# Reduce every JSON annotation file name to a prefix: keep the text before the
# first '.', then drop its last three '_'-separated tokens.
# presumably the remaining prefix is the source dataset name — verify against the file naming
annotations = [
    '_'.join(name.split('.')[0].split('_')[:-3])
    for name in os.listdir(ann_path)
    if name.endswith('.json')
]

# Assign the whole column at once so the counts stay integers; filling it cell by
# cell with df.at first creates a NaN-filled column and leaves the counts as floats.
# NOTE(review): the substring test is kept from the original; if the prefixes are
# exactly the dataset names, an equality test would avoid accidental matches.
df['total_annotated'] = [
    sum(1 for ann in annotations if dataset in ann)
    for dataset in df['dataset']
]

In [None]:
# Total number of annotated images over all datasets in df
df['total_annotated'].sum()

In [None]:
# Write df, now including the annotation counts, to a new CSV file
annotated_csv = '../assets/lightly_totalimages_selectedimages_annotated.csv'
df.to_csv(annotated_csv, index=False)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Sort by total_nb_images in ascending order: a horizontal bar chart draws the
# first row at the bottom, so the largest dataset ends up at the top.
df_sorted = df.sort_values('total_nb_images', ascending=True)

# Split every dataset into three stackable parts:
#   not_selected + selected_not_annotated + annotated == total_nb_images
df_sorted['not_selected'] = df_sorted['total_nb_images'] - df_sorted['total_selected_lightly']
df_sorted['selected_not_annotated'] = df_sorted['total_selected_lightly'] - df_sorted['total_annotated']
df_sorted['annotated'] = df_sorted['total_annotated']

# Horizontal stacked bar plot, one bar per dataset
fig, ax = plt.subplots(figsize=(10, 12))

df_sorted.set_index('dataset')[['not_selected', 'selected_not_annotated', 'annotated']].plot(
    kind='barh',
    stacked=True,
    ax=ax,
    color=['#e6e6e6', '#ff7f0e', '#2ca02c'],  # Light gray, orange, green
    alpha=0.8,
    width=0.7
)

ax.set_title('Number of Images per Dataset (Hierarchical View)', fontsize=16, fontweight='bold', pad=20)
ax.set_ylabel('Dataset', fontsize=12, fontweight='bold')
ax.set_xlabel('Count', fontsize=12, fontweight='bold')

# Legend labels follow the order of the stacked columns above
ax.legend(['Not Selected', 'Selected (Not Annotated)', 'Annotated'],
          title='Image Status',
          title_fontsize=12,
          fontsize=10,
          loc='lower right')

# Vertical grid lines make the counts easier to read
ax.grid(axis='x', alpha=0.3, linestyle='--')

# Label only the annotated (green) segment of each bar
for i, (_, row) in enumerate(df_sorted.iterrows()):
    annotated = row['total_annotated']

    # Skip datasets without annotated images (NaN also fails this test)
    if annotated > 0:
        # Centre the label on the green segment, which starts after the two other parts.
        # int(): the column may hold floats, and '1,234.0' is not a sensible count label.
        ax.text(row['not_selected'] + row['selected_not_annotated'] + annotated/2, i,
                f'{int(annotated):,}', va='center', ha='center', fontsize=9, color='white', fontweight='bold')

fig.tight_layout()
plt.show()