# Split Data

This notebook aim to split the data in a smaller dataset for training and testing.

It aim to avoid biais by :
- Stratified Sampling 
- Perceptual Hashing / Feature Embeddings + kmeans

### Import Libraries


In [1]:
import os
import shutil
import random
import json

### Check the path
And check if the folder exist

In [2]:
DATA_FOLDER_PATH = "../"

INPUT_DIR = DATA_FOLDER_PATH + "00_archive/data/"
OUTPUT_DIR = DATA_FOLDER_PATH + "00_archive/data_samples/"

file_types = ["train", "test", "val"]
subdirectories = ["Coccidiosis", "Healthy", "New Castle Disease", "Salmonella"]

In [3]:

def check_and_create_path(verbose = False):
    """
    Check if the input directory structure exists and create the output directory structure if it doesn't.

    Parameters:
        - verbose (bool): If True, print detailed information about the directory structure.

    Return :
        - None
    """
    # Check input structure
    for file_type in file_types:
        input_path = os.path.join(INPUT_DIR, file_type)
        if not os.path.isdir(input_path):
            raise FileNotFoundError(f"‚ùå Input directory does not exist: {input_path}")
        if verbose :
            print(f"‚úÖ Found directory: {input_path}")

        for subdirectory in subdirectories:
            sub_path = os.path.join( input_path, subdirectory)
            if not os.path.isdir(sub_path):
                raise FileNotFoundError(f"‚ùå Subdirectory missing: {sub_path}")
            if verbose :
                print(f"  ‚úÖ Found subdirectory: {sub_path}")

    #  Check/create output directory
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        if verbose :
            print(f"üìÅ Output directory created: {OUTPUT_DIR}")
    else:
        if verbose :
            print(f"‚úÖ Output directory already exists: {OUTPUT_DIR}")

    # Create output structure if not exist
    for file_type in file_types:
        output_path = os.path.join(OUTPUT_DIR, file_type)
        if not os.path.exists(output_path):
            os.makedirs(output_path, exist_ok=True)
            if verbose :
                print(f"üìÅ Created output directory: {output_path}")
        else:
            if verbose :
                print(f"‚úÖ Output directory already exists: {output_path}")

        for subdirectory in subdirectories:
            sub_path = os.path.join(output_path, subdirectory)
            if not os.path.exists(sub_path):
                os.makedirs(sub_path, exist_ok=True)
                if verbose :
                    print(f"üìÅ Created output subdirectory: {sub_path}")
            else:
                if verbose :
                    print(f"‚úÖ Output subdirectory already exists: {sub_path}")

    print(f"‚úÖ All folder structures are in place.")

check_and_create_path(False)

‚úÖ All folder structures are in place.


In [4]:
def clean_directory(directory):
    """
    Remove all files and subdirectories in a given directory.

    Parameters:
        - directory (str): The path to the directory to be cleaned.

    Return :
        - None
    """
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print(f"Failed to delete {file_path}. Reason: {e}")

def clean_output_samples():
    """
    Clean the output samples directory by removing all files and subdirectories.

    Parameters:
        - None

    Return :
        - None
    """
    for file_type in file_types:
        output_path = os.path.join(OUTPUT_DIR, file_type)
        for subdirectory in subdirectories:
            sub_path = os.path.join(output_path, subdirectory)
            clean_directory(sub_path)
    print(f"Cleaned output samples directory: {OUTPUT_DIR}")


### Sampled data

V1 : basic stratified split per class and per folder using `train_test_split` from `scikit-learn`

In [5]:
#lets see how much data we have
def count_files_in_directory(directory):
    """
    Count the number of files in a given directory.

    Parameters:
        - directory (str): The path to the directory to be counted.

    Return :
        - int: The number of files in the directory.
    """
    return sum(len(files) for _, _, files in os.walk(directory))

def count_files_in_subdirectories(directory):
    """
    Count the number of files in all subdirectories of a given directory.

    Parameters:
        - directory (str): The path to the directory to be counted.

    Return :
        - int: The total number of files in all subdirectories.
    """
    total_files = 0
    for subdirectory in subdirectories:
        sub_path = os.path.join(directory, subdirectory)
        total_files += count_files_in_directory(sub_path)
    return total_files

def count_files_in_all_directories():
    """
    Count the number of files in all directories and subdirectories.

    Parameters:
        - None

    Return :
        - None
    """
    for file_type in file_types:
        input_path = os.path.join(INPUT_DIR, file_type)
        total_files = count_files_in_subdirectories(input_path)
        print(f"Total files in {file_type}: {total_files}")
        for subdirectory in subdirectories:
            sub_path = os.path.join(input_path, subdirectory)
            num_files = count_files_in_directory(sub_path)
            print(f"  {subdirectory}: {num_files} files")

print("-----------------------------------------------------")
count_files_in_all_directories()
print("-----------------------------------------------------")

-----------------------------------------------------
Total files in train: 400000
  Coccidiosis: 100000 files
  Healthy: 100000 files
  New Castle Disease: 100000 files
  Salmonella: 100000 files
Total files in test: 70677
  Coccidiosis: 18752 files
  Healthy: 17412 files
  New Castle Disease: 15888 files
  Salmonella: 18625 files
Total files in val: 40000
  Coccidiosis: 10000 files
  Healthy: 10000 files
  New Castle Disease: 10000 files
  Salmonella: 10000 files
-----------------------------------------------------


In [5]:
SAMPLES_PER_CLASS = 1000  # arbitrary number of samples per class

In [6]:
def fixed_count_sample(verbose=False):
    """
    Sample a fixed number of images per class per folder and copy them to the output directory.
    """
    for file_type in file_types:
        for subdirectory in subdirectories:
            input_path = os.path.join(INPUT_DIR, file_type, subdirectory)
            output_path = os.path.join(OUTPUT_DIR, file_type, subdirectory)

            # List images
            images = [f for f in os.listdir(input_path) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
            if len(images) == 0:
                print(f"No images found in {input_path}")
                continue

            # Adjust if fewer images than the sample size
            sample_count = min(SAMPLES_PER_CLASS, len(images))
            sampled_images = random.sample(images, sample_count)

            # Copy sampled images
            for img in sampled_images:
                src = os.path.join(input_path, img)
                dst = os.path.join(output_path, img)
                shutil.copy2(src, dst)

            if verbose:
                print(f"{sample_count} images copied to {output_path}")

    print("Fixed-count sampling done!")

# clean_output_samples()
# fixed_count_sample(verbose=True)

### Sampled data 

V2 : ResNet + KMeans Diversity Sampling


In [7]:
# clean_output_samples()

In [7]:
import torch
print(torch.cuda.is_available())  # doit afficher True
print(torch.cuda.get_device_name(0)) 

True
NVIDIA GeForce GTX 1660 Ti


In [8]:
# Parameters
IMAGE_SIZE = 224
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")
k_tracking = {}



Using device: cuda


In [9]:
def log_k_for_subdir(subdir, iteration, k):
    parts = subdir.split(os.sep)
    current = k_tracking
    for part in parts:
        if part not in current:
            current[part] = {}
        current = current[part]
    current[f"iteration_{iteration}"] = k


def get_next_iteration(subdir):
    parts = subdir.split(os.sep)
    current = k_tracking
    for part in parts:
        if part not in current:
            return 1  # First iteration if the path doesn't exist
        current = current[part]

    # Extract the highest iteration number from the keys
    existing = [int(k.split('_')[1]) for k in current.keys() if k.startswith('iteration_')]
    return max(existing) + 1 if existing else 1

def get_k_tracing(k_tracking):

    if os.path.exists("best_k_foreach.json"):
        with open("best_k_foreach.json", "r") as f:
            try:
                existing = json.load(f)
                k_tracking.update(existing)  # Fusionne sans √©craser
            except json.JSONDecodeError:
                print("Fichier JSON invalide, d√©marrage d'un k_tracking vide.")
    return k_tracking




In [10]:
import numpy as np
from tqdm import tqdm
from PIL import Image

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import torch
from torchvision import models, transforms



# Load Pretrained ResNet (remove final classification layer)
resnet = models.resnet18(pretrained=True)
resnet.fc = torch.nn.Identity()
resnet = resnet.to(DEVICE).eval()

# Preprocessing
preprocess = transforms.Compose([
    transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])


def extract_embedding(image_path):
    try:
        img = Image.open(image_path).convert("RGB")
        img_tensor = preprocess(img).unsqueeze(0).to(DEVICE)
        with torch.no_grad():
            embedding = resnet(img_tensor).squeeze().cpu().numpy()
        return embedding
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return None

def find_optimal_k(embeddings, k_range=range(5, 15)):
    max_possible_k = min(len(embeddings), max(k_range))
    best_k = 5
    best_score = -1

    for k in range(5, max_possible_k + 1):
        kmeans = KMeans(n_clusters=k, random_state=42).fit(embeddings)
        score = silhouette_score(embeddings, kmeans.labels_)
        if score > best_score:
            best_k = k
            best_score = score

    print(f"Optimal k found: {best_k} with silhouette score: {best_score:.4f}")

    return best_k, best_score


def sample_by_auto_clustering(embeddings, paths, total_samples=300, log_info=None):
    k, _ = find_optimal_k(embeddings)
    k = min(k, len(embeddings))

    if log_info:
        log_k_for_subdir(log_info['subdir'], log_info['iteration'], k)

    kmeans = KMeans(n_clusters=k, random_state=42).fit(embeddings)
    labels = kmeans.labels_

    samples_per_cluster = total_samples // k
    selected_paths = []

    for i in range(k):
        indices = [j for j, label in enumerate(labels) if label == i]
        cluster_embeddings = [embeddings[j] for j in indices]
        cluster_paths = [paths[j] for j in indices]

        center = kmeans.cluster_centers_[i]
        dists = np.linalg.norm(np.array(cluster_embeddings) - center, axis=1)
        sorted_indices = np.argsort(dists)

        max_samples = min(samples_per_cluster, len(cluster_paths))

        for idx in sorted_indices[:max_samples]:
            selected_paths.append(cluster_paths[idx])

    return selected_paths


# For all subdirectories, sample a fixed number of images per class
def diverse_sample_per_subdirectory_all(verbose=False):  
    input_root = INPUT_DIR
    output_root = OUTPUT_DIR 
    TOTAL_SAMPLES_PER_FOLDER = SAMPLES_PER_CLASS  # You can adjust this per folder
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

    os.makedirs(output_root, exist_ok=True)
    print("Scanning dataset structure...")

    file_types = [d for d in os.listdir(input_root) if os.path.isdir(os.path.join(input_root, d))]

    for file_type in file_types:
        file_type_path = os.path.join(input_root, file_type)
        subdirectories = [d for d in os.listdir(file_type_path) if os.path.isdir(os.path.join(file_type_path, d))]

        for subdirectory in subdirectories:
            input_path = os.path.join(file_type_path, subdirectory)
            output_path = os.path.join(output_root, file_type, subdirectory)
            os.makedirs(output_path, exist_ok=True)

            images = [f for f in os.listdir(input_path) if f.lower().endswith(IMAGE_EXTENSIONS)]
            image_paths = [os.path.join(input_path, f) for f in images]

            embeddings = []
            valid_paths = []

            for path in tqdm(image_paths, desc=f"üîç {file_type}/{subdirectory}", leave=False):
                emb = extract_embedding(path)
                if emb is not None:
                    embeddings.append(emb)
                    valid_paths.append(path)

            if len(valid_paths) == 0:
                print(f"No valid images in {file_type}/{subdirectory}")
                continue

            print(f"{file_type}/{subdirectory}: {len(valid_paths)} images, extracting clusters...")

            selected_paths = sample_by_auto_clustering(
                embeddings,
                valid_paths,
                total_samples=min(TOTAL_SAMPLES_PER_FOLDER, len(valid_paths))
            )

            for src in selected_paths:
                dst = os.path.join(output_path, os.path.basename(src))
                shutil.copy2(src, dst)

            if verbose:
                print(f"Copied {len(selected_paths)} images to {output_path}")

    print("Sampling complete for all subdirectories.")


def diverse_sample_per_subdirectory(subdirs, verbose=False):
    input_root = INPUT_DIR
    output_root = OUTPUT_DIR
    TOTAL_SAMPLES_PER_FOLDER = SAMPLES_PER_CLASS
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

    os.makedirs(output_root, exist_ok=True)
    print("Sampling from specific subdirectories...")

    for relative_subdir in subdirs:
        iteration = get_next_iteration(relative_subdir)
        input_path = os.path.join(input_root, relative_subdir)
        output_path = os.path.join(output_root, relative_subdir)
        os.makedirs(output_path, exist_ok=True)

        if not os.path.isdir(input_path):
            print(f"Skipping non-directory: {input_path}")
            continue

        images = [f for f in os.listdir(input_path) if f.lower().endswith(IMAGE_EXTENSIONS)]
        image_paths = [os.path.join(input_path, f) for f in images]

        embeddings = []
        valid_paths = []

        for path in tqdm(image_paths, desc=f"{relative_subdir}", leave=False):
            emb = extract_embedding(path)
            if emb is not None:
                embeddings.append(emb)
                valid_paths.append(path)

        if len(valid_paths) == 0:
            print(f"No valid images in {relative_subdir}")
            continue

        print(f"{relative_subdir}: {len(valid_paths)} images, extracting clusters...")

        selected_paths = sample_by_auto_clustering(
            embeddings,
            valid_paths,
            total_samples=min(TOTAL_SAMPLES_PER_FOLDER, len(valid_paths)),
            log_info={"subdir": relative_subdir, "iteration": iteration}
        )

        for src in selected_paths:
            dst = os.path.join(output_path, os.path.basename(src))
            shutil.copy2(src, dst)

        if verbose:
            print(f"Copied {len(selected_paths)} images to {output_path}")

    with open("best_k_foreach.json", "w") as f:
        json.dump(k_tracking, f, indent=4)

    print("Sampling complete.")


all_subdirs = [
    "train/Coccidiosis",#DONE
    "train/Healthy",#DONE
    "train/New Castle Disease", 
    "train/Salmonella", 
    "val/Coccidiosis",#DONE
    "val/Healthy",  #DONE
    "val/New Castle Disease",#DONE
    "val/Salmonella",#DONE
    "test/Coccidiosis", #DONE
    "test/Healthy", #DONE
    "test/New Castle Disease",#DONE
    "test/Salmonella" #DONE
]




In [11]:
# Initialisation
k_tracking = {}
k_tracking = get_k_tracing(k_tracking)

if not os.path.exists("best_k_foreach.json"):
    with open("best_k_foreach.json", "w") as f:
        json.dump(k_tracking, f, indent=4)
        print("File created: best_k_foreach.json")

In [12]:
k_tracking

{'val/Salmonella': {'iteration_1': 5, 'iteration_2': 5, 'iteration_3': 5},
 'val/New Castle Disease': {'iteration_1': 6,
  'iteration_2': 6,
  'iteration_3': 6},
 'val/Healthy': {'iteration_1': 5, 'iteration_2': 5, 'iteration_3': 5},
 'val/Coccidiosis': {'iteration_1': 8, 'iteration_2': 8, 'iteration_3': 8},
 'test/Salmonella': {'iteration_1': 7, 'iteration_2': 9, 'iteration_3': 7},
 'test/New Castle Disease': {'iteration_1': 7,
  'iteration_2': 7,
  'iteration_3': 6},
 'test/Healthy': {'iteration_1': 6, 'iteration_2': 10},
 'test/Coccidiosis': {'iteration_1': 10, 'iteration_2': 5},
 'train/Salmonella': {'iteration_1': 9},
 'train/New Castle Disease': {'iteration_1': 7},
 'train/Coccidiosis': {'iteration_1': 5},
 'train/Healthy': {'iteration_1': 10}}

In [13]:
subdir_to_process = ["train/New Castle Disease", "train/Salmonella" ]
diverse_sample_per_subdirectory(subdir_to_process,verbose=True)

Sampling from specific subdirectories...


                                                                                

train/New Castle Disease: 100000 images, extracting clusters...
Optimal k found: 7 with silhouette score: 0.0515
Copied 994 images to ../00_archive/data_samples/train/New Castle Disease


                                                                        

train/Salmonella: 100000 images, extracting clusters...
Optimal k found: 9 with silhouette score: 0.0487
Copied 999 images to ../00_archive/data_samples/train/Salmonella
Sampling complete.


old version :`

```py 
import numpy as np
from tqdm import tqdm
from PIL import Image

from sklearn.cluster import KMeans
import torch
from torchvision import models, transforms

# Parameters
SAMPLES_PER_CLASS = 30
IMAGE_SIZE = 224
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load Pretrained ResNet (remove final classification layer)
resnet = models.resnet18(pretrained=True)
resnet.fc = torch.nn.Identity()
resnet = resnet.to(DEVICE).eval()

# Preprocessing
preprocess = transforms.Compose([
    transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

def extract_embedding(image_path):
    try:
        img = Image.open(image_path).convert("RGB")
        img_tensor = preprocess(img).unsqueeze(0).to(DEVICE)
        with torch.no_grad():
            embedding = resnet(img_tensor).squeeze().cpu().numpy()
        return embedding
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return None

def diverse_sample_with_kmeans(verbose=False):
    """
    For each class in each file_type folder, extract embeddings, perform KMeans,
    and copy the most diverse images (closest to cluster centers).
    """
    for file_type in file_types:
        for subdirectory in subdirectories:
            input_path = os.path.join("../00_archive/data_samples_old", file_type, subdirectory)
            output_path = os.path.join(OUTPUT_DIR, file_type, subdirectory)

            # List images
            images = [f for f in os.listdir(input_path) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
            image_paths = [os.path.join(input_path, f) for f in images]

            # Extract embeddings
            embeddings = []
            valid_paths = []

            for path in tqdm(image_paths, desc=f"üîç {file_type}/{subdirectory}", leave=False):
                emb = extract_embedding(path)
                if emb is not None:
                    embeddings.append(emb)
                    valid_paths.append(path)

            if len(valid_paths) == 0:
                print(f"No valid images found in {input_path}")
                continue

            # KMeans clustering
            n_clusters = min(SAMPLES_PER_CLASS, len(valid_paths))
            kmeans = KMeans(n_clusters=n_clusters, random_state=42)
            kmeans.fit(embeddings)

            # Find image closest to each cluster center
            selected_paths = []
            for center in kmeans.cluster_centers_:
                dists = np.linalg.norm(np.array(embeddings) - center, axis=1)
                idx = np.argmin(dists)
                selected_paths.append(valid_paths[idx])

            # Remove duplicates
            selected_paths = list(set(selected_paths))

            # Copy selected images
            for src in selected_paths:
                dst = os.path.join(output_path, os.path.basename(src))
                shutil.copy2(src, dst)

            if verbose:
                print(f"{len(selected_paths)} diverse images copied to {output_path}")

    print("üéØ Diversity-based sampling complete!")

# üöÄ Run it
diverse_sample_with_kmeans(verbose=True)
```

# Visualisation of the resnet embeddings


In [9]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

def plot_embeddings(embeddings, labels=None, method='tsne', title='Embeddings Visualization'):
    if method == 'pca':
        reducer = PCA(n_components=2)
    elif method == 'tsne':
        reducer = TSNE(n_components=2, random_state=42, perplexity=30)
    else:
        raise ValueError("Method must be 'tsne' or 'pca'.")

    reduced_embeddings = reducer.fit_transform(embeddings)

    plt.figure(figsize=(10, 8))
    if labels is not None:
        scatter = plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c=labels, cmap='tab10', alpha=0.7)
        plt.legend(*scatter.legend_elements(), title="Clusters")
    else:
        plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], alpha=0.7)

    plt.title(title)
    plt.xlabel("Component 1")
    plt.ylabel("Component 2")
    plt.grid(True)
    plt.show()
