# 1. Setup and Data Loading

This first step imports all the essential libraries and loads the `train.csv` file into a pandas DataFrame. This allows us to get a first look at the dataset's structure.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2
from pathlib import Path

# --- Configuration ---
# ATTENTION: point CSV_PATH at your own copy of train.csv.
# Both a local path and a mounted Google Drive path work.
CSV_PATH = '../data/raw_data/train.csv'  # Example of a relative path

# Read the annotation table. Only the read itself is guarded: when the file
# is missing, `df` stays undefined and the cells that test for it are skipped.
try:
    df = pd.read_csv(CSV_PATH)
except FileNotFoundError:
    print(f"ERROR: File not found at '{CSV_PATH}'. Please check the path.")
else:
    print("train.csv file loaded successfully!")
    # `display` is provided by the IPython/Jupyter runtime.
    display(df.head())

# 2. Class Distribution Analysis

Here, we analyze the distribution of annotations across the different cell types. This is crucial to check for class imbalance, which might affect model training.

In [None]:
# Only run when the loading cell actually produced `df`
if 'df' in locals():
    class_counts = df['cell_type'].value_counts()
    print("Cell Distribution (by annotation count):")
    print(class_counts)

    # Bar chart of annotation counts, one bar per cell type
    fig, ax = plt.subplots(figsize=(10, 6))
    class_counts.plot(kind='bar', color=['skyblue', 'salmon', 'lightgreen'], ax=ax)
    ax.set_title('Distribution of Annotations by Cell Type')
    ax.set_xlabel('Cell Type')
    ax.set_ylabel('Number of Annotations (Cells)')
    # Keep the category labels horizontal so they stay readable
    plt.xticks(rotation=0)
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()

# 3. RLE Decoding and Mask Visualization

This section contains the function to decode the Run-Length Encoding strings into binary masks. We then visualize a sample mask to ensure our decoding logic is correct before using it in the main preprocessing pipeline.

In [None]:
def rle_to_mask(rle_string: str, height: int, width: int) -> np.ndarray:
    """
    Converts a Run-Length Encoding (RLE) string to a binary mask.

    The string is a space-separated sequence of ``start length`` pairs, where
    ``start`` is a 1-indexed position in the flattened image. Positions are
    interpreted row by row (left to right, then top to bottom).

    Args:
        rle_string (str): The RLE-encoded string. A missing value (NaN/None)
            produces an all-zero mask.
        height (int): The height of the target image.
        width (int): The width of the target image.

    Returns:
        np.ndarray: A uint8 array of shape (height, width) holding 1 inside
        the encoded runs and 0 everywhere else.
    """
    if pd.isna(rle_string):
        return np.zeros((height, width), dtype=np.uint8)

    tokens = np.asarray(rle_string.split(), dtype=int)
    starts = tokens[0::2] - 1  # RLE positions are 1-indexed
    lengths = tokens[1::2]

    flat = np.zeros(height * width, dtype=np.uint8)
    for start, length in zip(starts, lengths):
        flat[start:start + length] = 1

    # Reshape row-major so the result is always (height, width).
    # `reshape((height, width), order='F').T` is the same as a row-major
    # reshape to (width, height): it only works for square images and
    # scrambles every other size.
    # NOTE(review): if the dataset's RLE is column-major (top to bottom, then
    # left to right), use `flat.reshape((height, width), order='F')` instead
    # -- confirm against the dataset's format description.
    return flat.reshape((height, width))

# Decode one annotation and draw it as a sanity check of the RLE decoder
if 'df' in locals():
    # Arbitrary example: the row at positional index 10
    sample_annotation = df.iloc[10]
    mask = rle_to_mask(
        sample_annotation['annotation'],
        sample_annotation['height'],
        sample_annotation['width'],
    )

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.imshow(mask, cmap='gray')
    ax.set_title(f"Sample Mask for Cell Type: {sample_annotation['cell_type']} (ID: {sample_annotation['id']})")
    ax.axis('off')
    plt.show()