### **Imports necesarios**

In [23]:
import sys
from pathlib import Path

# Ruta ABSOLUTA al proyecto (ajústala SOLO si cambias de carpeta)
PROJECT_ROOT = Path(r"C:\Users\Lottie\Desktop\project")

# Añadir al sys.path si no está
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

print("PROJECT_ROOT añadido al path:", PROJECT_ROOT)


PROJECT_ROOT añadido al path: C:\Users\Lottie\Desktop\project


### **Carga del dataset original**

In [11]:
# Importar pandas
import pandas as pd

# Cargar original_train.csv y renombrar columnas
df = pd.read_csv("../data/original_train.csv")
df.columns = ["ImageId", "EncodedPixels", "ClassId"]


### **Identificación de imágenes con y sin defectos**

In [None]:
# Crear DataFrame con ImageId únicos y columna has_defect
image_ids = df["ImageId"].unique()
df_defect = df.groupby("ImageId")["EncodedPixels"].apply(lambda x: x.notna().any()).reset_index()
df_defect.columns = ["ImageId", "has_defect"]


### **División en train / validation / test**

In [None]:
# Dividir en train, val y test con estratificación
from sklearn.model_selection import train_test_split

train_ids, temp_ids = train_test_split(
    df_defect, test_size=0.30, stratify=df_defect["has_defect"], random_state=42
)

val_ids, test_ids = train_test_split(
    temp_ids, test_size=0.50, stratify=temp_ids["has_defect"], random_state=42
)


### **Construcción de los CSV finales**

In [5]:
# Crear DataFrames finales para train, val y test
train_df = df[df["ImageId"].isin(train_ids["ImageId"])]
val_df   = df[df["ImageId"].isin(val_ids["ImageId"])]
test_df  = df[df["ImageId"].isin(test_ids["ImageId"])]

# Guardar los DataFrames en archivos CSV
train_df.to_csv("../data/train_split.csv", index=False)
val_df.to_csv("../data/val_split.csv", index=False)
test_df.to_csv("../data/test_split.csv", index=False)


### **Visualización de imágenes y máscaras**

In [None]:
from src.data.utils_rle import rle_decode
import matplotlib.pyplot as plt
import numpy as np
import cv2


def show_image_with_masks(image_id, df, img_dir="data/train_images"):
    image_path = f"{img_dir}/{image_id}"
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    img = cv2.resize(img, (256, 256))

    fig, axes = plt.subplots(1, 5, figsize=(18, 4))
    axes[0].imshow(img, cmap="gray")
    axes[0].set_title("Imagen")
    axes[0].axis("off")

    classes = ["1", "2", "3", "4"]

    for i, cls in enumerate(classes):
        row = df[(df.ImageId == image_id) & (df.ClassId == int(cls))]
        if row["EncodedPixels"].isnull().values[0]:
            mask = np.zeros((256, 256))
        else:
            mask = rle_decode(row["EncodedPixels"].values[0], (256, 1600))
            mask = cv2.resize(mask, (256, 256))

        axes[i+1].imshow(mask, cmap="gray")
        axes[i+1].set_title(f"Clase {cls}")
        axes[i+1].axis("off")

    plt.show()


ModuleNotFoundError: No module named 'src'

In [11]:
import pandas as pd
df = pd.read_csv("../data/train_split.csv")
print(df.head())


         ImageId  EncodedPixels  \
0  0002cc93b.jpg              1   
1  002af848d.jpg              4   
2  002fc4e19.jpg              1   
3  002fc4e19.jpg              2   
4  0030401a5.jpg              4   

                                             ClassId  
0  29102 12 29346 24 29602 24 29858 24 30114 24 3...  
1  290800 6 291055 13 291311 15 291566 18 291822 ...  
2  146021 3 146275 10 146529 40 146783 46 147038 ...  
3  145658 7 145901 20 146144 33 146386 47 146629 ...  
4  186833 1 187089 3 187344 6 187600 7 187855 10 ...  


In [8]:
import pandas as pd

df = pd.read_csv("../data/train_split.csv")
print(df.head())
print(df.columns)


         ImageId  EncodedPixels  \
0  0002cc93b.jpg              1   
1  002af848d.jpg              4   
2  002fc4e19.jpg              1   
3  002fc4e19.jpg              2   
4  0030401a5.jpg              4   

                                             ClassId  
0  29102 12 29346 24 29602 24 29858 24 30114 24 3...  
1  290800 6 291055 13 291311 15 291566 18 291822 ...  
2  146021 3 146275 10 146529 40 146783 46 147038 ...  
3  145658 7 145901 20 146144 33 146386 47 146629 ...  
4  186833 1 187089 3 187344 6 187600 7 187855 10 ...  
Index(['ImageId', 'EncodedPixels', 'ClassId'], dtype='object')


In [9]:
import pandas as pd

df = pd.read_csv("../data/train_split.csv")
df.columns = ["ImageId", "EncodedPixels", "ClassId"]
df.to_csv("../data/train_split.csv", index=False)