# 数据预处理操作

### 环境准备 

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, concatenate, Conv2DTranspose, BatchNormalization, Activation, Dropout
from tensorflow.keras.optimizers import Adadelta, Nadam ,Adam
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.utils import  plot_model ,Sequence
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.image import load_img,img_to_array
import tensorflow as tf
from tensorflow.python.keras.losses import binary_crossentropy
from scipy.ndimage import morphology as mp
from PIL import Image,UnidentifiedImageError
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import shutil
import os
from glob import glob  # for getting list paths of image and labels
from random import choice,sample
from matplotlib import pyplot as plt
import cv2 # saving and loading images

### 将各 seq 的 left_frames 图片复制到待训练文件夹（images）下

In [None]:
# Training-set root and the folder that will receive the collected frames.
source_path = '../autodl-tmp/Endovis18/Train'
output_dir = os.path.join(source_path, 'images')

# exist_ok=True keeps the cell re-runnable and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

def is_image_corrupted(filepath):
    """Report whether Pillow fails to verify the image at ``filepath``.

    Returns True (after printing a diagnostic) when opening or verifying the
    file raises IOError or SyntaxError, and False otherwise.
    """
    try:
        with Image.open(filepath) as candidate:
            candidate.verify()  # integrity check of the file contents
    except (IOError, SyntaxError) as err:
        print(f'Corrupted image {filepath}: {err}')
        return True
    else:
        return False

# Gather the frames of every sequence folder into output_dir, prefixing each
# file name with its sequence name.
for seq in os.listdir(source_path):
    # The annotation folder and the output folder are not sequences.
    if seq == 'pixeled_annotations_train' or seq == 'images':
        continue
    seq_path = os.path.join(source_path, seq)
    img_folder_path = os.path.join(seq_path, "left_frames")
    # Entries without a left_frames directory are silently ignored.
    if os.path.isdir(img_folder_path):
        for f in os.listdir(img_folder_path):
            _, file_extension = os.path.splitext(f)
            # Only .jpg / .png files are considered images here.
            if file_extension.lower() in ('.jpg', '.png'):
                file_path = os.path.join(img_folder_path, f)
                # Corrupted images are reported by the helper and not copied.
                if not is_image_corrupted(file_path):
                    output_file_path = os.path.join(output_dir, f"{seq}_{f}")
                    shutil.copy(file_path, output_file_path)
        print(f"Copied {seq} to {output_dir}")

### 删除无法通过校验的图片文件（如损坏文件、隐藏文件）

In [None]:
def verify_and_delete_images(directory):
    """Delete image files in ``directory`` that Pillow cannot verify.

    Only direct entries whose names end in .png, .jpg or .jpeg
    (case-insensitive) are examined; every other entry is left untouched.

    Args:
        directory: Path of the folder to scan (not recursive).

    Returns:
        None. Each deletion is reported with ``print``.
    """
    for file in os.listdir(directory):
        if not file.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue
        file_path = os.path.join(directory, file)
        try:
            # The context manager closes the file handle even when verify()
            # raises, so nothing is left open per scanned file.
            with Image.open(file_path) as img:
                img.verify()  # Verify that this is an image
        except (IOError, SyntaxError) as e:
            # By this point the handle is closed; an open handle can make
            # os.remove fail on Windows.
            print(f"Deleting corrupt file: {file_path} ({e})")
            os.remove(file_path)  # Delete corrupt file

# Purge unverifiable image files from the image folder first, then from the
# mask folder. NOTE: this deletes files on disk.
train_img_dir = '../autodl-tmp/Endovis18/Train/images/'
train_mask_dir = '../autodl-tmp/Endovis18/Train/pixeled_annotations_train/'
for _cleanup_dir in (train_img_dir, train_mask_dir):
    verify_and_delete_images(_cleanup_dir)

### 过滤不匹配的数据（确保训练图片和标注图片一一对应）

In [None]:
# Sorted file-name listings of the image folder and the mask folder.
train_imgs = sorted(os.listdir(train_img_dir))
train_masks = sorted(os.listdir(train_mask_dir))

print("Number of images:", len(train_imgs))
print("Number of masks:", len(train_masks))

if len(train_imgs) != len(train_masks):
    print("The number of images and masks are not equal.")

# Compare by file name regardless of the counts: two folders can hold the
# same number of files and still contain unpaired entries.
# NOTE(review): pairing is by exact file name (extension included) -- confirm
# that images and masks are named identically.
img_set = set(train_imgs)
mask_set = set(train_masks)

extra_imgs = img_set - mask_set
extra_masks = mask_set - img_set

if extra_imgs or extra_masks:
    print("Extra images:", extra_imgs)
    print("Extra masks:", extra_masks)

    # Drop the unpaired entries from the in-memory lists only; the files on
    # disk are not touched.
    train_imgs = [img for img in train_imgs if img not in extra_imgs]
    train_masks = [mask for mask in train_masks if mask not in extra_masks]

    print("Updated number of images:", len(train_imgs))
    print("Updated number of masks:", len(train_masks))

## 数据准备完毕，分为原始图片（images）和标注图片（masks）