# 1. 制作图片和标签

In [None]:
# Create the directories for the images, the labels and the dataset splits.
# These are IPython shell escapes; `mkdir -p` also creates missing parent
# directories and does not fail when the directory already exists.
!mkdir -p ../input/mmseg_data/images
!mkdir -p ../input/mmseg_data/labels
!mkdir -p ../input/mmseg_data/splits

In [None]:
import os
from tqdm import tqdm
import numpy as np
import cv2
import shutil
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import shutil
import os
from glob import glob
from tqdm import tqdm

# Directory whose entries are the training records that get converted below.
input_dir = "../input/google-research-identify-contrails-reduce-global-warming/train"
# Root directory that receives the converted images and labels.
output_dir = "../input/mmseg_data"
# Make sure the output root exists before anything is written into it.
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Read one data record, loading the bands used by the false-color format
def read_record(directory):
    """Load the band_11, band_14 and band_15 arrays stored in *directory*.

    Each band is read with ``np.load`` from ``<directory>/<band name>.npy``.

    Args:
        directory: Path of the record directory containing the ``.npy`` files.

    Returns:
        A dict mapping each band name to its loaded array.
    """
    band_names = ("band_11", "band_14", "band_15")
    return {
        name: np.load(os.path.join(directory, name + ".npy"))
        for name in band_names
    }

# Normalization helper: rescales data so that the given bounds map onto [0, 1]
def normalize_range(data, bounds):
    """Linearly rescale *data* so that bounds[0] maps to 0 and bounds[1] to 1.

    Values outside the bounds are not clipped here; they fall outside [0, 1].

    Args:
        data: Number or array to rescale.
        bounds: Indexable pair ``(lower, upper)``.

    Returns:
        ``(data - lower) / (upper - lower)``.
    """
    lower = bounds[0]
    span = bounds[1] - lower
    return (data - lower) / span


# Build the false-color image for one record
def get_false_color(record_data):
    """Compose a false-color image from the band_11, band_14 and band_15 arrays.

    Each channel is rescaled with ``normalize_range``; the three channels are
    stacked along axis 2 and clipped to [0, 1]:

        channel 0: band_15 - band_14, bounds (-4, 2)
        channel 1: band_14 - band_11, bounds (-4, 5)
        channel 2: band_14,           bounds (243, 303)

    Args:
        record_data: Mapping with the keys "band_11", "band_14" and "band_15".

    Returns:
        The slice at index 4 of the last axis of the stacked, clipped array.
    """
    t11_bounds = (243, 303)
    cloud_top_tdiff_bounds = (-4, 5)
    tdiff_bounds = (-4, 2)
    # Index taken on the last axis; presumably the time axis, with four
    # frames preceding the selected one (TODO confirm against the dataset).
    frame_index = 4

    band_15 = record_data["band_15"]
    band_14 = record_data["band_14"]
    band_11 = record_data["band_11"]

    channels = [
        normalize_range(band_15 - band_14, tdiff_bounds),
        normalize_range(band_14 - band_11, cloud_top_tdiff_bounds),
        normalize_range(band_14, t11_bounds),
    ]
    clipped = np.clip(np.stack(channels, axis=2), 0, 1)
    return clipped[..., frame_index]

In [None]:
# Collect the path of every entry directly under the training directory
train_images = glob(f"{input_dir}/*")
print(f"train_images: {len(train_images)}")
# Last expression of the cell: evaluates to the first five paths for a quick check
train_images[:5]

In [None]:
# Walk over every training record and write its image and label files.
# The target directories are created here so this cell does not depend on
# the earlier shell cell having been run.
os.makedirs(f"{output_dir}/images", exist_ok=True)
os.makedirs(f"{output_dir}/labels", exist_ok=True)

for path in tqdm(train_images):
    # The record id is the name of the record directory. `record_id` avoids
    # shadowing the builtin `id`.
    record_id = os.path.basename(path)

    # NOTE: no per-record directory is created under output_dir; the outputs
    # go to images/ and labels/ only, so such a directory would stay empty.
    record_data = read_record(path)
    img = get_false_color(record_data)
    np.save(f"{output_dir}/images/{record_id}.npy", img)

    msk = np.load(f"{path}/human_pixel_masks.npy")
    label_path = f"{output_dir}/labels/{record_id}.png"
    # cv2.imwrite reports failure through its return value instead of raising,
    # so check it to avoid silently missing label files.
    if not cv2.imwrite(label_path, msk):
        raise OSError(f"Failed to write label image: {label_path}")

# 2. 制作splits

In [None]:
# 导入相关的库
from glob import glob
from sklearn.model_selection import GroupKFold, KFold
import cv2
import numpy as np
from tqdm import tqdm

# Collect the paths of all label files. The list is sorted because glob
# returns matches in arbitrary order; a stable order of image_ids is what
# makes the seeded K-fold split built from it reproducible.
all_image_files = sorted(glob("../input/mmseg_data/labels/*.png"))
# The image id is the file name without its extension
image_ids = [filepath.split("/")[-1].split(".")[0] for filepath in all_image_files]
print(f"Total number of images: {len(image_ids)}")
print(f"First 5 image ids: {image_ids[:5]}")

# Partition the dataset with shuffled 5-fold cross-validation (fixed seed 42);
# each entry of `split` is a (train indices, validation indices) pair
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
split = list(kfold.split(image_ids))

# Decide once per image whether its label contains any foreground pixel.
# An image can be in the training part of more than one fold, so doing this
# check inside the fold loop would decode the same PNG repeatedly.
foreground_ids = set()
for image_id in image_ids:
    label_path = f"../input/mmseg_data/labels/{image_id}.png"
    label = cv2.imread(label_path, cv2.IMREAD_GRAYSCALE)
    if label is None:
        # cv2.imread returns None instead of raising when a file cannot be read
        raise FileNotFoundError(f"Could not read label image: {label_path}")
    if label.any():
        foreground_ids.add(image_id)

# For every fold, write the training and validation split files.
# NOTE: only the foreground-filtered training list is written; to also keep the
# unfiltered list, write every id of train_idx to splits/fold_{fold}.txt.
for fold, (train_idx, valid_idx) in enumerate(split):
    # Training list restricted to images whose label has foreground pixels
    fore_ids = [
        image_ids[idx] for idx in train_idx if image_ids[idx] in foreground_ids
    ]
    fore_cnt = len(fore_ids)
    with open(f"../input/mmseg_data/splits/fold_{fold}_fore.txt", "w") as f:
        f.writelines(f"{image_id}\n" for image_id in fore_ids)

    # Validation list
    with open(f"../input/mmseg_data/splits/holdout_{fold}.txt", "w") as f:
        f.writelines(f"{image_ids[idx]}\n" for idx in valid_idx)

    print(f"fold{fold} 训练集数量：{len(train_idx)}")
    print(f"fold{fold} 有前景的训练集数量：{fore_cnt}")
    print(f"fold{fold} 验证集数量：{len(valid_idx)}\n")