In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
from tensorflow.keras.layers import (Dense, GlobalAveragePooling2D, Input, Dropout, 
                                     Flatten, Concatenate, GlobalMaxPooling2D)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from keras.losses import binary_crossentropy

import os
from glob import glob
from random import shuffle
import cv2
import tifffile
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
df_train = pd.read_csv("../input/histopathologic-cancer-detection/train_labels.csv")
id_label_map = df_train.set_index('id')['label'].to_dict()

In [3]:
# 이미지 삭제코드 누락
import concurrent.futures
from pathlib import Path

def id_from_file_path(file_path):
    file_name = Path(file_path).name
    if file_name.endswith('.tif'):
        return file_name[:-4]
    return file_name

def get_file_paths_from_directory(directory):
    return [str(file) for file in Path(directory).rglob("*.tif")]

def process_files_in_parallel(file_paths):
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # 병렬로 파일 경로 처리
        return list(executor.map(id_from_file_path, file_paths))

directory_path = "../input/histopathologic-cancer-detection/train"
tif_files = get_file_paths_from_directory(directory_path)

file_ids = process_files_in_parallel(tif_files)

print(file_ids[:5])

['d43c081bafa286f9c1f7e921883f26ceafebc912', '092d0eedebce504847715ee046b6ad74b57599b4', 'b0d2582c6218a8764323fc940b41312282b99bf4', '187c99df762f13f99818e5593d4bab4c6577e7e3', '7c5270c83837de5a5cbb2dca511559dc39d19d53']


In [4]:
labeled_tif = glob('../input/histopathologic-cancer-detection/train/*.tif')
test_tif = glob('../input/histopathologic-cancer-detection/test/*.tif')

In [5]:
print("Number of labeled tif :", len(labeled_tif))
print("Number of test tif :", len(test_tif))

Number of labeled tif : 220025
Number of test tif : 57458


In [6]:
train, val = train_test_split(labeled_tif, test_size=0.1, random_state=42)

In [7]:
def chunker(seq, size):
    return (seq[position: position + size] for position in range(0, len(seq), size))

In [8]:
def increase_green_channel(image):
    image[:, :, 1] = np.clip(image[:, :, 1] * 1.5, 0, 255)  
    return image

def image_to_green_histogram(image_path):
    with tifffile.TiffFile(image_path) as tif:
        image = tif.asarray()
    
    if image.ndim == 3:
        green_channel = image[:, :, 1]  
    else:
        green_channel = image 
    
    hist, _ = np.histogram(green_channel, bins=256, range=(0, 256))
    
    return hist


def convert_images_to_histograms(directory, num_images):
    histograms = []
    count = 0
    
    for filename in os.listdir(directory):
        if filename.endswith(".tif"):
            image_path = os.path.join(directory, filename)
            histogram = image_to_green_histogram(image_path)
            histograms.append(histogram)
            count += 1
                        
            if count >= num_images:
                break
    
    return np.array(histograms)

In [9]:
def data_gen(list_files, id_label_map, batch_size, augment=False):
    tif_gen = ImageDataGenerator(
        rotation_range=8,
        width_shift_range=0.1,
        height_shift_range=0.1,
        horizontal_flip=True,
        vertical_flip=True,
        zoom_range=0.15,
        shear_range=4,
        preprocessing_function=increase_green_channel   
    )
    
    while True:
        shuffle(list_files)
        for batch in chunker(list_files, batch_size):
            X = [cv2.imread(x) for x in batch]
            Y = [id_label_map[id_from_file_path(x)] for x in batch]
            histograms = [image_to_green_histogram(x) for x in batch]   
            
            if augment:
                X = [tif_gen.random_transform(x) for x in X]
            
            X = [preprocess_input(x.astype(np.float32)) for x in X]
            
            X = np.array(X)
            histograms = np.array(histograms)   
            Y = np.array(Y)   
            
            yield (X, histograms), Y   

In [10]:
# 히스토그램 추가해서 예측하는 코드
def mobilenetv2(histogram_input_shape=(256,), image_input_shape=(96, 96, 3)):
    # 이미지 입력
    image_inputs = Input(shape=image_input_shape)
    
    # MobileNetV2 모델
    base_model = MobileNetV2(include_top=False, input_shape=image_input_shape)
    x = base_model(image_inputs)
    
    # 다양한 풀링 레이어
    out1 = GlobalMaxPooling2D()(x)
    out2 = GlobalAveragePooling2D()(x)
    out3 = Flatten()(x)
    
    # 풀링 결과를 결합
    combined_features = Concatenate(axis=-1)([out1, out2, out3])
    combined_features = Dropout(0.5)(combined_features)
    
    # 히스토그램 입력
    histogram_inputs = Input(shape=histogram_input_shape)
    
    # 히스토그램 데이터를 처리하는 Dense 레이어
    histogram_dense = Dense(128, activation='relu')(histogram_inputs)
    
    # 이미지와 히스토그램 데이터를 결합
    combined = Concatenate(axis=-1)([combined_features, histogram_dense])
    
    # 최종 출력 레이어
    out = Dense(1, activation="sigmoid", name="output")(combined)
    
    # 모델 정의
    model = Model(inputs=[image_inputs, histogram_inputs], outputs=out)
    model.compile(optimizer=Adam(0.0001), loss=binary_crossentropy, metrics=['acc'])
    model.summary()

    return model


In [11]:
model = mobilenetv2()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_96_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


In [12]:
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint

h5_path = "model.keras"

# ModelCheckpoint 콜백
checkpoint = ModelCheckpoint(
    h5_path,                  # 모델 가중치를 저장할 경로
    monitor='val_acc',        # 모니터링할 지표
    verbose=1,                # 로그 출력 여부
    save_best_only=True,      # 성능이 개선된 경우에만 저장
    mode='max'                # val_acc가 최대가 될 때 모델 저장
)

# 얼리 스토핑 콜백 (검증 정확도 모니터링)
early_stopping_acc = EarlyStopping(monitor='val_acc', patience=3, verbose=1, mode='max', restore_best_weights=True)

# 얼리 스토핑 콜백 (검증 손실 모니터링)
early_stopping_loss = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=1)

# 학습률 감소 콜백
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', patience=3, factor=0.5, min_lr=0.00001, verbose=1)



In [13]:
h5_path = "model.keras"

# 콜백 설정
checkpoint = ModelCheckpoint(h5_path, monitor='val_acc', save_best_only=True, mode='max', verbose=1)
early_stopping_acc = EarlyStopping(monitor='val_acc', patience=3, verbose=1, mode='max', restore_best_weights=True)
early_stopping_loss = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6, verbose=1)

# 첫 번째 학습
batch_size = 32
history = model.fit(
    data_gen(train, id_label_map, batch_size, augment=True),
    validation_data=data_gen(val, id_label_map, batch_size),
    epochs=2, verbose=1,
    callbacks=[checkpoint, early_stopping_acc, early_stopping_loss, reduce_lr],
    steps_per_epoch=len(train) // batch_size,
    validation_steps=len(val) // batch_size)

# 두 번째 학습
batch_size = 64
history = model.fit(
    data_gen(train, id_label_map, batch_size, augment=True),
    validation_data=data_gen(val, id_label_map, batch_size),
    epochs=6, verbose=1,
    callbacks=[checkpoint, early_stopping_acc, early_stopping_loss, reduce_lr],
    steps_per_epoch=len(train) // batch_size,
    validation_steps=len(val) // batch_size)

# 세 번째 학습
model.compile(optimizer=Adam(0.00001), loss=binary_crossentropy, metrics=['acc'])
history = model.fit(
    data_gen(train, id_label_map, batch_size, augment=True),
    validation_data=data_gen(val, id_label_map, batch_size),
    epochs=6, verbose=1,
    callbacks=[checkpoint, early_stopping_acc, early_stopping_loss, reduce_lr],
    steps_per_epoch=len(train) // batch_size,
    validation_steps=len(val) // batch_size)

# 모델 가중치 로드
model.load_weights(h5_path)

Epoch 1/2
[1m6188/6188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 360ms/step - acc: 0.8468 - loss: 0.4539
Epoch 1: val_acc improved from -inf to 0.91626, saving model to model.keras
[1m6188/6188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2484s[0m 395ms/step - acc: 0.8468 - loss: 0.4538 - val_acc: 0.9163 - val_loss: 0.2240 - learning_rate: 1.0000e-04
Epoch 2/2
[1m6188/6188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 166ms/step - acc: 0.9184 - loss: 0.2156
Epoch 2: val_acc improved from 0.91626 to 0.93514, saving model to model.keras
[1m6188/6188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1118s[0m 178ms/step - acc: 0.9184 - loss: 0.2156 - val_acc: 0.9351 - val_loss: 0.1804 - learning_rate: 1.0000e-04
Restoring model weights from the end of the best epoch: 2.
Restoring model weights from the end of the best epoch: 2.
Epoch 1/6
[1m3094/3094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 310ms/step - acc: 0.9427 - loss: 0.1555
Epoch 1: val_acc did 

In [14]:
preds = []
ids = []
batch_size = 32

# 데이터 제너레이터를 사용하여 예측
for batch in chunker(test_tif, batch_size):
    # 이미지 읽기 및 전처리
    X = [cv2.imread(x) for x in batch]
    ids_batch = [id_from_file_path(x) for x in batch]
    
    # 전처리 및 히스토그램 계산
    histograms = [image_to_green_histogram(x) for x in batch]  # 히스토그램 계산
    X = [preprocess_input(x.astype(np.float32)) for x in X]
    
    # NumPy 배열로 변환
    X = np.array(X)
    histograms = np.array(histograms)  # 히스토그램 배열로 변환
    
    # 예측 수행
    preds_batch = (
        (model.predict([X, histograms]).ravel() *
         model.predict([X[:, ::-1, :, :], histograms]).ravel() *
         model.predict([X[:, ::-1, ::-1, :], histograms]).ravel() *
         model.predict([X[:, :, ::-1, :], histograms]).ravel()) ** 0.25
    ).tolist()
    
    preds += preds_batch
    ids += ids_batch

# 결과를 DataFrame으로 변환하고 CSV로 저장
df = pd.DataFrame({'id': ids, 'label': preds})
df.to_csv("submission.csv", index=False)
df.head()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms

Unnamed: 0,id,label
0,a7ea26360815d8492433b14cd8318607bcf99d9e,0.023997
1,59d21133c845dff1ebc7a0c7cf40c145ea9e9664,0.000669
2,5fde41ce8c6048a5c2f38eca12d6528fa312cdbb,0.101719
3,bd953a3b1db1f7041ee95ff482594c4f46c73ed0,0.104361
4,523fc2efd7aba53e597ab0f69cc2cbded7a6ce62,0.009658


In [15]:
df.head()

Unnamed: 0,id,label
0,a7ea26360815d8492433b14cd8318607bcf99d9e,0.023997
1,59d21133c845dff1ebc7a0c7cf40c145ea9e9664,0.000669
2,5fde41ce8c6048a5c2f38eca12d6528fa312cdbb,0.101719
3,bd953a3b1db1f7041ee95ff482594c4f46c73ed0,0.104361
4,523fc2efd7aba53e597ab0f69cc2cbded7a6ce62,0.009658


![Image](https://github.com/user-attachments/assets/d11599b8-c818-4888-a944-c367cb911b1d)