In [1]:
# input data 압 축 풀 기 한 번 만 ! 
import zipfile

from google.colab import drive
drive.mount('/content/drive')

file_path = '/content/'

output_unzip = zipfile.ZipFile("/content/drive/MyDrive/project/BreastCancer/input/open.zip", "r")  # "r": read 모드
output_unzip.extractall(file_path)
output_unzip.close()

Mounted at /content/drive


In [2]:
import random
import pandas as pd
import numpy as np
import os
import cv2
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2

import torchvision.models as models

from sklearn import metrics
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings(action='ignore') 

In [3]:
# Cfg, 시드 정하기!!!!!!!
EPOCHS = 30
MODEL = 'densenet169'
VERSION = 1

CFG = {
    'IMG_SIZE': 512,
    'EPOCHS': EPOCHS,
    'LEARNING_RATE':1e-4,
    'BATCH_SIZE':16,
    'SEED':41
}

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

In [4]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

In [7]:
train_df['암의 장경'] = train_df['암의 장경'].fillna(train_df['암의 장경'].median())
train_df = train_df.fillna(0)

test_df['암의 장경'] = test_df['암의 장경'].fillna(train_df['암의 장경'].median())
test_df = test_df.fillna(0)

In [8]:
train_df, val_df, train_labels, val_labels = train_test_split(
                                                    train_df.drop(columns=['N_category']), 
                                                    train_df['N_category'], 
                                                    test_size=0.2, 
                                                    random_state=CFG['SEED']
                                                )

In [9]:
def get_values(value):
    return value.values.reshape(-1, 1)

numeric_cols = ['나이', '암의 장경', 'ER_Allred_score', 'PR_Allred_score', 'KI-67_LI_percent', 'HER2_SISH_ratio']
ignore_cols = ['ID', 'img_path', 'mask_path', '수술연월일', 'N_category']

for col in train_df.columns:
    if col in ignore_cols:
        continue
    if col in numeric_cols:
        scaler = StandardScaler() # 전처리
        train_df[col] = scaler.fit_transform(get_values(train_df[col]))
        val_df[col] = scaler.transform(get_values(val_df[col]))
        test_df[col] = scaler.transform(get_values(test_df[col]))
    else:
        le = LabelEncoder()
        train_df[col] = le.fit_transform(get_values(train_df[col]))
        val_df[col] = le.transform(get_values(val_df[col]))
        test_df[col] = le.transform(get_values(test_df[col]))

In [10]:
class CustomDataset(Dataset):
    def __init__(self, medical_df, labels, transforms=None):
        self.medical_df = medical_df
        self.transforms = transforms
        self.labels = labels
        
    def __getitem__(self, index):
        img_path = self.medical_df['img_path'].iloc[index]
        image = cv2.imread(img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        
        if self.transforms is not None:
            image = self.transforms(image=image)['image']
                
        if self.labels is not None:
            tabular = torch.Tensor(self.medical_df.drop(columns=['ID', 'img_path', 'mask_path', '수술연월일']).iloc[index])
            label = self.labels[index]
            return image, tabular, label
        else:
            tabular = torch.Tensor(self.medical_df.drop(columns=['ID', 'img_path', '수술연월일']).iloc[index])
            return image, tabular
        
    def __len__(self):
        return len(self.medical_df)

In [11]:
train_transforms = A.Compose([
                            A.HorizontalFlip(),
                            A.VerticalFlip(),
                            A.Rotate(limit=90, border_mode=cv2.BORDER_CONSTANT,p=0.3),
                            A.Resize(CFG['IMG_SIZE'],CFG['IMG_SIZE']),
                            A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), max_pixel_value=255.0, always_apply=False, p=1.0),
                            ToTensorV2()
                            ])

test_transforms = A.Compose([
                            A.Resize(CFG['IMG_SIZE'],CFG['IMG_SIZE']),
                            A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), max_pixel_value=255.0, always_apply=False, p=1.0),
                            ToTensorV2()
                            ])

In [12]:
train_dataset = CustomDataset(train_df, train_labels.values, train_transforms)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=0)

val_dataset = CustomDataset(val_df, val_labels.values, test_transforms)
val_loader = DataLoader(val_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

In [13]:
import os
 
def createFolder(directory):
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print ('Error: Creating directory. ' +  directory)

In [14]:
##gdrive path 설정
GPATH = '/content/drive/MyDrive/project/BreastCancer/baseline/'
print(GPATH)
createFolder(GPATH)


print('########################################################')
class ImgFeatureExtractor(nn.Module):
    def __init__(self):
        super(ImgFeatureExtractor, self).__init__()
        #####모델 입력하기 까먹지 말긔#####
        self.backbone = models.resnext50_32x4d(pretrained=True)
        self.embedding = nn.Linear(1000,512)
        
    def forward(self, x):
        x = self.backbone(x)
        x = self.embedding(x)
        return x

print('########################################################')    
class TabularFeatureExtractor(nn.Module):
    def __init__(self):
        super(TabularFeatureExtractor, self).__init__()
        self.embedding = nn.Sequential(
            nn.Linear(in_features=23, out_features=128),
            nn.BatchNorm1d(128),
            nn.LeakyReLU(),
            nn.Linear(in_features=128, out_features=256),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(),
            nn.Linear(in_features=256, out_features=512),
            nn.BatchNorm1d(512),
            nn.LeakyReLU(),
            nn.Linear(in_features=512, out_features=512)
        )
        
    def forward(self, x):
        x = self.embedding(x)
        return x
print('########################################################')    
class ClassificationModel(nn.Module):
    def __init__(self):
        super(ClassificationModel, self).__init__()
        self.img_feature_extractor = ImgFeatureExtractor()
        self.tabular_feature_extractor = TabularFeatureExtractor()
        self.classifier = nn.Sequential(
            nn.Linear(in_features=1024, out_features=1),
            nn.Sigmoid(),
        )
        
    def forward(self, img, tabular):
        img_feature = self.img_feature_extractor(img)
        tabular_feature = self.tabular_feature_extractor(tabular)
        feature = torch.cat([img_feature, tabular_feature], dim=-1)
        output = self.classifier(feature)
        return output
print('########################################################')
def train(model, optimizer, train_loader, val_loader, scheduler, device):
    dic_log={
        'epoch':[],
        'train_loss':[],
        'val_loss':[],
        'val_score':[]
    }
    model.to(device)
    criterion = nn.BCEWithLogitsLoss().to(device)
    
    best_score = 0
    best_model = None
    
    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        for img, tabular, label in tqdm(iter(train_loader)):
            img = img.float().to(device)
            tabular = tabular.float().to(device)
            label = label.float().to(device)
            
            optimizer.zero_grad()
            
            model_pred = model(img, tabular)
            
            loss = criterion(model_pred, label.reshape(-1,1))
            
            loss.backward()
            optimizer.step()
            
            train_loss.append(loss.item())
            tl = np.mean(train_loss)
        val_loss, val_score = validation(model, criterion, val_loader, device)
        print(f'Epoch [{epoch}], Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}] Val Score : [{val_score:.5f}]')
        ##### 에폭마다 log 저장하기!!!
        dic_log['epoch'].append(epoch)
        dic_log['train_loss'].append(np.mean(train_loss))
        dic_log['val_loss'].append(val_loss)
        dic_log['val_score'].append(val_score)

        if scheduler is not None:
            scheduler.step(val_score)
        
        if best_score < val_score:
            best_score = val_score
            best_model = model
            # best 뽑기
            test_dataset = CustomDataset(test_df, None, test_transforms)
            test_loader = DataLoader(test_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False, num_workers=0)
            torch.save(model, GPATH+f'{MODEL}/{VERSION}/ckp/best_{MODEL}_{epoch}.pt') # 각 epoch가 끝날 때마다 모델 상태 저장
            predictions,preds = inference(best_model, test_loader, device)
            print('########################################################')
            submit = pd.read_csv('./sample_submission.csv')
            submit['N_category'] = preds
            submit['predictions'] = predictions
            submit.to_csv(GPATH+f'{MODEL}/{VERSION}/submit/best_submit_{MODEL}_{EPOCHS}_{epoch}.csv', index=False)


        torch.save(model, GPATH+f'{MODEL}/{VERSION}/ckp/checkpoint_{MODEL}_{epoch}.pt') # 각 epoch가 끝날 때마다 모델 상태 저장
        print('########################################################')
        ##### 에폭마다 submission 뽑기!
        submit = pd.read_csv('./sample_submission.csv')
        predictions,preds = inference(model, test_loader, device)
        submit['N_category'] = preds
        submit['predictions'] = predictions
        submit.to_csv(GPATH+ f'{MODEL}/{VERSION}/submit/submit_{MODEL}_{epoch}.csv', index=False)

    # 로그 저장하기!
    df_log = pd.DataFrame(dic_log)
    df_log.to_csv(GPATH+ f'{MODEL}/{VERSION}/log_{MODEL}_{VERSION}.csv',index=False)
    return best_model

print('########################################################')
def validation(model, criterion, val_loader, device):
    model.eval()
    pred_labels = []
    true_labels = []
    val_loss = []
    threshold = 0.1
    with torch.no_grad():
        for img, tabular, label in tqdm(iter(val_loader)):
            true_labels += label.tolist()
            
            img = img.float().to(device)
            tabular = tabular.float().to(device)
            label = label.float().to(device)
            
            model_pred = model(img, tabular)
            
            loss = criterion(model_pred, label.reshape(-1,1))
            
            val_loss.append(loss.item())
            
            model_pred = model_pred.squeeze(1).to('cpu')  
            pred_labels += model_pred.tolist()


    pred_labels = np.where(np.array(pred_labels) > threshold, 1, 0)
    val_score = metrics.f1_score(y_true=true_labels, y_pred=pred_labels, average='macro')


    return np.mean(val_loss), val_score

print('########################################################')

def inference(model, test_loader, device):
    model.to(device)
    model.eval()
    preds = []
    threshold = 0.1
    
    with torch.no_grad():
        for img, tabular in tqdm(iter(test_loader)):
            img = img.float().to(device)
            tabular = tabular.float().to(device)
            
            model_pred = model(img, tabular)
            
            model_pred = model_pred.squeeze(1).to('cpu')
            
            preds += model_pred.tolist()
    
    predictions=np.array(preds)
    preds = np.where(np.array(preds) > threshold, 1, 0)
    
    return predictions,preds


/content/drive/MyDrive/project/BreastCancer/baseline/
########################################################
########################################################
########################################################
########################################################
########################################################
########################################################


# MODEL_1

In [15]:
print(CFG)

{'IMG_SIZE': 512, 'EPOCHS': 30, 'LEARNING_RATE': 0.0001, 'BATCH_SIZE': 16, 'SEED': 41}


In [16]:
createFolder(GPATH+'/'+MODEL+'/'+str(VERSION))
createFolder(GPATH+'/'+MODEL+'/'+str(VERSION)+'/ckp/')
createFolder(GPATH+'/'+MODEL+'/'+str(VERSION)+'/submit/')
print(GPATH+'/'+MODEL+'/'+str(VERSION))

device = torch.device("cuda")
model = nn.DataParallel(ClassificationModel())

# 학습 이어 하기
#PATH = '/content/drive/MyDrive/project/BreastCancer/baseline/resnext50_32x4d/1/ckp/best_resnext50_32x4d_9.pt'
#model = torch.load(PATH)
model.eval()
optimizer = torch.optim.AdamW(params = model.parameters(), lr = CFG["LEARNING_RATE"])
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=1, threshold_mode='abs',min_lr=1e-8, verbose=True)

infer_model = train(model, optimizer, train_loader, val_loader, scheduler, device)



/content/drive/MyDrive/project/BreastCancer/baseline//densenet169/1


Downloading: "https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth" to /root/.cache/torch/hub/checkpoints/resnext50_32x4d-7cdf4587.pth


  0%|          | 0.00/95.8M [00:00<?, ?B/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [1], Train Loss : [0.65304] Val Loss : [0.64987] Val Score : [0.65240]


  0%|          | 0/16 [00:00<?, ?it/s]

########################################################
########################################################


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [2], Train Loss : [0.62318] Val Loss : [0.61125] Val Score : [0.74000]


  0%|          | 0/16 [00:00<?, ?it/s]

########################################################
########################################################


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [3], Train Loss : [0.60534] Val Loss : [0.60864] Val Score : [0.73350]
########################################################


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [4], Train Loss : [0.59024] Val Loss : [0.60331] Val Score : [0.77198]


  0%|          | 0/16 [00:00<?, ?it/s]

########################################################
########################################################


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [5], Train Loss : [0.58884] Val Loss : [0.65562] Val Score : [0.67505]
########################################################


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [6], Train Loss : [0.58618] Val Loss : [0.60958] Val Score : [0.76812]
Epoch 00006: reducing learning rate of group 0 to 5.0000e-05.
########################################################


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [7], Train Loss : [0.58216] Val Loss : [0.60712] Val Score : [0.75978]
########################################################


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [8], Train Loss : [0.57722] Val Loss : [0.60516] Val Score : [0.77454]


  0%|          | 0/16 [00:00<?, ?it/s]

########################################################
########################################################


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [9], Train Loss : [0.57093] Val Loss : [0.61628] Val Score : [0.72976]
########################################################


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [10], Train Loss : [0.56295] Val Loss : [0.65320] Val Score : [0.66473]
Epoch 00010: reducing learning rate of group 0 to 2.5000e-05.
########################################################


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [11], Train Loss : [0.56156] Val Loss : [0.61649] Val Score : [0.72989]
########################################################


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [12], Train Loss : [0.56017] Val Loss : [0.60922] Val Score : [0.75845]
Epoch 00012: reducing learning rate of group 0 to 1.2500e-05.
########################################################


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [13], Train Loss : [0.56239] Val Loss : [0.61297] Val Score : [0.73906]
########################################################


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [14], Train Loss : [0.55431] Val Loss : [0.60673] Val Score : [0.75499]
Epoch 00014: reducing learning rate of group 0 to 6.2500e-06.
########################################################


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [15], Train Loss : [0.54801] Val Loss : [0.61280] Val Score : [0.73906]
########################################################


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [16], Train Loss : [0.54585] Val Loss : [0.61405] Val Score : [0.74877]
Epoch 00016: reducing learning rate of group 0 to 3.1250e-06.
########################################################


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [17], Train Loss : [0.54600] Val Loss : [0.60665] Val Score : [0.77998]


  0%|          | 0/16 [00:00<?, ?it/s]

########################################################
########################################################


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [18], Train Loss : [0.55223] Val Loss : [0.60796] Val Score : [0.74977]
########################################################


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [19], Train Loss : [0.55007] Val Loss : [0.60688] Val Score : [0.75978]
Epoch 00019: reducing learning rate of group 0 to 1.5625e-06.
########################################################


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [20], Train Loss : [0.54232] Val Loss : [0.60910] Val Score : [0.76979]
########################################################


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [21], Train Loss : [0.54377] Val Loss : [0.60935] Val Score : [0.75962]
Epoch 00021: reducing learning rate of group 0 to 7.8125e-07.
########################################################


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [22], Train Loss : [0.54157] Val Loss : [0.61140] Val Score : [0.75396]
########################################################


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [23], Train Loss : [0.54746] Val Loss : [0.61212] Val Score : [0.74877]
Epoch 00023: reducing learning rate of group 0 to 3.9063e-07.
########################################################


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [24], Train Loss : [0.54810] Val Loss : [0.60836] Val Score : [0.76452]
########################################################


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [25], Train Loss : [0.54608] Val Loss : [0.60843] Val Score : [0.74937]
Epoch 00025: reducing learning rate of group 0 to 1.9531e-07.
########################################################


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [26], Train Loss : [0.54532] Val Loss : [0.60773] Val Score : [0.75913]
########################################################


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [27], Train Loss : [0.54224] Val Loss : [0.60753] Val Score : [0.76452]
Epoch 00027: reducing learning rate of group 0 to 9.7656e-08.
########################################################


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [28], Train Loss : [0.54200] Val Loss : [0.60819] Val Score : [0.75913]
########################################################


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [29], Train Loss : [0.54483] Val Loss : [0.61430] Val Score : [0.74877]
Epoch 00029: reducing learning rate of group 0 to 4.8828e-08.
########################################################


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [30], Train Loss : [0.54638] Val Loss : [0.60676] Val Score : [0.75450]
########################################################


  0%|          | 0/16 [00:00<?, ?it/s]

# test

In [17]:
test_dataset = CustomDataset(test_df, None, test_transforms)
test_loader = DataLoader(test_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

predictions,preds = inference(infer_model, test_loader, device)


submit = pd.read_csv('./sample_submission.csv')
submit['N_category'] = preds
submit['predictions'] = predictions
submit.to_csv(f'/content/drive/MyDrive/project/BreastCancer/baseline/{MODEL}/{VERSION}/submit/{MODEL}_{EPOCHS}_th1.csv', index=False)

  0%|          | 0/16 [00:00<?, ?it/s]