In [1]:
import os
import random
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import albumentations
import torch.optim as optim
from albumentations.pytorch import ToTensorV2, ToTensor

from C2C.models.resnet import *
from C2C import train
from C2C.loss import KLDLoss
from C2C.eval_model import *

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
torch.manual_seed(12)
torch.cuda.manual_seed(12)
np.random.seed(12)
random.seed(12)

torch.backends.cudnn.deterministic=True

### Read Data CSV

- CSV file must contain following columns:
    - path - location of each patch
    - wsi - Unique identifier for WSI
    - label - Label of WSI (Binary 0 or 1)
    - is_valid - If WSI part of validation cohort

In [3]:
CSV_PATH = '11-3-2021 celiac_normal_split.csv'
df = pd.read_csv(CSV_PATH)

In [4]:
# # Hard Coding for Celiac vs Normal

# def sample_path(li, bpath):
    
#     img_name = list(set([x.split('__')[0].split('_')[0] for x in li]))
#     valid_name = random.sample(img_name, int(0.2*len(img_name)))
#     train_name = list(set(img_name) - set(valid_name))
        
#     train_path = [os.path.join(bpath, x) for x in li if x.split('__')[0].split('_')[0] in train_name]
#     valid_path = [os.path.join(bpath, x) for x in li if x.split('__')[0].split('_')[0] in valid_name]
    
#     return train_path, valid_path

# def convert_list_to_nested_list(image_list):
#     sample_df = pd.DataFrame({'patch_name': image_list})
#     sample_df['img_name'] = sample_df['patch_name'].apply(lambda x: x.split('/')[-1].split('__')[0])
#     return list(sample_df.groupby('img_name')['patch_name'].apply(list))


# def extract_celiac_normal_df():
    
#     base_path = '/project/GutIntelligenceLab/ys5hd/MSDS/images_512x512_non_resized/threshold_0.5/'    

#     celiac_path = os.path.join(base_path, 'train/Celiac')
#     normal_path = os.path.join(base_path, 'train/Normal')    

#     # Extract Celiac
#     celiac_train, celiac_valid = sample_path(os.listdir(celiac_path), celiac_path)
#     # Extract Normal
#     normal_train, normal_valid = sample_path(os.listdir(normal_path), normal_path)

#     # Train Patches
#     train_patches = celiac_train + normal_train
#     # Valid Patches
#     valid_patches = celiac_valid + normal_valid   

#     # Train Image List
#     train_images = convert_list_to_nested_list(train_patches)
#     # Valid Image List
#     valid_images = convert_list_to_nested_list(valid_patches)

#     df = pd.DataFrame({'path': train_patches+valid_patches})
#     df['wsi'] = df['path'].apply(lambda x: x.split('/')[-1].split('__')[0])

#     train_img_name = [x[0].split('/')[-1].split('__')[0] for x in train_images]

#     df['is_valid'] = 1
#     df.loc[df['wsi'].isin(train_img_name), 'is_valid'] = 0
#     df['label'] = df['wsi'].str.startswith('C')
    
#     return df

# df = extract_celiac_normal_df()

def sample_for_testing(df):
    sample_for_check = df[df['wsi'].str.startswith('C')]['wsi'][:5].tolist() + df[(df['wsi'].str.startswith('C')) & (df['is_valid']==1)]['wsi'][:5].tolist()\
    + df[~df['wsi'].str.startswith('C')]['wsi'][:5].tolist() + df[(~df['wsi'].str.startswith('C')) & (df['is_valid']==1)]['wsi'][:5].tolist()
    df = df.loc[df['wsi'].isin(sample_for_check)].reset_index(drop=True)
    
    return df 

df = sample_for_testing(df)

In [5]:
df.head()

Unnamed: 0,path,wsi,is_valid,label
0,/project/GutIntelligenceLab/ys5hd/MSDS/images_...,C06-28_05,0,True
1,/project/GutIntelligenceLab/ys5hd/MSDS/images_...,C10-50_01,0,True
2,/project/GutIntelligenceLab/ys5hd/MSDS/images_...,C10-48_01,0,True
3,/project/GutIntelligenceLab/ys5hd/MSDS/images_...,C15-74_04,0,True
4,/project/GutIntelligenceLab/ys5hd/MSDS/images_...,C07-40_01,0,True


### Model

In [6]:
# Initialize Model
model_ft = WSIClassifier(2, bn_track_running_stats=True)
model_ft = model_ft.to(device)

### Loss, Optimizer, and Data Transformation

In [7]:
# Data Transforms
data_transforms = albumentations.Compose([
    ToTensor()
    ])    

# Cross Entropy Loss 
criterion_ce = nn.CrossEntropyLoss()
criterion_kld = KLDLoss()
criterion_dic = {'CE': criterion_ce, 'KLD': criterion_kld}

# Observe that all parameters are being optimized
optimizer = optim.Adam(model_ft.parameters(), lr=1e-4)

### Train Model

In [8]:
model_ft = train.train_model(model_ft, criterion_dic, optimizer, df, data_transforms=data_transforms,\
                             use_kld=True, num_epochs=2, fpath='trained/checkpoint.pt')

100%|██████████| 8/8 [00:00<00:00, 35810.49it/s]
  0%|          | 0/9 [00:00<?, ?it/s]

Epoch 0/1
----------


100%|██████████| 9/9 [00:49<00:00,  5.46s/it]
  0%|          | 0/8 [00:00<?, ?it/s]

train Loss Patch: 0.6983 Loss WSI: 0.7612 Loss KLD: 0.0004 Acc: 0.4444


100%|██████████| 8/8 [00:27<00:00,  3.45s/it]


Test Accuracy:  0.5
Predicted  Normal
Actual           
Disease         4
Normal          4


100%|██████████| 8/8 [00:00<00:00, 47326.42it/s]
  0%|          | 0/9 [00:00<?, ?it/s]

Epoch 1/1
----------


100%|██████████| 9/9 [00:43<00:00,  4.84s/it]


NMI: 0.41385914058010753


  0%|          | 0/8 [00:00<?, ?it/s]

train Loss Patch: 0.6974 Loss WSI: 0.6936 Loss KLD: 0.0004 Acc: 0.4444


100%|██████████| 8/8 [00:27<00:00,  3.39s/it]


Test Accuracy:  0.5
Predicted  Normal
Actual           
Disease         4
Normal          4
Training complete in 2m 58s
Best val Acc: 0.500000


### Evaluate on Test Data

In [8]:
TEST_PATH = '11-3-2021 celiac_normal_test_split.csv'
df_test = pd.read_csv(TEST_PATH)

In [9]:
df_test = eval_test(model_ft, df.iloc[:500], data_transforms)

100%|██████████| 5/5 [00:11<00:00,  2.36s/it]

Test Accuracy:  0.0





In [10]:
df_test.head()

Unnamed: 0,wsi,prediction,attention,path,actual
0,C06-28_05,0,-0.174141,/project/GutIntelligenceLab/ys5hd/MSDS/images_...,True
1,C06-28_05,0,-0.185678,/project/GutIntelligenceLab/ys5hd/MSDS/images_...,True
2,C06-28_05,0,-0.194565,/project/GutIntelligenceLab/ys5hd/MSDS/images_...,True
3,C06-28_05,0,-0.184081,/project/GutIntelligenceLab/ys5hd/MSDS/images_...,True
4,C06-28_05,0,-0.218684,/project/GutIntelligenceLab/ys5hd/MSDS/images_...,True
