In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input mount and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os, cv2, random, math, glob
import numpy as np, pandas as pd, matplotlib.pyplot as plt
import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Dataset root and compute device shared by the later cells.
ROOT = '/kaggle/input/histopathologic-cancer-detection'
if torch.cuda.is_available():
    dev = torch.device('cuda')
else:
    dev = torch.device('cpu')
print('device =', dev)

# Sanity check: is the label file present, and how many .tif tiles are mounted?
labels_csv = f'{ROOT}/train_labels.csv'
train_tifs = glob.glob(f'{ROOT}/train/*.tif')
test_tifs = glob.glob(f'{ROOT}/test/*.tif')
print('labels:', os.path.exists(labels_csv))
print('train tiles:', len(train_tifs))
print('test tiles:',  len(test_tifs))

In [None]:
# Load the training label table and summarise the class balance.
labels = pd.read_csv(f'{ROOT}/train_labels.csv')

label_col = labels['label']
class_counts = label_col.value_counts()
class_pct = (label_col.value_counts(normalize=True) * 100).round(2)

print('rows:', len(labels))
print('class counts:\n', class_counts)
print('class %:\n', class_pct, '%')

def pltgrid(imgids, title, ncols=4):
    """Show the training tiles named by ``imgids`` in a grid of small images.

    Args:
        imgids: iterable of tile ids; each tile is read from
            ``{ROOT}/train/{id}.tif``.
        title: figure-level title.
        ncols: number of grid columns (default 4, the previously fixed layout).

    Returns:
        None. The figure is displayed with ``plt.show()``.

    Raises:
        FileNotFoundError: if OpenCV cannot read one of the tiles.
    """
    imgids = list(imgids)
    if not imgids:
        # A 0-row grid cannot be built by plt.subplots, so there is nothing to draw.
        print(f'{title}: no images to show')
        return None

    nrows = math.ceil(len(imgids) / ncols)
    # squeeze=False always yields a 2-D axes array, so ravel() is safe for any grid shape.
    figobj, axgrid = plt.subplots(nrows, ncols, figsize=(ncols*2, nrows*2), squeeze=False)
    axgrid = axgrid.ravel()

    # Hide the frame of every cell up front; unused trailing cells simply stay blank.
    for ax in axgrid:
        ax.axis('off')

    for ax, imgid in zip(axgrid, imgids):
        imgpath = f'{ROOT}/train/{imgid}.tif'
        imgbgr = cv2.imread(imgpath)
        if imgbgr is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f'could not read image: {imgpath}')
        ax.imshow(cv2.cvtColor(imgbgr, cv2.COLOR_BGR2RGB))

    figobj.suptitle(title)
    plt.tight_layout()
    plt.show()
    return None

# Draw 8 tile ids per class (fixed seed) and display them as two grids.
positive_rows = labels[labels['label'] == 1]
negative_rows = labels[labels['label'] == 0]
posimg = positive_rows.sample(n=8, random_state=7)['id'].tolist()
negimg = negative_rows.sample(n=8, random_state=7)['id'].tolist()

for tile_ids, caption in ((posimg, 'random positive tiles'),
                          (negimg, 'random negative tiles')):
    pltgrid(tile_ids, caption)

In [None]:
# Stratified 90/10 split on the label column, then shrink both parts to
# fixed-size random subsets (20000 train rows, 4000 validation rows).
split_parts = train_test_split(
    labels,
    stratify=labels['label'],
    test_size=0.10,
    random_state=7,
)
tdat = split_parts[0].sample(n=20000, random_state=7)
vdat = split_parts[1].sample(n=4000, random_state=7)
print('train rows:', len(tdat), '| val rows:', len(vdat))

In [None]:
def imgtrans(imgrgb, istrain=True):
    """Convert an HWC image array to a float32 CHW array scaled by 1/255.

    When ``istrain`` is true the image is first augmented: a horizontal flip
    with probability 0.5, followed by a rotation of 0-3 quarter turns. Both
    draws come from the ``random`` module.

    Args:
        imgrgb: image array indexed as (row, column, channel).
        istrain: apply the random flip/rotation when true.

    Returns:
        float32 numpy array with the channel axis moved to the front.
    """
    if istrain:
        flip_horizontally = random.random() < 0.5
        if flip_horizontally:
            # Reversed slicing gives negative strides; copy into a contiguous buffer.
            imgrgb = np.ascontiguousarray(imgrgb[:, ::-1, :])
        quarter_turns = random.randint(0, 3)
        rotated = np.rot90(imgrgb, quarter_turns)
        imgrgb = np.ascontiguousarray(rotated)

    scaled = imgrgb.astype(np.float32) / 255.0
    return np.transpose(scaled, (2, 0, 1))

class PCamDataset(Dataset):
    """Dataset over labelled training tiles stored as ``{root}/train/{id}.tif``.

    Each item is a ``(image, label)`` pair: the image is the float32 CHW tensor
    produced by ``imgtrans`` and the label is a float32 scalar tensor.
    """

    def __init__(self, table, root, istrain):
        """Store the label table and options.

        Args:
            table: DataFrame with ``id`` and ``label`` columns.
            root: dataset root directory containing the ``train`` folder.
            istrain: forwarded to ``imgtrans`` to enable random augmentation.
        """
        # Reset the index so positional access lines up with __len__.
        self.df = table.reset_index(drop=True)
        self.root = root
        self.istrain = istrain

    def __len__(self):
        """Return the number of rows in the label table."""
        return len(self.df)

    def __getitem__(self, i):
        """Load tile ``i`` and return ``(image_tensor, label_tensor)``.

        Raises:
            FileNotFoundError: if OpenCV cannot read the tile file.
        """
        row = self.df.iloc[i]
        imgpath = f'{self.root}/train/{row.id}.tif'
        imgbgr = cv2.imread(imgpath)
        if imgbgr is None:
            # cv2.imread returns None rather than raising on a missing/corrupt file;
            # fail here with the path instead of an opaque cvtColor error.
            raise FileNotFoundError(f'could not read image: {imgpath}')
        imgrgb = cv2.cvtColor(imgbgr, cv2.COLOR_BGR2RGB)
        xchw = imgtrans(imgrgb, istrain=self.istrain)
        ylab = np.float32(row.label)
        return torch.from_numpy(xchw), torch.tensor(ylab)

# Wrap the train/validation tables in datasets, then in batch-32 loaders.
# Only the training loader shuffles and augments.
trainset = PCamDataset(tdat, ROOT, True)
valset = PCamDataset(vdat, ROOT, False)
loader_opts = dict(batch_size=32, num_workers=0, pin_memory=False)
dattrain = DataLoader(trainset, shuffle=True, **loader_opts)
datval = DataLoader(valset, shuffle=False, **loader_opts)
len(dattrain), len(datval)

In [None]:
class SmallNet(nn.Module):
    """Small CNN producing one raw score (logit) per input image.

    Feature extractor: three 3x3 convolutions (3 -> 16 -> 32 -> 64 channels,
    padding 1) each followed by ReLU, with 2x2 max-pooling after the first two
    and a global average pool at the end. A linear layer maps the 64 pooled
    features to a single output.
    """

    def __init__(self):
        super().__init__()
        stages = [
            nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d(output_size=1),
        ]
        self.conv = nn.Sequential(*stages)
        self.head = nn.Linear(in_features=64, out_features=1)

    def forward(self, x):
        """Return a 1-D tensor with one score per sample in the batch ``x``."""
        batch_size = x.size(0)
        pooled = self.conv(x)
        # Collapse the (64, 1, 1) pooled map into a flat 64-vector per sample.
        flat = pooled.view(batch_size, -1)
        scores = self.head(flat)
        return scores.squeeze(1)

# Build the network, move it to the selected device, and display its layout.
model = SmallNet()
model = model.to(dev)
model

In [None]:
# Binary cross-entropy on raw logits, optimised with Adam at lr=1e-3.
losses = nn.BCEWithLogitsLoss()
opti = torch.optim.Adam(params=model.parameters(), lr=1e-3)

def runitonce(netmod, datload, dotrain=True):
    """Run one full pass over ``datload`` and report mean loss and ROC AUC.

    Uses the module-level ``dev`` (device), ``losses`` (criterion) and
    ``opti`` (optimizer). When ``dotrain`` is true the model is put in train
    mode and the optimizer steps after every batch; otherwise the model is
    put in eval mode and gradients are disabled.

    Args:
        netmod: model called as ``netmod(xbatch)`` to produce logits.
        datload: iterable yielding ``(xbatch, ybatch)`` tensor pairs.
        dotrain: whether to update the model weights.

    Returns:
        ``(mean_batch_loss, auc)`` as Python floats; ``auc`` is NaN when
        ``roc_auc_score`` raises ``ValueError``.
    """
    if dotrain:
        netmod.train()
    else:
        netmod.eval()

    batch_losses = []
    prob_chunks = []
    target_chunks = []

    for xbatch, ybatch in datload:
        xbatch = xbatch.to(dev)
        ybatch = ybatch.to(dev)

        with torch.set_grad_enabled(dotrain):
            rlogs = netmod(xbatch)
            lossval = losses(rlogs, ybatch)
            if dotrain:
                opti.zero_grad()
                lossval.backward()
                opti.step()

        batch_losses.append(lossval.item())
        # Probabilities come from the logits computed before this batch's weight update.
        batch_probs = torch.sigmoid(rlogs).detach().cpu().numpy()
        prob_chunks.append(batch_probs)
        target_chunks.append(ybatch.detach().cpu().numpy())

    probarr = np.concatenate(prob_chunks)
    targarr = np.concatenate(target_chunks)

    try:
        aucval = roc_auc_score(targarr, probarr)
    except ValueError:
        aucval = float('nan')

    mean_loss = float(np.mean(batch_losses))
    return mean_loss, float(aucval)

# Train for two epochs, keeping a CPU copy of the weights from the epoch with
# the highest validation AUC, then restore those weights into the model.
bestval = 0.0
bestwt = None
for epoch in range(1, 3):
    trloss, trauc = runitonce(model, dattrain, True)
    vloss, vauc = runitonce(model, datval, False)

    improved = vauc > bestval
    if improved:
        bestval = vauc
        bestwt = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }

    print(f"epoch {epoch:02d} | trainloss {trloss:.4f} auc {trauc:.4f} | valloss {vloss:.4f} auc {vauc:.4f}")

if bestwt is not None:
    model.load_state_dict(bestwt)
    print("best val AUC:", round(bestval, 4))

In [None]:
import torch
# Report whether PyTorch can see a CUDA device.
has_cuda = torch.cuda.is_available()
print("cuda:", has_cuda)

In [None]:
import os, glob, cv2, numpy as np, pandas as pd, torch
from torch.utils.data import Dataset, DataLoader

class TestSet(Dataset):
    """Dataset over the unlabelled tiles found in ``{root}/test/*.tif``.

    Tile ids are the file names without extension, kept in sorted order.
    Each item is ``(id, image)`` where the image is the float32 CHW tensor
    produced by ``imgtrans`` with augmentation disabled.
    """

    def __init__(self, root):
        """Collect the sorted ids of every ``.tif`` file under ``{root}/test``."""
        self.root = root
        # splitext strips the extension without relying on it being exactly 4 characters.
        self.ids = sorted(
            os.path.splitext(os.path.basename(p))[0]
            for p in glob.glob(f'{root}/test/*.tif')
        )

    def __len__(self):
        """Return the number of test tiles found."""
        return len(self.ids)

    def __getitem__(self, i):
        """Load tile ``i`` and return ``(id, image_tensor)``.

        Raises:
            FileNotFoundError: if OpenCV cannot read the tile file.
        """
        imgid = self.ids[i]
        imgpath = f'{self.root}/test/{imgid}.tif'
        imgbgr = cv2.imread(imgpath)
        if imgbgr is None:
            # cv2.imread returns None rather than raising on a missing/corrupt file;
            # fail here with the path instead of an opaque cvtColor error.
            raise FileNotFoundError(f'could not read image: {imgpath}')
        imgrgb = cv2.cvtColor(imgbgr, cv2.COLOR_BGR2RGB)
        xchw = imgtrans(imgrgb, istrain=False)
        return imgid, torch.from_numpy(xchw)

# Score every test tile in batches of 128 and write the submission file.
testset = TestSet(ROOT)
testloader = DataLoader(testset, batch_size=128, shuffle=False, num_workers=0, pin_memory=False)

ids_all = []
probs_all = []
model.eval()
with torch.no_grad():
    for batch_ids, batch_x in testloader:
        batch_x = batch_x.to(dev)
        batch_logits = model(batch_x)
        # Sigmoid turns the logits into probabilities for the positive class.
        batch_probs = torch.sigmoid(batch_logits).cpu().numpy().tolist()
        ids_all.extend(batch_ids)
        probs_all.extend(batch_probs)

# One row per test tile: its id and the predicted probability.
subdat = pd.DataFrame({'id': ids_all, 'label': probs_all})
subdat.to_csv('submission.csv', index=False)
print(subdat.shape)
subdat.head()

In [None]:
import torch
# Report whether PyTorch can see a CUDA device.
has_cuda = torch.cuda.is_available()
print("cuda:", has_cuda)