# Introduction

Here we use ResNeXt-101 with "weakly supervised pre-training" (WSL). See https://github.com/facebookresearch/WSL-Images for model information.

In [1]:
# Input: directories the rest of the notebook reads from (relative paths).

# Directory containing the competition CSV files (joined with file names via os.path.join).
dir_csv = '../input/rsna-intracranial-hemorrhage-detection'
# Directory of training PNG images; globbed for '*.png' and used as the train/valid dataset path.
dir_train_img = '../input/rsna-train-stage-1-images-png-224x/stage_1_train_png_224x'
# Directory of test PNG images; used as the test dataset path.
dir_test_img = '../input/rsna-test-stage-1-images-png-224x/stage_1_test_png_224x'


Defining some necessary parameters

In [2]:
# Parameters

# Output size of the replaced classifier head; the dataset reads six label columns per image.
n_classes = 6
# Number of passes of the training loop over the training data.
n_epochs = 1
# Batch size for all three DataLoaders; also used to compute row offsets during inference.
batch_size = 32

Installing some useful libraries

In [5]:
!git clone https://github.com/NVIDIA/apex && cd apex && pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
    

fatal: destination path 'apex' already exists and is not an empty directory.


In [6]:
# Libraries

from apex import amp
import os
import cv2
import glob
import pydicom
import numpy as np
import pandas as pd
import torch
import torch.optim as optim
from albumentations import Compose, ShiftScaleRotate, Resize
from albumentations.pytorch import ToTensor
from torch.utils.data import Dataset
import torchvision.models as models
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

ImportError: cannot import name 'amp' from 'apex' (unknown location)

Defining our dataset class

In [None]:

# Functions

class IntracranialDataset(Dataset):
    """Dataset of PNG images listed in a CSV file, optionally with labels.

    Each item is a dict with key 'image' and, when ``labels`` is truthy,
    an additional key 'labels' holding a tensor of the six label columns.
    """

    def __init__(self, csv_file, path, labels, transform=None):
        """Read the image listing and remember how items should be produced.

        Args:
            csv_file: Path of a CSV with an 'Image' column (and the label
                columns when ``labels`` is truthy).
            path: Directory containing '<Image>.png' files.
            labels: Truthy to return labels alongside each image.
            transform: Optional callable invoked as ``transform(image=img)``
                that returns a mapping with an 'image' entry.
        """
        self.path = path
        self.data = pd.read_csv(csv_file)
        self.transform = transform
        self.labels = labels

    def __len__(self):
        """Return the number of rows in the CSV listing."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load the image for row ``idx`` and, if requested, its labels.

        Raises:
            FileNotFoundError: If the image file is missing or cannot be decoded.
        """
        img_name = os.path.join(self.path, self.data.loc[idx, 'Image'] + '.png')
        img = cv2.imread(img_name)

        # cv2.imread signals failure by returning None rather than raising;
        # fail here with the offending path instead of deep inside the transform.
        if img is None:
            raise FileNotFoundError('Could not read image: {}'.format(img_name))

        if self.transform:
            img = self.transform(image=img)['image']

        if self.labels:
            label_columns = ['epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural', 'any']
            labels = torch.tensor(self.data.loc[idx, label_columns])
            return {'image': img, 'labels': labels}

        return {'image': img}
    
    


# CSV

In [None]:
# CSVs

# Read the training labels and the sample submission (used as the test listing) from dir_csv.
train, test = (
    pd.read_csv(os.path.join(dir_csv, csv_name))
    for csv_name in ('stage_1_train.csv', 'stage_1_sample_submission.csv')
)

In [None]:
# Split train out into row per image and save a sample

# Split the 'ID' string on '_' into three columns named ID, Image and Diagnosis.
train[['ID', 'Image', 'Diagnosis']] = train['ID'].str.split('_', expand=True)

# Build one row per image with one column per diagnosis. The steps are chained so that
# duplicates are dropped on a fresh frame instead of mutating a column subset in place.
train = (
    train[['Image', 'Diagnosis', 'Label']]
    .drop_duplicates()
    .pivot(index='Image', columns='Diagnosis', values='Label')
    .reset_index()
)

# Restore the 'ID_' prefix that the split removed from the image identifier.
train['Image'] = 'ID_' + train['Image']
train.head()

Removing class imbalance in dataset

In [None]:
# Check the number of healthy (any == 0) and ill (any == 1) rows before undersampling.

# Seed for the RandomState that later picks which healthy rows to keep.
undersample_seed=0
train["any"].value_counts()

In [None]:
# Store the number of ill rows (any == 1); the same number of healthy rows is sampled later.

num_ill_patients = train[train["any"]==1].shape[0]
num_ill_patients

In [None]:
# Randomly keep as many healthy rows (any == 0) as there are ill rows, without replacement.

healthy_patients = train.loc[train["any"] == 0].index.values
healthy_patients_selection = (
    np.random.RandomState(undersample_seed)
    .choice(healthy_patients, replace=False, size=num_ill_patients)
)
len(healthy_patients_selection)

In [None]:
# Fuse the selected healthy rows with all ill rows into one list of index labels.
# NOTE: the list order comes from set iteration and determines the row order of the
# frame built from it, which in turn feeds the seeded train/validation split.

sick_patients = train[train["any"]==1].index.values
selected_patients = list(set(healthy_patients_selection).union(set(sick_patients)))
# Displays half the selected count; the healthy and ill index sets are disjoint,
# so this should equal the number of ill rows.
len(selected_patients)/2

In [None]:
# Check the counts of healthy and ill rows in the undersampled dataset.

# .copy() makes new_train independent of train, so later edits do not touch the original.
new_train = train.loc[selected_patients].copy()
new_train["any"].value_counts()

In [None]:
# Some files didn't contain legitimate images, so keep only rows whose PNG exists on disk

# Image identifiers = file names in dir_train_img with the '.png' extension stripped.
png = np.array([
    os.path.splitext(os.path.basename(png_path))[0]
    for png_path in glob.glob(os.path.join(dir_train_img, '*.png'))
])

new_train = new_train[new_train['Image'].isin(png)]

In [None]:
# Hold out 20% of the undersampled rows for validation (fixed seed for repeatability)

training, validation = train_test_split(new_train, random_state=42, test_size=0.2)

# Persist both splits; the dataset class reads them back from these CSV files.
training.to_csv('train.csv', index=False)
validation.to_csv('valid.csv', index=False)

In [None]:
# Also prepare the test data

# Split the 'ID' string on '_' into three columns, then restore the 'ID_' prefix on the image id.
test[['ID', 'Image', 'Diagnosis']] = test['ID'].str.split('_', expand=True)
test['Image'] = 'ID_' + test['Image']

# Keep one row per image. drop_duplicates() returns a new frame, so the column
# subset is not mutated in place.
test = test[['Image', 'Label']].drop_duplicates()

test.to_csv('test.csv', index=False)

Building the datasets and wrapping them in the DataLoader provided by PyTorch

In [None]:
# Data loaders

# Training pipeline: random shift/scale/rotate augmentation, then tensor conversion.
transform_train = Compose([
    ShiftScaleRotate(),
    ToTensor()
])

# Evaluation pipeline (validation and test): tensor conversion only, no randomness.
transform_test= Compose([
    ToTensor()
])

train_dataset = IntracranialDataset(
    csv_file='train.csv', path=dir_train_img, transform=transform_train, labels=True)

# Validation must not be augmented: random transforms would make the validation loss
# noisy and not comparable between runs, so the evaluation pipeline is used here.
valid_dataset = IntracranialDataset(
    csv_file='valid.csv', path=dir_train_img, transform=transform_test, labels=True)

test_dataset = IntracranialDataset(
    csv_file='test.csv', path=dir_test_img, transform=transform_test, labels=False)


# Shuffle only the training data so each epoch sees the batches in a different order.
# Validation and test keep file order (the inference cell relies on test order).
data_loader_train = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
data_loader_valid = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, num_workers=4)
data_loader_test = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

In [None]:
# Plot train example

batch = next(iter(data_loader_train))
fig, axs = plt.subplots(1, 5, figsize=(15,5))

# Draw the first channel of each of the first five images in the batch, one per axis.
for i, ax in enumerate(axs):
    ax.imshow(batch['image'][i].numpy().transpose(1, 2, 0)[..., 0], cmap=plt.cm.bone)


In [None]:
# Plot validation example

batch = next(iter(data_loader_valid))
fig, axs = plt.subplots(1, 5, figsize=(15,5))

# Draw the first channel of each of the first five images in the batch, one per axis.
for i, ax in enumerate(axs):
    ax.imshow(batch['image'][i].numpy().transpose(1, 2, 0)[..., 0], cmap=plt.cm.bone)

In [None]:
# Plot test example

batch = next(iter(data_loader_test))
fig, axs = plt.subplots(1, 5, figsize=(15,5))

# Draw the first channel of each of the first five images in the batch, one per axis.
for i, ax in enumerate(axs):
    ax.imshow(batch['image'][i].numpy().transpose(1, 2, 0)[..., 0], cmap=plt.cm.bone)


Loading our ResNeXt-101 model

In [None]:
# Model

device = torch.device("cuda:0")

# Load the ResNeXt-101 32x16d WSL network from torch.hub and swap its final
# fully connected layer for one that emits n_classes logits.
model = torch.hub.load('facebookresearch/WSL-Images', 'resnext101_32x16d_wsl')
model.fc = torch.nn.Linear(in_features=2048, out_features=n_classes)
model = model.to(device)

# Loss operates on raw logits; the sigmoid is applied separately at inference time.
criterion = torch.nn.BCEWithLogitsLoss()

plist = [dict(params=model.parameters(), lr=2e-3)]
optimizer = optim.Adam(plist, lr=2e-3)

# Wrap model and optimizer for apex mixed-precision training (opt level O1).
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")


Training the model on the training dataset

In [None]:
# Train


for epoch in range(n_epochs):

    print('Epoch {}/{}'.format(epoch, n_epochs - 1))
    print('-' * 10)

    # ---- Training pass ----
    model.train()
    tr_loss = 0

    tk0 = tqdm(data_loader_train, desc="Iteration")

    for step, batch in enumerate(tk0):

        inputs = batch["image"]
        labels = batch["labels"]

        inputs = inputs.to(device, dtype=torch.float)
        labels = labels.to(device, dtype=torch.float)

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # apex scales the loss before backward for mixed-precision training.
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()

        tr_loss += loss.item()

        optimizer.step()
        optimizer.zero_grad()

    epoch_loss = tr_loss / len(data_loader_train)
    print('Training Loss: {:.4f}'.format(epoch_loss))

    # ---- Validation pass ----
    # Gradients are disabled with torch.no_grad() rather than by setting
    # requires_grad = False on the parameters: freezing the parameters here
    # would leave them frozen for the next epoch, where backward() would then
    # fail because nothing requires a gradient.
    model.eval()
    val_loss = 0

    tk1 = tqdm(data_loader_valid, desc="Iteration")

    with torch.no_grad():

        for step, batch in enumerate(tk1):

            inputs = batch["image"]
            labels = batch["labels"]

            inputs = inputs.to(device, dtype=torch.float)
            labels = labels.to(device, dtype=torch.float)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            val_loss += loss.item()

    epoch_loss = val_loss / len(data_loader_valid)
    print('Validation Loss: {:.4f}'.format(epoch_loss))

    
    
    

Evaluating model on test data set and getting the predictions

In [None]:
# Inference

# Freeze every parameter and switch the model to evaluation mode before predicting.
for param in model.parameters():
    param.requires_grad = False

model.eval()

# Flat column of predictions: n_classes consecutive rows per test image.
test_pred = np.zeros((len(test_dataset) * n_classes, 1))

for i, x_batch in enumerate(tqdm(data_loader_test)):

    x_batch = x_batch["image"].to(device, dtype=torch.float)

    # Rows of test_pred covered by batch i (the last batch may be shorter).
    start = i * batch_size * n_classes
    stop = (i + 1) * batch_size * n_classes

    with torch.no_grad():

        pred = model(x_batch)

        # Logits -> probabilities, flattened to one (image, class) pair per row.
        probs = torch.sigmoid(pred).detach().cpu()
        test_pred[start:stop] = probs.reshape((len(x_batch) * n_classes, 1))

Writing the results into a submission csv file

In [None]:
# Submission

# Take the IDs from the sample submission and place the predictions next to them
# (column-wise concat, aligned on the default integer index).
submission = pd.concat(
    [
        pd.read_csv(os.path.join(dir_csv, 'stage_1_sample_submission.csv')).drop(columns=['Label']),
        pd.DataFrame(test_pred),
    ],
    axis=1,
)
submission.columns = ['ID', 'Label']

submission.to_csv('submission.csv', index=False)
submission.head()

Have to clean up since Kaggle limits the number of files that can be output from a kernel

In [None]:
!rm -rf /kaggle/working/apex
!rm test.csv
!rm train.csv