# Environment Setup

In [None]:
!pip install pylibjpeg pylibjpeg-libjpeg pydicom dicomsdl

In [None]:
import cv2
import dicomsdl
import glob
import os
import pydicom
import torch
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.models import resnet50

# Pick the compute device once for the whole notebook: use CUDA when PyTorch
# reports it as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using {DEVICE} device")

# Data Review

In [None]:
class DataReviewer():
    """Print metadata and display the DICOM images stored for one patient.

    Args:
        train_csv: Path to the training CSV; it is loaded with
            ``pd.read_csv`` and kept on ``self.train_csv``.
        train_path: Directory that contains one sub-directory per patient id.
    """

    def __init__(self, train_csv, train_path):
        self.train_csv = pd.read_csv(train_csv)
        self.train_path = train_path

    def show_data_for_patient(self, patient_id):
        """Show every image found in the patient's directory.

        For each file in ``<train_path>/<patient_id>`` the CSV row whose
        ``image_id`` equals the file name (without extension) is printed and
        the image is rendered with matplotlib. Files that have no matching
        CSV row are reported and skipped instead of re-using the previous
        file's row (or raising because no row was ever found).

        Args:
            patient_id: Identifier used as the sub-directory name.
        """
        patient_dir = os.path.join(self.train_path, str(patient_id))
        num_images = len(glob.glob(f"{patient_dir}/*"))
        print(f"Number of images for patient: {num_images}\n")

        # Stringify the id column once so each file needs a single vectorised
        # comparison rather than a Python loop over every CSV row.
        image_ids = self.train_csv["image_id"].astype(str)

        for dcm in os.listdir(patient_dir):
            # splitext copes with any extension length, unlike slicing [:-4].
            image_id = os.path.splitext(dcm)[0]
            matches = self.train_csv[image_ids == image_id]
            if matches.empty:
                print(f"No metadata found for {dcm}; skipping.\n")
                continue
            # Keep the last match, which is what the original full scan
            # (no early break) ended up with.
            row = matches.iloc[-1]

            print("Patient ID :", row["patient_id"])
            print("Image   ID :", row["image_id"])
            print("Left/Right :", row["laterality"])
            print("Age        :", row["age"])
            print("Cancer     :", row["cancer"])
            # Further columns (e.g. site_id, machine_id, view, biopsy,
            # invasive, BIRADS, implant, density, difficult_negative_case)
            # can be printed the same way if they are present in the CSV.

            dcm_data = pydicom.dcmread(os.path.join(patient_dir, dcm))
            output_data = dcm_data.pixel_array
            if dcm_data.PhotometricInterpretation == "MONOCHROME1":
                # MONOCHROME1 stores inverted intensities; flip them so that
                # all images share the same polarity.
                output_data = np.amax(output_data) - output_data
            # The rescale attributes may be absent from a file; default to
            # the identity transform in that case.
            slope = getattr(dcm_data, "RescaleSlope", 1)
            intercept = getattr(dcm_data, "RescaleIntercept", 0)
            output_data = output_data * slope + intercept
            plt.imshow(output_data, cmap="bone")
            plt.show()
            print("\n\n")

In [None]:
# Point the reviewer at the competition's training metadata and image folders,
# then display everything stored for a single example patient.
train_csv = "../input/rsna-breast-cancer-detection/train.csv"
train_path = "../input/rsna-breast-cancer-detection/train_images"
data_reviewer = DataReviewer(train_csv=train_csv, train_path=train_path)
data_reviewer.show_data_for_patient(patient_id=10006)

# Data Loader

In [None]:
class RSNADataset(Dataset):
    """Dataset yielding ``(image, label)`` pairs read from DICOM files.

    Args:
        dataframe: Table with a ``"path"`` column (DICOM file path) and a
            ``"cancer"`` column (label).
        is_train: When true, random horizontal/vertical flips are applied
            after tensor conversion; otherwise only tensor conversion.
    """

    def __init__(self, dataframe, is_train=True):
        self.dataframe, self.is_train = dataframe, is_train
        if is_train:
            self.transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.RandomHorizontalFlip(p=0.5),
                transforms.RandomVerticalFlip(p=0.5)
            ])
        else:
            self.transform = transforms.Compose([
                transforms.ToTensor()
            ])

    def __len__(self):
        return len(self.dataframe)

    @staticmethod
    def _min_max_scale(pixels):
        """Scale ``pixels`` linearly to [0, 1] as float64.

        A constant image (max == min) maps to all zeros instead of dividing
        by zero, and the cast to float happens before the subtraction so
        integer pixel types cannot overflow.
        """
        pixels = np.asarray(pixels, dtype=np.float64)
        low = pixels.min()
        span = pixels.max() - low
        if span == 0:
            return np.zeros_like(pixels)
        return (pixels - low) / span

    def __getitem__(self, index):
        """Return the 3-channel image array and the label at position ``index``.

        ``index`` is positional (0 .. len-1), so the lookup uses ``.iloc`` and
        keeps working when the frame's index labels are not 0..n-1.
        """
        # image
        dcm_path = self.dataframe["path"].iloc[index]
        dcm_file = dicomsdl.open(str(dcm_path))
        image = self._min_max_scale(dcm_file.pixelData())
        if dcm_file.getPixelDataInfo()['PhotometricInterpretation'] == "MONOCHROME1":
            # Inverted polarity: flip so every image uses the same convention.
            image = 1 - image
        image = cv2.resize(image, (224, 224))
        image = (image * 255).astype(np.uint8)
        image = self.transform(image)
        # Repeat the single grey channel three times along the channel axis.
        out_image = np.concatenate([image, image, image], axis=0)

        # label
        label = self.dataframe["cancer"].iloc[index]

        return out_image, label

# Model

In [None]:
class RSNA_Model(nn.Module):
    """ResNet-50 followed by a linear layer that maps 1000 values to 1 output."""

    def __init__(self):
        super().__init__()
        # Backbone created with pretrained=True; its output is fed to the
        # 1000 -> 1 linear layer below.
        self.features = resnet50(pretrained=True)
        self.classification = nn.Linear(in_features=1000, out_features=1)

    def forward(self, x):
        """Run ``x`` through the backbone, then through the linear head."""
        return self.classification(self.features(x))
        

# Training

In [None]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over ``dataloader``.

    Each batch is moved to ``DEVICE``, the loss is computed against the
    labels reshaped to a float column, and one optimizer step is taken.
    The loss is printed every fifth batch.
    """
    total_samples = len(dataloader.dataset)
    model.train()
    for step, (inputs, targets) in enumerate(dataloader):
        inputs = inputs.to(DEVICE)
        targets = targets.to(DEVICE)

        # Labels arrive as a flat batch; the loss is given a float column.
        column_targets = targets.unsqueeze(1).float()
        loss = loss_fn(model(inputs), column_targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % 5 != 0:
            continue
        loss_value = loss.item()
        seen = step * len(inputs)
        print(f"loss: {loss_value:>7f}  [{seen:>5d}/{total_samples:>5d}]")
            
def test(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` and print accuracy and mean loss.

    The model emits one logit per sample, so a prediction counts as positive
    when the logit is above 0 (sigmoid above 0.5). Labels are reshaped to a
    float column, the same way ``train`` feeds them to the loss.

    Args:
        dataloader: Yields ``(X, Y)`` batches.
        model: Module producing an ``(N, 1)`` logit tensor.
        loss_fn: Loss taking ``(logits, float targets)``.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, Y in dataloader:
            X, Y = X.to(DEVICE), Y.to(DEVICE)
            target = Y.unsqueeze(1).float()
            pred = model(X)
            test_loss += loss_fn(pred, target).item()
            # argmax over a single-column output is always 0; threshold the
            # logit instead to obtain the binary decision.
            correct += ((pred > 0) == (target > 0.5)).type(torch.float).sum().item()
    # Guard the averages so an empty loader reports zeros instead of raising.
    test_loss /= max(num_batches, 1)
    correct /= max(size, 1)
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [None]:
def main():
    """Build the path/label table, then train and evaluate the model.

    The training CSV is turned into a two-column frame (``path``, ``cancer``),
    wrapped in a training dataset (with flips) and an evaluation dataset
    (without flips), and the model is trained for ``EPOCHS`` epochs with an
    evaluation pass after each one.
    """

    # Parameters
    BATCH_TRAIN = 64
    BATCH_TEST = 32
    EPOCHS = 3
    LR = 0.0005

    # Make the training csv file
    train_csv = "../input/rsna-breast-cancer-detection/train.csv"
    train_path = "../input/rsna-breast-cancer-detection/train_images"
    metadata = pd.read_csv(train_csv)

    # Build "<train_path>/<patient_id>/<image_id>.dcm" for every row with
    # column-wise string operations instead of a Python loop.
    target_csv = pd.DataFrame({
        "path": (
            train_path + "/"
            + metadata["patient_id"].astype(str) + "/"
            + metadata["image_id"].astype(str) + ".dcm"
        ),
        "cancer": metadata["cancer"].astype(int),
    })

    # Training
    training_data = RSNADataset(target_csv)
    testing_data = RSNADataset(target_csv, is_train=False)
    train_dataloader = DataLoader(training_data, batch_size=BATCH_TRAIN, shuffle=True)
    # Evaluate on the non-augmented dataset; shuffling is pointless here.
    # NOTE(review): both datasets wrap the same rows, so the reported metrics
    # are measured on training data. Consider a held-out split.
    test_dataloader = DataLoader(testing_data, batch_size=BATCH_TEST, shuffle=False)

    model = RSNA_Model()
    model = model.to(DEVICE)
    loss_fn = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)

    for epoch in range(EPOCHS):
        print(f"Epoch {epoch+1}\n-------------------------------")
        train(train_dataloader, model, loss_fn, optimizer)
        test(test_dataloader, model, loss_fn)
    print("Done!")

In [None]:
# Run the full pipeline defined in main(): build the table, train, evaluate.
main()

In [None]:
# Write a submission file with one row per unique prediction_id.
# NOTE(review): the "cancer" column is filled with uniform random numbers;
# the trained model is not used here.
test_csv = pd.read_csv('../input/rsna-breast-cancer-detection/test.csv')
submission = pd.DataFrame(
    data={
        'prediction_id': test_csv['prediction_id'],
        'cancer': np.random.rand(test_csv.shape[0]),
    }
)
submission = submission.drop_duplicates(subset='prediction_id')
submission.head()
submission.to_csv('submission.csv', index=False)