<a href="https://www.kaggle.com/code/rimzakhama/rsna-pytorch-baseline-training-for-beginners?scriptVersionId=143879746" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
!pip install timm -q

from IPython.display import display_html
def restartkernel():
    """Send an empty raw-HTML payload through IPython's ``display_html``.

    NOTE(review): the payload is an empty string, so nothing is rendered as
    written. Presumably it once carried a snippet that restarts the kernel
    after the ``pip install`` above -- confirm before relying on it.
    """
    payload = ""
    display_html(payload, raw=True)
restartkernel()

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

import torch

import torch.nn as nn
from torchvision import transforms

import os

import pydicom as dicom
import cv2

import timm
import torch.optim as optim
from sklearn import model_selection
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedGroupKFold

from tqdm.autonotebook import tqdm

import gc
gc.collect()
torch.cuda.empty_cache()


In [None]:
# Explore every PNG stored in the folder of a single patient.
patient_id = '10006'

# Root folder of the PNG images (one sub-folder per patient).
png_path = '/kaggle/input/rsna-png-images-same-format-as-original/output/rsna_pngs/train_images'

# Sorted so the figure layout is the same on every run.
patient_dir = os.path.join(png_path, patient_id)
list_images = sorted(os.listdir(patient_dir))

# Two images per row; grow the number of rows with the number of files so
# that patients with more than four images do not overflow the grid.
n_cols = 2
n_rows = max(1, -(-len(list_images) // n_cols))  # ceiling division

fig = plt.figure(figsize=(10, 10))
for i, image_png in enumerate(list_images):
    image_png_path = os.path.join(patient_dir, image_png)
    image = mpimg.imread(image_png_path)

    # Same target size as the one used by the training Dataset.
    image = cv2.resize(image, (512, 512))

    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    ax.imshow(image)
    # Label each panel with its file name (was the literal text 'image_png').
    ax.set_title(image_png)
    

In [None]:

class Dataset:
    """Map-style dataset yielding ``(image, target)`` pairs for training.

    Each row of ``df`` must provide the columns ``patient_id``, ``image_id``
    and ``cancer``. The image is read from
    ``<IMAGE_ROOT>/<patient_id>/<image_id>.png`` and resized to 512x512.

    Args:
        df: DataFrame describing the samples. It is copied and re-indexed to
            0..n-1, so the caller's frame is never modified and any original
            index is accepted.
        transform: Optional callable applied to the resized image.
    """

    # Root folder of the PNG images (one sub-folder per patient).
    IMAGE_ROOT = '/kaggle/input/rsna-png-images-same-format-as-original/output/rsna_pngs/train_images'

    def __init__(self, df, transform=None):
        # A DataLoader asks for positions 0..len-1 while ``.loc`` looks up
        # index labels; resetting the index makes the two coincide.
        self.df = df.copy().reset_index(drop=True)
        self.transform = transform

    def __len__(self):
        """Return the number of samples (rows of the frame)."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, target)`` as tensors.

        The image is returned as a float tensor and the ``cancer`` value as a
        long tensor.
        """
        patient_id = self.df.loc[idx, 'patient_id']
        image_id = self.df.loc[idx, 'image_id']

        # ``str()`` works for NumPy scalars as well as plain Python values.
        image_png_path = os.path.join(
            self.IMAGE_ROOT, str(patient_id), str(image_id) + '.png'
        )

        image = mpimg.imread(image_png_path)
        image = cv2.resize(image, (512, 512))

        # Apply the optional transform to the resized image.
        if self.transform:
            image = self.transform(image)

        target = self.df.loc[idx, 'cancer']

        # ``as_tensor`` accepts both arrays and tensors; calling
        # ``torch.tensor`` on the tensor a transform may return triggers a
        # copy-construct warning.
        image = torch.as_tensor(image, dtype=torch.float)
        target = torch.tensor(target, dtype=torch.long)

        return image, target

In [None]:
class Config:
    """Constants shared by the training code in this notebook."""

    # Number of training epochs.
    EPOCHS = 2

    # Mini-batch sizes for the training and validation data loaders.
    TRAIN_BATCH_SIZE = 6
    VALID_BATCH_SIZE = 5

    # Output width of the model's final linear layer.
    NUM_CLASSES = 1

    # File name for saved model weights.
    MODEL_PATH = 'model.bin'
    

In [None]:
''' Model (we need to improve our model''' 
class BreastCancerModel(nn.Module):
    """Pretrained timm ``efficientnet_b4`` with a replaced linear classifier.

    The backbone is created for single-channel input (``in_chans=1``) and its
    classifier is swapped for ``nn.Linear(in_features, Config.NUM_CLASSES)``.

    Args:
        Config: Object exposing ``NUM_CLASSES``, the width of the new head.
    """

    def __init__(self, Config):
        super().__init__()
        backbone = timm.create_model(
            'efficientnet_b4', pretrained=True, in_chans=1
        )
        # Keep the feature width of the original head, replace its outputs.
        head_inputs = backbone.classifier.in_features
        backbone.classifier = nn.Linear(head_inputs, Config.NUM_CLASSES)
        self.efficientnet = backbone

    def forward(self, image):
        """Run ``image`` through the backbone and return its raw output."""
        return self.efficientnet(image)
    


In [None]:
'''Training function'''
def train(data_loader, model, optimizer, device, scheduler=None):
    """Train ``model`` for one pass over ``data_loader``.

    The loss is the module-level ``criterion`` (a ``BCEWithLogitsLoss`` in
    this notebook), so targets are fed as floats and the model is expected to
    produce one logit per sample, i.e. an output of shape ``(batch, 1)``.

    Args:
        data_loader: Iterable of batches whose first two items are the images
            and the targets.
        model: Network to optimise; switched to training mode.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device the batches are moved to.
        scheduler: Accepted for interface compatibility; it is not stepped
            here.
    """
    model.train()
    progress = tqdm(data_loader, total=len(data_loader))

    for data in progress:
        images = data[0].to(device, dtype=torch.float)
        # BCEWithLogitsLoss rejects integer targets, so cast them to float.
        targets = data[1].to(device, dtype=torch.float)

        optimizer.zero_grad()

        # Drop only the trailing logit dimension: a bare ``squeeze()`` would
        # also remove the batch dimension when the batch holds one sample.
        outputs = model(images).squeeze(-1)
        loss = criterion(outputs, targets)

        loss.backward()
        optimizer.step()
        

In [None]:
'''Evaluation function'''

def evaluation(data_loader, model, device):
    """Collect sigmoid probabilities and targets over ``data_loader``.

    The model is expected to produce one logit per sample, i.e. an output of
    shape ``(batch, 1)``.

    Args:
        data_loader: Iterable of batches whose first two items are the images
            and the targets.
        model: Network to evaluate; switched to evaluation mode.
        device: Device the batches are moved to.

    Returns:
        A ``(predictions, targets)`` pair of 1-D NumPy arrays with one entry
        per evaluated sample.
    """
    model.eval()

    tar = []
    predictions = []

    # No gradients are needed anywhere in the evaluation loop.
    with torch.no_grad():
        for data in data_loader:
            images = data[0].to(device, dtype=torch.float)
            targets = data[1].to(device, dtype=torch.long)

            # Drop only the trailing logit dimension so that a batch of one
            # sample still yields a 1-D array that can be concatenated.
            outputs = model(images).squeeze(-1)

            predictions.append(outputs.sigmoid().cpu().numpy())
            tar.append(targets.cpu().numpy())

    # Flatten the per-batch arrays into one array per quantity.
    predictions = np.concatenate(predictions)
    tar = np.concatenate(tar)

    return predictions, tar
        
    

In [None]:
# Loss
#
# Binary cross-entropy on raw logits with a heavier weight on the positive
# class. Other values tried for the weight: 34 and 20.
# NOTE(review): 46 presumably approximates the negative/positive ratio of the
# training labels -- confirm against the data.
#
# The weight lives on the GPU when one is available and on the CPU otherwise,
# so defining the loss no longer fails on a CPU-only machine.
_loss_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
pos_weight = torch.tensor([46.0], device=_loss_device)
criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)


In [None]:
''' Probabilistic F1 score'''
def pfbeta(labels, predictions, beta):
    """Compute the probabilistic F-beta score.

    Predictions are clipped to ``[0, 1]`` and accumulated as "soft" true
    positives (for truthy labels) or false positives (otherwise).

    Args:
        labels: Sequence of ground-truth labels, evaluated for truthiness.
        predictions: Sequence of predicted probabilities, aligned with
            ``labels``.
        beta: Weight of recall relative to precision.

    Returns:
        The F-beta score, or 0 when it is undefined (no positive label, no
        predicted mass at all) or when precision or recall is 0.
    """
    y_true_count = 0
    ctp = 0.0
    cfp = 0.0

    for label, raw_prediction in zip(labels, predictions):
        # Accumulate in Python floats so the sums do not inherit a lower
        # precision from NumPy scalars.
        prediction = float(min(max(raw_prediction, 0), 1))
        if label:
            y_true_count += 1
            ctp += prediction
        else:
            cfp += prediction

    # Without positives recall is undefined; without any predicted mass
    # precision is undefined. Both used to end in a division by zero.
    if y_true_count == 0 or ctp + cfp == 0:
        return 0

    beta_squared = beta * beta
    c_precision = ctp / (ctp + cfp)
    c_recall = ctp / y_true_count
    if c_precision > 0 and c_recall > 0:
        return (1 + beta_squared) * (c_precision * c_recall) / (beta_squared * c_precision + c_recall)
    return 0

In [None]:
'''Running'''
def run(df_train, df_valid):
    """Build the data loaders, model and optimizer, then train the model.

    Args:
        df_train: DataFrame with the training samples.
        df_valid: DataFrame with the validation samples.

    Both frames are wrapped in ``Dataset`` objects that only convert the
    images with ``transforms.ToTensor()``.
    """
    # Image transforms; extra augmentations would be added to this pipeline.
    transform = transforms.Compose([transforms.ToTensor()])

    # Settings common to both loaders.
    # NOTE(review): ``drop_last=True`` also discards the last incomplete
    # validation batch, so those samples are never scored.
    shared_loader_args = dict(num_workers=1, drop_last=True)

    # Training data: shuffled on every epoch.
    train_dataset = Dataset(df_train, transform=transform)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=Config.TRAIN_BATCH_SIZE,
        shuffle=True,
        **shared_loader_args,
    )

    # Validation data: kept in frame order.
    valid_data = Dataset(df_valid, transform=transform)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_data,
        batch_size=Config.VALID_BATCH_SIZE,
        shuffle=False,
        **shared_loader_args,
    )

    # Prefer the GPU and fall back to the CPU when none is available.
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    # Model with pretrained backbone, moved to the selected device.
    model = BreastCancerModel(Config=Config)
    model.to(device)

    # AdamW over all model parameters.
    optimizer = optim.AdamW(params=model.parameters(), lr=1e-4)

    # NOTE(review): the epoch count is hard-coded to 3 here although
    # ``Config.EPOCHS`` is 2 -- confirm which value is intended.
    dataloaders_dict = {"train": train_data_loader, "val": valid_data_loader}
    train_model(model, dataloaders_dict, criterion, optimizer, num_epochs=3)

    # An earlier variant of this function looped over ``Config.EPOCHS`` with
    # ``train`` / ``evaluation`` / ``pfbeta`` and saved the state dict with
    # the best score to ``Config.MODEL_PATH``; it is no longer executed.
    
        
        
        
        

In [None]:
def set_seed(seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and CUDA), and
    switches cuDNN to deterministic mode, so that repeated runs give the same
    results.

    Args:
        seed: Integer seed shared by all generators.
    """
    # Imported locally so that the function does not depend on the
    # notebook's import cell.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the cuDNN backend, two further options must be set.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Fixed hash seed. Python reads this variable at interpreter start-up, so
    # it affects processes launched afterwards rather than the running one.
    os.environ['PYTHONHASHSEED'] = str(seed)
    print('> SEEDING DONE')
    
set_seed(seed=42)

In [None]:
def train_model(model, dataloaders_dict, criterion, optimizer, num_epochs):
    """Train and validate ``model``, saving a traced copy of the best epoch.

    Every epoch runs a ``'train'`` phase followed by a ``'val'`` phase and
    prints the probabilistic F1 score, the mean loss and the accuracy (at a
    0.5 probability threshold) of each phase. Whenever the validation
    accuracy improves, the model is traced with TorchScript and written to
    ``model.pth``.

    The model is expected to produce one logit per sample, i.e. an output of
    shape ``(batch, 1)``, and ``criterion`` to accept float targets.

    Args:
        model: Network to train.
        dataloaders_dict: Mapping with the keys ``'train'`` and ``'val'``
            whose values iterate over ``(images, targets)`` batches.
        criterion: Loss called as ``criterion(logits, targets)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        num_epochs: Number of epochs to run.
    """
    # Use the GPU when there is one instead of failing on CPU-only machines.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    threshold = 0.5
    best_acc = 0.0

    for epoch in range(num_epochs):
        # Saving a checkpoint below moves the model to the CPU, so it has to
        # be moved back at the start of every epoch.
        model.to(device)
        val_acc = 0.0

        for phase in ['train', 'val']:
            is_train = phase == 'train'
            model.train(is_train)

            running_loss = 0.0
            n_correct = 0
            n_seen = 0
            predictions = []
            targets = []

            for item in tqdm(dataloaders_dict[phase], leave=False):
                images = item[0].to(device).float()
                classes = item[1].to(device).float()

                optimizer.zero_grad()

                with torch.set_grad_enabled(is_train):
                    # Drop only the trailing logit dimension: a bare
                    # ``squeeze()`` would also remove the batch dimension
                    # when the batch holds a single sample.
                    output = model(images).squeeze(-1)
                    loss = criterion(output, classes)

                    if is_train:
                        loss.backward()
                        optimizer.step()

                batch_size = classes.size(0)
                running_loss += loss.item() * batch_size
                n_seen += batch_size

                probs = output.detach().sigmoid()
                predicted_vals = (probs > threshold).float()
                n_correct += (predicted_vals == classes).sum().item()

                predictions.append(probs.cpu().numpy())
                targets.append(classes.cpu().numpy())

            if n_seen == 0:
                # Nothing to average or concatenate for an empty loader.
                print(f'Epoch {epoch + 1}/{num_epochs} | {phase:^5} | no batches')
                continue

            predictions = np.concatenate(predictions)
            targets = np.concatenate(targets)

            pf1 = pfbeta(targets, predictions, beta=1.0)
            print('f1_score = ', pf1)

            # Average over the samples actually processed; a loader created
            # with ``drop_last=True`` sees fewer samples than its dataset
            # holds.
            epoch_loss = running_loss / n_seen
            epoch_acc = n_correct / n_seen
            if not is_train:
                val_acc = epoch_acc

            print(f'Epoch {epoch + 1}/{num_epochs} | {phase:^5} | Loss: {epoch_loss:.4f} | Acc: {epoch_acc:.4f}')

        # Keep a traced copy of the model with the best validation accuracy.
        if val_acc > best_acc:
            traced = torch.jit.trace(model.cpu(), torch.rand(1, 1, 224, 224))
            traced.save('model.pth')
            best_acc = val_acc

In [None]:
if __name__ == '__main__':

    # Load the training metadata, drop every row that contains a missing
    # value and renumber the remaining rows from 0.
    train_csv = '/kaggle/input/rsna-breast-cancer-detection/train.csv'
    dfx = pd.read_csv(train_csv)
    dfx = dfx.dropna(axis=0).reset_index(drop=True)

    # Five folds, stratified on the ``cancer`` label and grouped by patient
    # so that a patient's images never end up on both sides of a split.
    kfold = StratifiedGroupKFold(n_splits=5)
    fold_splits = kfold.split(
        dfx, dfx['cancer'].values, dfx['patient_id'].values
    )

    for fold, (train_idx, valid_idx) in enumerate(fold_splits):
        print(f"{'='*40} Fold: {fold} / 5 {'='*40}")

        # Each fold gets frames that are freshly indexed from 0.
        df_train = dfx.loc[train_idx].reset_index(drop=True)
        df_valid = dfx.loc[valid_idx].reset_index(drop=True)
        run(df_train, df_valid)