In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import datetime
import gc
import time

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pydicom
import torch
import torch.nn.functional as F
import torchvision
from skimage.color import rgb2gray
from sklearn.metrics import accuracy_score, roc_auc_score
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import matplotlib.pyplot as plt

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Folders holding the DICOM images of the train and test splits.
train_dcom_folder='../input/siim-isic-melanoma-classification/train'
test_dcom_folder='../input/siim-isic-melanoma-classification/test'
# Sanity check: decode one training image and display its pixel data.
img_path=os.path.join(train_dcom_folder,'ISIC_0084395.dcm')
dcm_file=pydicom.dcmread(img_path)
plt.imshow(dcm_file.pixel_array)
plt.show()

In [None]:
from torch.autograd import Variable
import torchvision.transforms.functional as transF

# Second sample image: decode it, convert to grayscale and wrap it as a tensor.
path2='../input/siim-isic-melanoma-classification/train/ISIC_0015719.dcm'
img=pydicom.dcmread(path2).pixel_array
img=rgb2gray(img)
img=transF.to_tensor(img)

# Drop the leading axis so imshow receives a 2-D image
# (presumably to_tensor produced 1 x H x W here -- TODO confirm).
img=img.squeeze(0)
plt.imshow(img)
plt.show()
# Put the leading axis back before the tensor is resized further down.
img=img.unsqueeze(0)

# x x y -> normal size
def resize2d(img, size):
    """Resize an image tensor to ``size`` by adaptive average pooling.

    Args:
        img: tensor whose last two axes are the spatial ones (C x H x W or
            N x C x H x W).
        size: target ``(height, width)``.

    Returns:
        A tensor with the same leading axes and the requested spatial size,
        detached from the autograd graph.
    """
    # Variable(...) / .data are deprecated; .detach() is the supported way to
    # return a result that does not track gradients.
    return F.adaptive_avg_pool2d(img, size).detach()

# Shrink the preview image to the 224 x 224 size used further down the notebook.
img=resize2d(img,(224,224))
# img is 1 x 224 x 224 at this point
img=img.squeeze(0)
# img is 224 x 224 now: squeeze(0) removed the leading axis of length 1,
# leaving the 2-D array that imshow expects
plt.imshow(img)
plt.show()


In [None]:
# Tabular metadata for the test images; preview the first rows.
testxl=pd.read_csv('../input/siim-isic-melanoma-classification/test.csv')
testxl.head()

In [None]:
# Tabular metadata for the training images; preview the first rows.
trainxl=pd.read_csv('../input/siim-isic-melanoma-classification/train.csv')
trainxl.head()

In [None]:
# Remove the 'benign_malignant' column and show the column list before/after.
print(trainxl.columns)
# errors='ignore' keeps the cell re-runnable: a second execution no longer
# raises KeyError because the column is already gone.
trainxl=trainxl.drop(columns=['benign_malignant'],errors='ignore')
print(trainxl.columns)
trainxl.head()

In [None]:
# Encode 'sex' as 0/1 in both the train and the test frame.
train_testxl=[trainxl,testxl]
sex_dict={'male':0,'female':1}
for dataset in train_testxl:
    # Only encode while the column still holds the raw labels: mapping an
    # already encoded column again would turn every value into NaN.
    if not pd.api.types.is_numeric_dtype(dataset['sex']):
        dataset['sex']=dataset['sex'].map(sex_dict)
trainxl.head()

In [None]:
# Check that the test frame was encoded as well.
testxl.head()

In [None]:
# Model input: the image plus the tabular values shown below.
# Model output: probability of disease.
# A bare expression uses the notebook's rich (truncated) table display instead
# of the plain-text dump produced by print().
trainxl

In [None]:
# Distinct diagnosis labels in a reproducible order. Iterating a bare set of
# strings can change order between kernel sessions, which would silently change
# the label -> number encoding built from `keys` in the next cell.
keys=sorted(set(trainxl['diagnosis'].values),key=str)
print(keys)
print(len(keys))

In [None]:
# Size both encodings from the number of labels actually present: with a
# hard-coded 9, zip() would silently drop labels if the count ever differed.
n_diag=len(keys)

# One-hot encoding: label -> row of the identity matrix.
values=np.eye(n_diag)
#print(values)
diag_dict=dict(zip(keys,values))

#print(diag_dict)

# Scalar encoding: labels spread evenly over [0, 1]
# (max(..., 1) avoids dividing by zero when there is a single label).
values2=np.arange(n_diag)/max(n_diag-1,1)
print(values2)
diag_dict2=dict(zip(keys,values2))
print(diag_dict2)
#should diag_dict be one hot encoding or can this just be numbers?

In [None]:
# Replace the diagnosis label by its scalar code. Guarded so that re-running
# the cell does not map the already numeric column to NaN.
if not pd.api.types.is_numeric_dtype(trainxl['diagnosis']):
    trainxl['diagnosis']=trainxl['diagnosis'].map(diag_dict2)
trainxl.head()

In [None]:
# Scale age into [0, 1] using ONE reference value (the training maximum) for
# both frames. Dividing each frame by its own maximum would give the same age
# a different feature value in train and test whenever the maxima differ.
age_max=trainxl['age_approx'].max()
for dataset in train_testxl:
    dataset['age_approx']=dataset['age_approx']/age_max
trainxl.head()

In [None]:
# Distinct anatomical sites in a reproducible order. Sorting by str() gives a
# stable order and keeps whatever non-string entries the set holds (e.g. a
# missing-value marker) as keys, exactly like the plain set did.
keys=sorted(set(trainxl['anatom_site_general_challenge'].values),key=str)
print(keys)
print(len(keys))

In [None]:
# One-hot alternative, kept for reference:
# values=np.eye(len(keys))
# anat_dict=dict(zip(keys,values))

# Scalar encoding: sites spread evenly over [0, 1]. Sized from len(keys) so
# that zip() cannot silently drop a site if the number of categories changes.
values=np.arange(len(keys))/max(len(keys)-1,1)
anat_dict=dict(zip(keys,values))
print(anat_dict)

In [None]:
# Replace the anatomical site by its scalar code in both frames. Guarded so
# that re-running the cell does not map the already numeric column to NaN.
for dataset in train_testxl:
    if not pd.api.types.is_numeric_dtype(dataset['anatom_site_general_challenge']):
        dataset['anatom_site_general_challenge']=dataset['anatom_site_general_challenge'].map(anat_dict)
trainxl.head()

In [None]:
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.autograd import Variable

In [None]:
# Tabular columns fed to the model next to the image.
# NOTE(review): 'diagnosis' is only encoded for trainxl in the cells above --
# confirm that the column exists (and is encoded) in testxl before inference.
meta_features=['sex','age_approx','anatom_site_general_challenge','diagnosis']
# Example: metadata vector of the first training row.
meta2=np.array(trainxl.iloc[0][meta_features].values)
print(meta2)

In [None]:
imfolder='../input/siim-isic-melanoma-classification/train'


class MelanomaDataset(Dataset):
    """Pairs every DICOM image with its tabular metadata (and, for training, its target).

    Args:
        df: one row per image; needs an 'image_name' column, the columns listed
            in ``meta_features`` and, when ``train`` is True, a 'target' column.
        imfolder: directory that contains ``<image_name>.dcm``.
        train: when True ``__getitem__`` also returns the target.
        meta_features: names of the metadata columns returned with the image;
            must be provided before items are requested.
    """

    def __init__(self, df:pd.DataFrame, imfolder:str, train:bool=True, meta_features=None):
        self.df=df
        self.imfolder=imfolder
        self.meta_features=meta_features
        self.train=train

    def __getitem__(self,idx):
        """Return ``((image, meta), target)`` when ``train`` is True, else ``(image, meta)``."""
        # Use the folder and columns given to THIS instance (not the notebook
        # globals of the same name) and positional indexing throughout, so the
        # image, the metadata and the target always come from the same row.
        impath=os.path.join(self.imfolder,self.df['image_name'].iloc[idx]+'.dcm')

        # Decode the image and average-pool it down to 224 x 224.
        x=pydicom.dcmread(impath).pixel_array
        x=transF.to_tensor(x)
        x=F.adaptive_avg_pool2d(x,(224,224)).detach()
        # Leading axis of size 1 so a single item can be fed straight to the
        # network. NOTE(review): a DataLoader adds its own batch axis on top.
        x=x.unsqueeze(0)

        # Metadata of the idx'th image as a float32 vector; the object-dtype
        # array produced by .values alone cannot be collated into a batch.
        meta=np.asarray(self.df.iloc[idx][self.meta_features].values,dtype=np.float32)

        if self.train:
            # Training/validation data: the target accompanies the sample.
            y=torch.tensor(self.df['target'].iloc[idx]).reshape(1,1)

            return (x,meta),y
        else:
            return (x,meta)

    def __len__(self):
        """Number of rows (images) in the frame."""
        return len(self.df)
    


In [None]:
#input image shape is (4000, 6000, 3)
#meta features has 4 numbers

from efficientnet_pytorch import EfficientNet
# Activations captured by the hooks below, keyed by the name given to get_activation.
activation={}
def get_activation(name):
    """Build a forward hook that stores a module's detached output in ``activation[name]``.

    Args:
        name: key under which the output is stored.

    Returns:
        The hook callable, suitable for ``module.register_forward_hook``.
    """
    def hook(model, input, output):
        activation[name]=output.detach()
    # The return belongs to get_activation. Nested inside `hook` it made the
    # factory return None and made the hook return itself on every call.
    return hook
class EffNet(nn.Module):
    """EfficientNet-B7 image encoder combined with a small MLP over the tabular features.

    The pooled image embedding (2560 values) and the 250-value metadata
    embedding are concatenated and mapped to a single logit.
    """
    def __init__(self):
        super().__init__()
        self.features=EfficientNet.from_pretrained('efficientnet-b7')

        # csv: two Linear -> BatchNorm -> ReLU -> Dropout stages over the 4 metadata values
        self.csv=nn.Sequential(nn.Linear(4,250),
                              nn.BatchNorm1d(250),
                              nn.ReLU(),
                              nn.Dropout(p=0.2),

                              nn.Linear(250, 250),
                              nn.BatchNorm1d(250),
                              nn.ReLU(),
                              nn.Dropout(p=0.2))

        self.classify=nn.Sequential(nn.Linear(2560+250,1))

    def forward(self,x,csv_data):
        """Return one logit per sample.

        Args:
            x: image batch, N x C x H x W.
            csv_data: metadata as a tensor, numpy array or sequence; either a
                single vector of 4 values or a batch of shape N x 4.

        Returns:
            Tensor of shape N x 1 (raw logits, no sigmoid applied).
        """
        # x is image
        x=self.features.extract_features(x)
        x=F.avg_pool2d(x,x.size()[2:]).reshape(-1,2560) # dimensions here depend on what efficientnet i'm using

        # FNN
        # Accept anything array-like, and tensors without a numpy round trip
        # (which fails for tensors that live on an accelerator).
        if not torch.is_tensor(csv_data):
            csv_data=torch.as_tensor(np.asarray(csv_data,dtype=np.float32))
        csv_data=csv_data.to(device=x.device,dtype=torch.float32)
        # A single metadata vector gets a batch axis; real batches pass through.
        if csv_data.dim()==1:
            csv_data=csv_data.unsqueeze(0)
        csv_data=self.csv(csv_data)

        out=self.classify(torch.cat((x,csv_data),dim=1))

        return out
        
        
# Build the network (EffNet.__init__ loads the pretrained EfficientNet-B7
# backbone) and print its layer structure.
net=EffNet()#initialize
print(net)

In [None]:
# Pull one training sample to smoke-test the model in the next cells.
dataset=MelanomaDataset(trainxl,imfolder,True,meta_features)
print(meta_features)
#loader=torch.utils.data.DataLoader(dataset,batch_size=3,shuffle=True)


# dataset[0] is the item the former `for ... break` loop stopped at.
(image, csv_data), labels = dataset[0]
img_ex, csv_ex=image, csv_data
# The target is already a tensor; convert it with clone/detach/to rather than
# re-wrapping it in torch.tensor(), which PyTorch warns against.
labels_ex=labels.clone().detach().to(torch.float32)
print('data shape:',img_ex.shape)
print('csv: ',csv_ex)

print('label: ',labels_ex)

In [None]:
# Switch to evaluation mode before the single-sample forward pass below
# (in eval mode dropout is inactive and batch norm uses its running statistics).
net.eval()

In [None]:
# Smoke test: one forward pass and one loss evaluation on the sample above.
out=net(img_ex,csv_ex)#forward
# BCEWithLogitsLoss takes raw logits, so no sigmoid is applied to `out` here.
crit=nn.BCEWithLogitsLoss()
print('out shape: ',out.shape)
print('labels shape: ',labels_ex.shape)
print('labels: ',labels_ex)
# The string literal below is disabled code (an explicit reshape of the label),
# kept for reference only; it has no effect at runtime.
'''
labels_ex=labels_ex.reshape(1,1)
print('unsqueezed shape: ',labels_ex.shape)
'''
loss=crit(out,labels_ex)
print('loss: ',loss.item())

In [None]:
############
# training #
############

# Row counts of the two metadata frames (train_len sizes the fold split below).
train_len=len(trainxl)
test_len=len(testxl)
print(train_len,test_len)

In [None]:
from sklearn.model_selection import GroupKFold
k=6 # no. of folds
# Grouping by patient_id keeps all images of one patient in the same fold.
group_fold=GroupKFold(n_splits=k)
# split() returns a one-shot generator. Materialise it so the folds can be
# iterated more than once (e.g. when training is re-run) instead of silently
# yielding nothing the second time.
folds=list(group_fold.split(X=np.zeros(train_len),y=trainxl['target'],groups=trainxl['patient_id'].tolist()))

In [None]:
# parameters
# Hyper-parameters read as globals by train_folds.
epochs = 15                # maximum number of epochs per fold
patience = 3               # epochs without ROC improvement before a fold stops early
TTA = 3                    # number of prediction passes over the test set per fold
num_workers = 8            # worker processes of every DataLoader
learning_rate = 0.0005     # initial Adam learning rate
weight_decay = 0.0         # Adam weight decay
lr_patience = 1            # epochs without ROC improvement before the lr is reduced
lr_factor = 0.4            # factor the lr is multiplied by when it is reduced

batch_size1 = 32           # batch size of the training loader
batch_size2 = 16           # batch size of the validation and test loaders

# Run identifier; presumably passed to train_folds(version=...) to name the log file.
version = 'v6'

In [None]:
def train_folds(preds_submission, model, version = 'v1'):
    """Train ``model`` fold by fold, then predict validation and test data per fold.

    For every ``(train_index, valid_index)`` pair in the notebook-level
    ``folds`` the function trains for up to ``epochs`` epochs with early
    stopping on validation ROC AUC, checkpoints every improvement, reloads the
    best checkpoint into a fresh ``EffNet``, stores the out-of-fold predictions
    in ``oof`` and adds the fold's test predictions to ``preds_submission``.

    Args:
        preds_submission: tensor updated in place, one row per test image
            (assumed shape ``(len(testxl), 1)`` on ``device`` -- TODO confirm
            against the caller). After the call it holds the SUM over folds of
            each fold's TTA-averaged probabilities; divide by the number of
            folds to get the ensemble mean.
        model: network trained during the first fold. NOTE(review): it is not
            re-initialised between folds -- later folds continue from the best
            checkpoint of the previous fold, as in the original code.
        version: suffix of the log file ``logs_<version>.txt``.

    Besides its arguments the function reads these notebook globals: ``folds``,
    ``trainxl``, ``testxl``, ``imfolder``, ``test_dcom_folder``,
    ``meta_features``, the hyper-parameter constants, ``MelanomaDataset`` and
    ``EffNet``. NOTE(review): ``device`` and ``oof`` are also read but are not
    defined in the visible part of the notebook.
    """
    log_path = f"logs_{version}.txt"
    # Start from an empty log file; the context manager closes the handle
    # (a bare open() left it open for the whole run).
    with open(log_path, "w+"):
        pass

    def log(message):
        """Append ``message`` to the log file and echo it to the console."""
        with open(log_path, 'a+') as log_file:
            print(message, file=log_file)
        print(message)

    def to_device(images, csv_data):
        """Move one batch to ``device`` as float32 tensors."""
        images = torch.as_tensor(images, dtype=torch.float32, device=device)
        # Dataset items already carry a leading axis of size 1, so collated
        # batches are 5-D; fold that axis into the batch axis (no-op for 4-D).
        images = images.reshape(-1, *images.shape[-3:])
        csv_data = torch.as_tensor(csv_data, dtype=torch.float32, device=device)
        return images, csv_data

    def batch_probabilities(net, loader, labelled):
        """Yield ``(row_offset, probabilities)`` for every batch of ``loader``.

        The offset is a running count of rows already seen, so a shorter final
        batch still lands at the right position.
        """
        offset = 0
        for batch in loader:
            inputs = batch[0] if labelled else batch
            images, csv_data = to_device(inputs[0], inputs[1])
            probs = torch.sigmoid(net(images, csv_data))
            yield offset, probs
            offset += probs.shape[0]

    for fold, (train_index, valid_index) in enumerate(folds):
        log('{} Fold: {} {}'.format('-'*10, fold+1, '-'*10))

        # --- Create Instances ---
        # Best ROC score in this fold and the checkpoint that achieved it
        best_roc = None
        best_model_path = None
        # Reset patience before every fold
        patience_f = patience

        optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate, weight_decay=weight_decay)
        scheduler = ReduceLROnPlateau(optimizer=optimizer, mode='max',
                                      patience=lr_patience, verbose=True, factor=lr_factor)
        criterion = nn.BCEWithLogitsLoss()

        # --- Read in Data ---
        train_data = trainxl.iloc[train_index].reset_index(drop=True)
        valid_data = trainxl.iloc[valid_index].reset_index(drop=True)

        # Create Data instances
        train = MelanomaDataset(train_data, imfolder, True, meta_features)
        # Validation items must carry their target as well: the loops below
        # unpack ((images, csv_data), labels).
        valid = MelanomaDataset(valid_data, imfolder, True, meta_features)
        # Test items have no target and live in the test image folder.
        # NOTE(review): every name in meta_features must exist in testxl.
        test = MelanomaDataset(testxl, test_dcom_folder, False, meta_features)

        # Dataloaders
        train_loader = DataLoader(train, batch_size=batch_size1, shuffle=True, num_workers=num_workers)
        # Validation and test order must stay fixed so predictions line up with the rows
        valid_loader = DataLoader(valid, batch_size=batch_size2, shuffle=False, num_workers=num_workers)
        test_loader = DataLoader(test, batch_size=batch_size2, shuffle=False, num_workers=num_workers)

        # === EPOCHS ===
        for epoch in range(epochs):
            start_time = time.time()
            correct = 0
            train_losses = 0

            # === TRAIN ===
            model.train()

            for (images, csv_data), labels in train_loader:
                images, csv_data = to_device(images, csv_data)
                # One column per sample so the target matches the N x 1 logits
                labels = torch.as_tensor(labels, dtype=torch.float32, device=device).reshape(-1, 1)

                # Clear gradients before the forward pass
                optimizer.zero_grad()

                # Forward pass & backpropagation
                out = model(images, csv_data)
                loss = criterion(out, labels)
                loss.backward()
                optimizer.step()

                # --- Save information after this batch ---
                train_losses += loss.item()
                # From logits to hard 0/1 predictions
                train_preds = torch.round(torch.sigmoid(out))
                # Number of correct predictions
                correct += (train_preds.cpu() == labels.cpu()).sum().item()

            # Compute Train Accuracy
            train_acc = correct / len(train_index)

            # === EVAL ===
            model.eval()

            # Buffer for the validation probabilities, filled batch by batch
            valid_preds = torch.zeros(size = (len(valid_index), 1), device=device, dtype=torch.float32)

            with torch.no_grad():
                for start, probs in batch_probabilities(model, valid_loader, True):
                    valid_preds[start : start + probs.shape[0]] = probs

            valid_probs = valid_preds.cpu()
            # Compute accuracy
            valid_acc = accuracy_score(valid_data['target'].values,
                                       torch.round(valid_probs).numpy().ravel())
            # Compute ROC
            valid_roc = roc_auc_score(valid_data['target'].values,
                                      valid_probs.numpy().ravel())

            # Compute time on Train + Eval
            duration = str(datetime.timedelta(seconds=time.time() - start_time))[:7]

            # PRINT INFO (log file and console)
            log('{} | Epoch: {}/{} | Loss: {:.4} | Train Acc: {:.3} | Valid Acc: {:.3} | ROC: {:.3}'.\
                format(duration, epoch+1, epochs, train_losses, train_acc, valid_acc, valid_roc))

            # === SAVE MODEL ===

            # Update scheduler (for learning_rate)
            scheduler.step(valid_roc)

            # `is None` rather than truthiness, so a ROC of exactly 0.0 counts as a score
            if best_roc is None or valid_roc > best_roc:
                best_roc = valid_roc
                # Reset patience (first score of the fold, or an improvement)
                patience_f = patience
                # Remember the checkpoint path instead of searching the
                # directory for a file name containing the rounded ROC later.
                best_model_path = f"Fold{fold+1}_Epoch{epoch+1}_ValidAcc_{valid_acc:.3f}_ROC_{valid_roc:.3f}.pth"
                torch.save(model.state_dict(), best_model_path)
            else:
                # Decrease patience (no improvement in ROC)
                patience_f = patience_f - 1
                if patience_f == 0:
                    log('Early stopping (no improvement since 3 models) | Best ROC: {}'.\
                        format(best_roc))
                    break

        # === INFERENCE ===
        # Reload the best checkpoint of this fold into a fresh network
        model = EffNet().to(device)
        model.load_state_dict(torch.load(best_model_path))
        model.eval()

        with torch.no_grad():
            # --- EVAL ---
            # Predict validation data again with the best weights for OOF
            valid_preds = torch.zeros(size = (len(valid_index), 1), device=device, dtype=torch.float32)
            for start, probs in batch_probabilities(model, valid_loader, True):
                valid_preds[start : start + probs.shape[0]] = probs

            # Save info to OOF
            oof[valid_index] = valid_preds.cpu().numpy()

            # --- TEST ---
            # Accumulate this fold's TTA passes separately, average them, then
            # add the average to the running total. Dividing preds_submission
            # itself by TTA here would also rescale every earlier fold again.
            # NOTE(review): MelanomaDataset applies no augmentation, so the
            # passes are expected to be identical.
            fold_test_preds = torch.zeros_like(preds_submission)
            for i in range(TTA):
                for start, probs in batch_probabilities(model, test_loader, False):
                    fold_test_preds[start : start + probs.shape[0]] += probs

            preds_submission += fold_test_preds / TTA

        # === CLEANING ===
        # Clear memory
        del train, valid, train_loader, valid_loader, images, labels
        # Garbage collector
        gc.collect()