# Preprocesado de imágenes


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import os

In [None]:
from PIL import Image
def _load_flair_images(root, max_patients=10):
    """Load the FLAIR images of the first ``max_patients`` patient folders.

    Args:
        root: directory that contains one sub-folder per patient.
        max_patients: how many patient folders (in the order the OS lists
            them) are read.

    Returns:
        A ``(images, patient_ids)`` tuple of parallel lists: one RGB PIL
        image per file found in ``<root>/<patient>/FLAIR`` and the name of
        the patient folder it came from.

    Raises:
        FileNotFoundError: if ``root`` cannot be listed.
    """
    # Only the first os.walk() entry is needed; taking it with next() avoids
    # walking the entire dataset tree just to list the top-level folders.
    top_level = next(os.walk(root), None)
    if top_level is None:
        raise FileNotFoundError(f'Cannot list patient folders in {root!r}')

    loaded = []
    owners = []
    for patient in top_level[1][:max_patients]:
        flair_dir = f'{root}/{patient}/FLAIR'
        flair_level = next(os.walk(flair_dir), None)
        if flair_level is None:
            # This patient has no FLAIR folder: nothing to load.
            continue
        for file_name in flair_level[2]:
            # convert() returns an independent in-memory copy, so the file
            # handle can be released as soon as the conversion is done.
            with Image.open(f'{flair_dir}/{file_name}') as raw:
                loaded.append(raw.convert('RGB'))
            owners.append(patient)
    return loaded, owners


# Training images and the patient each one belongs to
path='../input/rsna-miccai-png/train'
images, pacientes = _load_flair_images(path)

# Test images and the patient each one belongs to
path='../input/rsna-miccai-png/test'
images_t, pacientes_t = _load_flair_images(path)
    
    

In [None]:
# Build a DataFrame pairing each patient id with one of that patient's images
# (one row per image, so a single patient spans many rows).

# TRAIN
images_df = pd.DataFrame({'Pacientes': pacientes, 'Imagenes': images})
images_df

# TEST


In [None]:

# Load the labels and give the id column the same name as in images_df
labels_df = pd.read_csv('../input/train-labelscsv/train_labels.csv')
labels_df = labels_df.rename(columns={'BraTS21ID': 'Pacientes'})
# Patient ids come from folder names (strings); make them numeric so they
# can be matched against the ids in the CSV.
images_df['Pacientes'] = pd.to_numeric(images_df['Pacientes'])

# Attach the labels with a left join (every image row is kept)
images_df = images_df.merge(labels_df, on='Pacientes', how='left')

# Row-wise train/test split.
# The split point is derived from the number of rows instead of a fixed row
# count, so the test set is not left empty when fewer images are loaded.
# Integer arithmetic keeps it exact: a 42000-row frame still splits at 29400,
# the value that used to be hard-coded here.
TRAIN_PERCENT = 70
split_at = len(images_df) * TRAIN_PERCENT // 100

train_df = images_df.iloc[:split_at]
test_df = images_df.iloc[split_at:]


In [None]:
# Renumber the test rows from 0 so they can be addressed by position.
# drop=True discards the old index instead of keeping it as an extra 'index'
# column (which would make later reset_index() calls emit a 'level_0' column).
test_df = test_df.reset_index(drop=True)

In [None]:
# Build train/test subsets of several sizes.
# The row counts quoted below assume a 29400-row train_df and a 12600-row
# test_df.

def _sample_rows(frame, frac, seed=200):
    """Return a reproducible random fraction of ``frame`` renumbered from 0.

    The old index is discarded (``drop=True``), so the result has exactly the
    same columns as ``frame``.
    """
    return frame.sample(frac=frac, random_state=seed).reset_index(drop=True)


# Train 882 Test 378
train_small_df = _sample_rows(train_df, 0.03)
test_small_df = _sample_rows(test_df, 0.03)

# Train 2940 Test 1260
train_medium_df = _sample_rows(train_df, 0.1)
test_medium_df = _sample_rows(test_df, 0.1)

# Train 8820 Test 3780
# NOTE(review): this comment used to say "Test 2781", which is inconsistent
# with the two test sizes above -- confirm against the real data.
train_big_df = _sample_rows(train_df, 0.3)
test_big_df = _sample_rows(test_df, 0.3)

In [None]:
import torch
#Creo clase para facilitar la entrada de los datos a dataloaders
from PIL import Image

class BrainData(torch.utils.data.Dataset):
    """Map-style dataset that serves (image, label) pairs from a DataFrame.

    The frame must provide an ``Imagenes`` column (the object handed to
    ``transforms``) and an ``MGMT_value`` column (the label).
    """

    def __init__(self, dataframe, transforms):
        """Store the source table and the per-image transform.

        Args:
            dataframe: table with ``Imagenes`` and ``MGMT_value`` columns.
            transforms: callable applied to every image, or ``None`` to
                return the stored image unchanged.
        """
        self.transforms = transforms
        self.dataframe = dataframe

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``.

        The label is a float tensor of shape ``(1,)`` holding ``0.`` when
        ``MGMT_value`` truncates to 0 and ``1.`` otherwise.

        Raises:
            ValueError: if ``MGMT_value`` is NaN for that row (``int`` cannot
                convert it).
        """
        # Positional access: samplers hand out positions 0..len-1, which only
        # coincide with index labels when the frame has a default index.
        imgs = self.dataframe['Imagenes'].iloc[idx]
        # Reading the scalar directly avoids calling int() on a (1, 1) array,
        # which NumPy has deprecated.
        category = int(self.dataframe['MGMT_value'].iloc[idx])
        category = torch.tensor([0.0 if category == 0 else 1.0])

        if self.transforms is not None:
            imgs = self.transforms(imgs)

        return imgs, category

    def __len__(self):
        """Return the number of rows (samples) in the frame."""
        return len(self.dataframe['MGMT_value'])

In [None]:
# Transforms: the essential one is the resize to the ViT default input size. More transforms could be appended later.

import torchvision.transforms as T
import random

def get_transform(train):
    """Build the preprocessing pipeline applied to every image.

    The pipeline turns a PIL image into a tensor, converts it to
    ``torch.float`` and resizes it to 224x224.

    Args:
        train: accepted for both splits but currently unused; the same
            pipeline is returned either way.

    Returns:
        A ``T.Compose`` chaining the three steps in that order.
    """
    steps = [
        T.PILToTensor(),
        T.ConvertImageDtype(torch.float),
        T.Resize((224, 224)),
    ]
    return T.Compose(steps)

In [None]:
# Wrap the two DataFrames as datasets: one for training, one for testing.
train_transforms = get_transform(train=True)
test_transforms = get_transform(train=False)

train_dataset = BrainData(dataframe=train_df, transforms=train_transforms)
test_dataset = BrainData(dataframe=test_df, transforms=test_transforms)

In [None]:
#Dataloaders

import torch.utils

# Data loaders: shuffled batches for training, fixed order for testing.
_common_loader_args = {"batch_size": 16, "num_workers": 0}

data_loader = torch.utils.data.DataLoader(
    train_dataset, shuffle=True, **_common_loader_args)

data_loader_test = torch.utils.data.DataLoader(
    test_dataset, shuffle=False, **_common_loader_args)

# Lookups keyed by phase name
dataloaders = dict(train=data_loader, test=data_loader_test)

dataset_sizes = dict(train=len(train_dataset), test=len(test_dataset))

In [None]:
#Train model function

import time
import copy
import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def _predict_classes(outputs):
    """Turn raw model outputs into predicted class indices (1-D tensor).

    With several outputs per sample the arg-max over dim 1 is used. With a
    single output per sample an arg-max would always be 0, so the value is
    thresholded at 0 instead.
    NOTE(review): thresholding at 0 assumes the single output is a logit
    (sigmoid(0) == 0.5) -- confirm against the model head.
    """
    outputs = outputs.detach()
    if outputs.dim() > 1 and outputs.size(1) > 1:
        return outputs.argmax(dim=1).reshape(-1)
    return (outputs.reshape(-1) > 0).to(torch.long)


def train_model(model, dataloaders, criterion, optimizer, scheduler, num_epochs=25):
    """Train ``model`` and return it loaded with its best-scoring weights.

    Every epoch runs a 'train' phase (weights are updated) followed by a
    'test' phase (evaluation only). The weights with the highest 'test'
    accuracy seen so far are kept and restored at the end.

    Args:
        model: module to train; it is moved to the module-level ``device``.
        dataloaders: mapping with 'train' and 'test' keys, each an iterable
            of ``(inputs, labels)`` batches.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimizer stepped once per training batch.
        scheduler: learning-rate scheduler stepped once per epoch.
        num_epochs: number of epochs to run.

    Returns:
        The same ``model`` object, with the best 'test' weights loaded.
    """
    model.to(device)
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print(f'Epoch {epoch}/{num_epochs - 1}')
        print('-' * 10)

        # Each epoch has a training and an evaluation phase
        for phase in ['train', 'test']:
            is_train = phase == 'train'
            if is_train:
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_corrects = 0
            # Counted here rather than read from a global size table, so the
            # accuracy always matches the loader that was actually passed in.
            samples_seen = 0

            # tqdm gives a progress bar / time estimate per phase
            for inputs, labels in tqdm.tqdm(dataloaders[phase]):
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward; gradients are tracked only while training
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)

                    if is_train:
                        loss.backward()
                        optimizer.step()

                preds = _predict_classes(outputs)
                running_corrects += (preds == labels.reshape(-1)).sum().item()
                samples_seen += inputs.size(0)

            if is_train:
                # Step the schedule after the optimizer updates of this
                # epoch; stepping before them skips the first learning rate.
                scheduler.step()

            # An empty loader yields 0 accuracy instead of dividing by zero
            epoch_acc = running_corrects / samples_seen if samples_seen else 0.0

            print(f'{phase} Acc: {epoch_acc:.4f}')

            # Save best model
            if phase == 'test' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    print(f'Best test Acc: {best_acc:.4f}')

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [None]:
# Model, loss, optimizer and learning-rate schedule
import torchvision
from torch import nn, optim
from torch.optim import lr_scheduler
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load a pretrained ViT-B/16 (a larger ViT variant could be tried as well)
model_transf = torchvision.models.vit_b_16(pretrained=True)

# Freeze the pretrained weights; only the new head created below is trained
for param in model_transf.parameters():
    param.requires_grad = False

# Binary task (positive / negative): replace the classification head with a
# single-output linear layer.
model_transf.heads = nn.Linear(model_transf.heads.head.in_features, 1)

# Loss function.
# The head emits one logit per image and the dataset yields float labels of
# shape (1,), which is the input BCEWithLogitsLoss expects. CrossEntropyLoss
# cannot train a one-output head: softmax over a single class is always 1,
# so that loss is constantly zero.
criterion = nn.BCEWithLogitsLoss()
# Only the new head has trainable parameters, so only those are optimized
optimizer = optim.SGD(model_transf.heads.parameters(), lr=0.001, momentum=0.9)
# NOTE: this rebinds the name `lr_scheduler` from the imported module to the
# scheduler instance; the training call passes it under this name.
lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

In [None]:

# Run the training loop on the selected device
print(device)
model_transf = train_model(
    model=model_transf,
    dataloaders=dataloaders,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=lr_scheduler,
    num_epochs=25,
)

In [None]:
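# With the model trained, prediction would look like this (pseudocode):

# input = image
# output = model_transf(input)
# _, pred = torch.max(output, 1)
# NOTE: the head configured above has a single output, so torch.max over
# dim 1 always returns index 0; thresholding the output (output > 0) gives
# a usable binary prediction.
# return pred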
